mirror of
ssh://git.janware.com/srv/git/janware/proj/jw-python
synced 2026-01-15 01:52:56 +01:00
grammar.py and friends: Make list parsing run through
For the first time, parsing runs through without a syntax error. No usable AST is produced yet: strings are not returned from the lexer, and the AST lists aren't really lists.

TEXT:="Hello world!"; had to be excluded from the example, because I don't see how it could be parsed with the given syntax. There is a special sequence, "all visible characters", but any lexer regex I could think of will also match the types defining "alphabetic character" and return the respective tokens (e.g. T_A), or vice versa, depending on the rule order in the lexer input file. I suppose the only sensible way to handle this is to define "all visible characters" by defining tokens for the missing characters, and then to use them alongside T_A ... T_Z or their derived types.

Signed-off-by: Jan Lindemann <jan@janware.com>
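The rule-order ambiguity described above can be sketched outside of flex. Below is a minimal Python simulation of a lexer that, like flex, prefers the longest match and breaks ties by rule order; the rule names are illustrative (only T_A appears in the message above) and are not taken from the generated .l file.

    import re

    # Hypothetical rules in lexer-file order. POSIX [[:print:]] is
    # rewritten as the ASCII printable range [ -~] for Python's re.
    RULES = [
        ("T_A", re.compile(r"A")),
        ("ALL_CHARACTERS", re.compile(r"[ -~]")),
    ]

    def next_token(text, pos=0):
        """Longest match wins; on equal length, the earlier rule wins."""
        best_name, best_lexeme = None, ""
        for name, rx in RULES:
            m = rx.match(text, pos)
            if m and len(m.group()) > len(best_lexeme):
                best_name, best_lexeme = name, m.group()
        if best_name is None:
            raise SyntaxError("no rule matches at position %d" % pos)
        return best_name, best_lexeme

    print(next_token("A"))  # ('T_A', 'A') -- ALL_CHARACTERS never sees 'A'
    print(next_token("!"))  # ('ALL_CHARACTERS', '!')

Swapping the two rules makes ALL_CHARACTERS shadow T_A instead, which is why a catch-all "all visible characters" token cannot coexist with per-character tokens unless the missing characters get tokens of their own.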
This commit is contained in:
parent 6ca2eeef61
commit a3a8313ce8

6 changed files with 61 additions and 30 deletions
@@ -1,10 +1,13 @@
TOPDIR = ../..

-include local.mk

EXE_ARGS ?= grammartest.code
PREREQ_BUILD += ytools
FB_NAME = grammartest
NAMESPACE_IN_GENERATED = gt
GENERATE_CONFIG_FILE = generate.conf
IRRELEVANT_SYMBOLS ?= white_space

include $(TOPDIR)/make/proj.mk
include $(TOPDIR)/make/generate-flex-bison.mk
@@ -1,8 +1,18 @@
[symbols]

[white_space[
type = token
regex = "[ \n\t\r]+"
]

+[all_characters[
+type = non-terminal
+regex = "[[:print:]]"
+#lex_as = yytext[0]
+]

+[test[
+type = token
+dings = bums
+regex = "bumsdings"
+]
@@ -2,7 +2,7 @@
program = 'PROGRAM', white space, identifier, white space,
          'BEGIN', white space,
          { assignment, ";", white space },
-         'END.' ;
+         'END.', [ white space ];
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters }, '"' ;
@@ -6,5 +6,4 @@ BEGIN
C:=A;
D123:=B34A;
BABOON:=GIRAFFE;
-TEXT:="Hello world!";
END.
@@ -34,6 +34,9 @@ mode_keep = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

+c_token = "token"
+c_non_terminal = "non-terminal"

member_prefix = ''

special_terminals = {
@@ -638,23 +641,25 @@ def grammar_unroll_lists(grammar):
            if c.token == '}':
                if len(listrule) == 0:
                    raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
                name = ""
                delpos = []
                name = "list"
                for i, rule in enumerate(listrule):
                    if rule.token in delimiters:
                        delpos.append(i)
                        continue
                    if rule.tp != t_target_lang:
                        continue
                    name += tok2name(rule.token) + "_"
                    name += "_" + tok2name(rule.token)

                # not really: there are lists without delimiters, too
                #if len(delpos) != 1:
                #    p.dump(ERR)
                #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))

                name = name + "my_list"
                newrule.append(RuleComp(name, t_target_lang))
                listrule.insert(0, RuleComp('(', t_grammar))
                listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
                listrule.append(RuleComp(')', t_grammar))
                p = Symbol(name, rules=[[], listrule])
                #p = Symbol(name)
                #p.rules = [ [], listrule ]
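In effect, the hunk above rewrites an EBNF repetition into a left-recursive helper production: the fresh symbol is prepended to its own rule ("enable iteration") and paired with an empty alternative (rules=[[], listrule]). A simplified, self-contained sketch of that shape, with RuleComp reduced to plain strings and an illustrative generated name:

    # Simplified sketch: an EBNF repetition { assignment ";" white_space }
    # becomes   list : /* empty */ | list assignment ';' white_space
    # mirroring rules=[[], listrule] with the symbol prepended above.
    def unroll_list(name, items):
        listrule = [name] + list(items)   # prepend the symbol itself
        return [[], listrule]             # empty alternative | recursion

    name = "assignment_list"              # illustrative, not the real name
    for rule in unroll_list(name, ["assignment", "';'", "white_space"]):
        print(name, ":", " ".join(rule) or "/* empty */")
    # assignment_list : /* empty */
    # assignment_list : assignment_list assignment ';' white_space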
@@ -1065,7 +1070,9 @@ def grammar_create_ebnf(grammar, opts):
        out += ' ' * indent + ' ;\n'
    return out

-def tokens_from_config(conf):
+def symbols_from_config(conf, types = None):
+    if types == None or types == "all":
+        types = [ c_token, c_non_terminal ]
    r = set()
    if conf is None:
        return r
@@ -1073,7 +1080,7 @@ def tokens_from_config(conf):
    if symbols is None:
        return r
    for k, v in symbols.iteritems():
-        if v.get('regex') is not None:
+        if v["type"].value() in types:
            r.add(k)
    return r
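Read together, the two hunks above turn tokens_from_config into a type-filtered symbols_from_config. A condensed, runnable approximation; the real config wraps values (hence v["type"].value() above), while plain dicts are used here:

    # Condensed stand-in for symbols_from_config after this commit.
    def symbols_from_config(conf, types=None):
        if types is None or types == "all":
            types = ["token", "non-terminal"]
        r = set()
        if conf is None:
            return r
        symbols = conf.get("symbols")
        if symbols is None:
            return r
        for k, v in symbols.items():
            if v["type"] in types:        # real code: v["type"].value()
                r.add(k)
        return r

    conf = {"symbols": {"white_space": {"type": "token"},
                        "all_characters": {"type": "non-terminal"}}}
    print(sorted(symbols_from_config(conf, ["token"])))   # ['white_space']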
@@ -1094,7 +1101,7 @@ def grammar_create_y(grammar, opts):
    indent = '\t' * (spaces / 8)

    conf = opts['config']
-    tokens = tokens_from_config(conf)
+    conf_tokens = symbols_from_config(conf, [ c_token, c_non_terminal ])

    out = ""

@@ -1144,7 +1151,7 @@ def grammar_create_y(grammar, opts):

    types = grammar_get_types(grammar)
    for t in types.keys():
-        if t in tokens:
+        if conf is not None and t in conf['symbols'].keys():
            continue
        out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
    out += '\n'
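For illustration, a hand-run of the member-emitting line above, using the namespace gt set via NAMESPACE_IN_GENERATED in the Makefile; "identifier" is an illustrative symbol name, not necessarily one the generator produces:

    # What out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
    # appends for one symbol.
    opts = {"namespace": "gt"}
    t = "identifier"
    print("\t" + opts["namespace"] + "::" + t + "_t *" + t + ";")
    # output:   gt::identifier_t *identifier;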
@@ -1183,6 +1190,7 @@ def grammar_create_y(grammar, opts):
        out += format_token(p.sym, t) +'\n'

    # tokens from config
    if conf is not None:
        for k, t in conf['symbols'].iteritems():
+            slog(NOTICE, "adding token", k)
            out += format_token(k, "blah") + '\n'
@@ -1190,7 +1198,7 @@ def grammar_create_y(grammar, opts):
    # types
    out += '\n'
    for t, p in grammar.iteritems():
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
            continue
        if p.tp == p_ruleset:
            out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
@@ -1212,7 +1220,7 @@ def grammar_create_y(grammar, opts):
            continue
        if p.tp == p_special:
            continue
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
            continue
        slog(INFO, "creating production for symbol", p.str())

@@ -1240,10 +1248,12 @@ def grammar_create_y(grammar, opts):
                tokens.append(c.token)
        idx = 0
        for c in rule:
-            n += 1
            if c.tp == t_grammar:
                s.update(c.token, 0)
                continue
+            if c.token in tokens:
+                continue
+            n += 1
            p = grammar[c.token]
            #if is_terminal(c.token) is not None:
            #    continue
@@ -1327,8 +1337,15 @@ def grammar_create_l(grammar, opts):
        assert p.term[-1] in [ '"', "'" ], p.term
        out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'

    if conf is not None:
        for k, v in conf['symbols'].iteritems():
-            out += v['regex'].value() + ' { slog(PRI_NOTICE, "found regex ' + k + '"); return ' + k + '; }\n'
+            lex_as = v.get('lex_as')
+            if lex_as is not None:
+                retval = lex_as.value()
+            else:
+                retval = k
+            regex = v['regex'].value()
+            out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + k + '"); return ' + retval + '; }\n'

    #out += textwrap.dedent("""\
    #
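A hand-run of the new emission line above for the white_space entry from generate.conf; simplified in that the wrapped config values are plain strings here, and lex_as is unset, as in the shipped config:

    # Reproduces the out += line above for one config entry.
    k, regex, lex_as = "white_space", r"[ \n\t\r]+", None
    retval = lex_as if lex_as is not None else k
    print(regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex
          + '\\" for ' + k + '"); return ' + retval + '; }')
    # [ \n\t\r]+ { slog(PRI_NOTICE, "found regex \"[ \n\t\r]+\" for white_space"); return white_space; }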
@@ -1464,7 +1481,7 @@ def grammar_create_l(grammar, opts):
def grammar_create_h(grammar, opts):
    out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
    ns = opts['namespace']
-    tokens = tokens_from_config(opts['config'])
+    tokens = symbols_from_config(opts['config'], "all")

    if ns is not None:
        out += 'namespace ' + ns + '{\n\n'
@@ -1508,6 +1525,8 @@ def grammar_create_h(grammar, opts):
            if rule.count(m) > 1:
                idx += 1
                suffix = '_' + str(idx)
            if m in tokens:
                continue
            p = grammar[m]
            out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
        out += '\n\t};'
@@ -1575,10 +1594,10 @@ class GrammarCmd(jwutils.Cmd):
        if args.fix_extensions not in fix_extensions_mode:
            raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
        grammar = grammar_fix_extensions(grammar, args.fix_extensions)
-        if args.unroll_alternatives:
-            grammar = grammar_unroll_alternatives(grammar)
        if args.unroll_lists:
            grammar = grammar_unroll_lists(grammar)
+        if args.unroll_alternatives:
+            grammar = grammar_unroll_alternatives(grammar)
        if args.unroll_options:
            grammar = grammar_unroll_options(grammar)
        if len(args.check_symbols):