From a3a8313ce84377101d8fa75fd0ae362a7c67d88b Mon Sep 17 00:00:00 2001
From: Jan Lindemann
Date: Thu, 2 Nov 2017 13:47:19 +0100
Subject: [PATCH] grammar.py and friends: Make list parsing run through

For the first time, parsing completes without a syntax error. No usable
AST is produced yet, though: strings are not returned from the lexer,
and AST lists aren't really lists.

TEXT:="Hello world!"; had to be excluded from the example, because I
don't see how it could be parsed with the given syntax. There's a
special sequence "all visible characters", but any lexer regex I could
think of will also match the types defining "alphabetic character" and
return the respective tokens (e.g. T_A), or vice versa, depending on
the order of the rules in the lexer input file. I suppose the only
sensible way to handle this is to define "all visible characters" by
defining tokens for the missing characters, and then to use them
alongside T_A ... T_Z or their derived types.

Signed-off-by: Jan Lindemann
---
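Note (commentary, not part of the commit message): the ordering problem
described above can be sketched in a few lines of Python. The token
names and patterns below are made up for illustration, and flex's real
matching rule ("longest match wins, rule order breaks ties") is reduced
to "first listed rule that matches", which is equivalent for
single-character patterns:

    import re

    # Two hypothetical orderings of the same single-character rules.
    specific_first = [("T_A", r"[Aa]"), ("T_ANY_CHAR", r"[ -~]")]
    catchall_first = [("T_ANY_CHAR", r"[ -~]"), ("T_A", r"[Aa]")]

    def first_token(rules, text):
        # Return the name of the first rule whose regex matches.
        for name, pattern in rules:
            if re.match(pattern, text):
                return name
        return None

    print(first_token(specific_first, "A"))  # T_A: 'A' never lexes as "any character"
    print(first_token(catchall_first, "A"))  # T_ANY_CHAR: 'A' never lexes as T_A

Whichever order is chosen, one of the overlapping token kinds becomes
unreachable, hence the suggestion to define tokens only for the
characters missing from T_A ... T_Z and to build "all visible
characters" from both sets.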
 make/generate-flex-bison.mk         | 20 +++++------
 test/grammar/Makefile               |  3 ++
 test/grammar/generate.conf          | 10 ++++++
 test/grammar/grammartest-input.ebnf |  2 +-
 test/grammar/grammartest.code       |  1 -
 tools/python/jwutils/grammar.py     | 55 +++++++++++++++++++----------
 6 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/make/generate-flex-bison.mk b/make/generate-flex-bison.mk
index 5f05d7f..1d34848 100644
--- a/make/generate-flex-bison.mk
+++ b/make/generate-flex-bison.mk
@@ -34,16 +34,16 @@ GENERATE ?= python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL
 	--fix-extensions $(FIX_EXTENSIONS) \
 	--unroll-lists \
 	--unroll-options \
-        --unroll-alternatives \
-        --replace-whitespace \
-        $(OPT_CHECK_SYMBOLS) \
-        $(OPT_CONFIG_FILE) \
-        --trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
-        --cut-symbols=$(shell echo $(CUT_SYMBOLS) | sed 's/ */,/g') \
-        --irrelevant-symbols=$(shell echo $(IRRELEVANT_SYMBOLS) | sed 's/ */,/g') \
-        --namespace=$(NAMESPACE_IN_GENERATED) \
-        --includes=$(shell echo $(INCLUDED_BY_GENERATED) | sed 's/ */,/g') \
-        $(CREATE_EXTRA_ARGS)
+	--unroll-alternatives \
+	--replace-whitespace \
+	$(OPT_CHECK_SYMBOLS) \
+	$(OPT_CONFIG_FILE) \
+	--trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
+	--cut-symbols=$(shell echo $(CUT_SYMBOLS) | sed 's/ */,/g') \
+	--irrelevant-symbols=$(shell echo $(IRRELEVANT_SYMBOLS) | sed 's/ */,/g') \
+	--namespace=$(NAMESPACE_IN_GENERATED) \
+	--includes=$(shell echo $(INCLUDED_BY_GENERATED) | sed 's/ */,/g') \
+	$(CREATE_EXTRA_ARGS)
 include $(TOPDIR)/make/proj.mk
 include $(MODDIR)/make/flex-bison.mk
 include $(MODDIR)/make/py-defs.mk
diff --git a/test/grammar/Makefile b/test/grammar/Makefile
index 9d687e8..8c57a37 100644
--- a/test/grammar/Makefile
+++ b/test/grammar/Makefile
@@ -1,10 +1,13 @@
 TOPDIR = ../..
+-include local.mk
+
 EXE_ARGS ?= grammartest.code
 PREREQ_BUILD += ytools
 
 FB_NAME = grammartest
 NAMESPACE_IN_GENERATED = gt
 GENERATE_CONFIG_FILE = generate.conf
+IRRELEVANT_SYMBOLS ?= white_space
 
 include $(TOPDIR)/make/proj.mk
 include $(TOPDIR)/make/generate-flex-bison.mk
diff --git a/test/grammar/generate.conf b/test/grammar/generate.conf
index 6e5dfa0..1f63307 100644
--- a/test/grammar/generate.conf
+++ b/test/grammar/generate.conf
@@ -1,8 +1,18 @@
 [symbols]
+
 	[white_space[
+		type = token
 		regex = "[ \n\t\r]+"
 	]
+
+	[all_characters[
+		type = non-terminal
+		regex = "[[:print:]]"
+		#lex_as = yytext[0]
+	]
+
 	[test[
+		type = token
 		dings = bums
 		regex = "bumsdings"
 	]
diff --git a/test/grammar/grammartest-input.ebnf b/test/grammar/grammartest-input.ebnf
index acbc89e..e06d923 100644
--- a/test/grammar/grammartest-input.ebnf
+++ b/test/grammar/grammartest-input.ebnf
@@ -2,7 +2,7 @@
 program = 'PROGRAM', white space, identifier, white space,
           'BEGIN', white space,
           { assignment, ";", white space },
-          'END.' ;
+          'END.', [ white space ];
 identifier = alphabetic character, { alphabetic character | digit } ;
 number = [ "-" ], digit, { digit } ;
 string = '"' , { all characters }, '"' ;
diff --git a/test/grammar/grammartest.code b/test/grammar/grammartest.code
index 2545cbd..b6f8c4d 100644
--- a/test/grammar/grammartest.code
+++ b/test/grammar/grammartest.code
@@ -6,5 +6,4 @@ BEGIN
 	C:=A;
 	D123:=B34A;
 	BABOON:=GIRAFFE;
-	TEXT:="Hello world!";
 END.
diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py
index 1567521..39f6cf8 100644
--- a/tools/python/jwutils/grammar.py
+++ b/tools/python/jwutils/grammar.py
@@ -34,6 +34,9 @@
 mode_keep = "keep"
 mode_discard = "discard"
 fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
+c_token = "token"
+c_non_terminal = "non-terminal"
+
 member_prefix = ''
 
 special_terminals = {
@@ -638,23 +641,25 @@ def grammar_unroll_lists(grammar):
             if c.token == '}':
                 if len(listrule) == 0:
                     raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
-                name = ""
                 delpos = []
+                name = "list"
                 for i, rule in enumerate(listrule):
                     if rule.token in delimiters:
                         delpos.append(i)
                         continue
                     if rule.tp != t_target_lang:
                         continue
-                    name += tok2name(rule.token) + "_"
+                    name += "_" + tok2name(rule.token)
 
                 # not really: there are lists without delimiters, too
                 #if len(delpos) != 1:
                 #    p.dump(ERR)
                 #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
-                name = name + "my_list"
                 newrule.append(RuleComp(name, t_target_lang))
+                listrule.insert(0, RuleComp('(', t_grammar))
+                listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
+                listrule.append(RuleComp(')', t_grammar))
 
                 p = Symbol(name, rules=[[], listrule])
                 #p = Symbol(name)
                 #p.rules = [ [], listrule ]
@@ -1065,7 +1070,9 @@
         out += ' ' * indent + ' ;\n'
     return out
 
-def tokens_from_config(conf):
+def symbols_from_config(conf, types = None):
+    if types == None or types == "all":
+        types = [ c_token, c_non_terminal ]
     r = set()
     if conf is None:
         return r
@@ -1073,7 +1080,7 @@
     if symbols is None:
         return r
     for k, v in symbols.iteritems():
-        if v.get('regex') is not None:
+        if v["type"].value() in types:
             r.add(k)
     return r
 
@@ -1094,7 +1101,7 @@
     indent = '\t' * (spaces / 8)
 
     conf = opts['config']
-    tokens = tokens_from_config(conf)
+    conf_tokens = symbols_from_config(conf, [ c_token, c_non_terminal ])
 
     out = ""
 
@@ -1144,7 +1151,7 @@ def grammar_create_y(grammar, opts):
 
     types = grammar_get_types(grammar)
     for t in types.keys():
-        if t in tokens:
+        if conf is not None and t in conf['symbols'].keys():
             continue
         out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
     out += '\n'
@@ -1183,14 +1190,15 @@
             out += format_token(p.sym, t) +'\n'
 
     # tokens from config
-    for k, t in conf['symbols'].iteritems():
-        slog(NOTICE, "adding token", k)
-        out += format_token(k, "blah") + '\n'
+    if conf is not None:
+        for k, t in conf['symbols'].iteritems():
+            slog(NOTICE, "adding token", k)
+            out += format_token(k, "blah") + '\n'
 
     # types
     out += '\n'
     for t, p in grammar.iteritems():
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
             continue
         if p.tp == p_ruleset:
             out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
@@ -1212,7 +1220,7 @@
             continue
         if p.tp == p_special:
             continue
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
             continue
 
         slog(INFO, "creating production for symbol", p.str())
@@ -1240,10 +1248,12 @@
                 tokens.append(c.token)
             idx = 0
             for c in rule:
-                n += 1
                 if c.tp == t_grammar:
                     s.update(c.token, 0)
                     continue
+                if c.token in tokens:
+                    continue
+                n += 1
                 p = grammar[c.token]
                 #if is_terminal(c.token) is not None:
                 #    continue
@@ -1327,8 +1337,15 @@
         assert p.term[-1] in [ '"', "'" ], p.term
         out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'
 
-    for k, v in conf['symbols'].iteritems():
-        out += v['regex'].value() + ' { slog(PRI_NOTICE, "found regex ' + k + '"); return ' + k + '; }\n'
+    if conf is not None:
+        for k, v in conf['symbols'].iteritems():
+            lex_as = v.get('lex_as')
+            if lex_as is not None:
+                retval = lex_as.value()
+            else:
+                retval = k
+            regex = v['regex'].value()
+            out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + k + '"); return ' + retval + '; }\n'
 
     #out += textwrap.dedent("""\
     #
@@ -1464,7 +1481,7 @@
 def grammar_create_h(grammar, opts):
     out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
     ns = opts['namespace']
-    tokens = tokens_from_config(opts['config'])
+    tokens = symbols_from_config(opts['config'], "all")
 
     if ns is not None:
         out += 'namespace ' + ns + '{\n\n'
@@ -1508,6 +1525,8 @@
             if rule.count(m) > 1:
                 idx += 1
                 suffix = '_' + str(idx)
+            if m in tokens:
+                continue
             p = grammar[m]
             out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
         out += '\n\t};'
@@ -1575,10 +1594,10 @@ class GrammarCmd(jwutils.Cmd):
         if args.fix_extensions not in fix_extensions_mode:
             raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
         grammar = grammar_fix_extensions(grammar, args.fix_extensions)
-        if args.unroll_alternatives:
-            grammar = grammar_unroll_alternatives(grammar)
         if args.unroll_lists:
             grammar = grammar_unroll_lists(grammar)
+        if args.unroll_alternatives:
+            grammar = grammar_unroll_alternatives(grammar)
         if args.unroll_options:
             grammar = grammar_unroll_options(grammar)
         if len(args.check_symbols):
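
P.S. (commentary, not part of the patch): the grammar_unroll_lists()
change above makes the generated list symbol refer to itself, i.e. it
turns an EBNF repetition '{ body }' into a left-recursive production
that bison can iterate. A much-simplified sketch of that rewrite, using
plain strings instead of the real Symbol/RuleComp objects:

    def unroll_list(body):
        # Name the list symbol after its non-delimiter members, roughly
        # as the patched code does ("list" plus "_<token>" per member).
        name = "list" + "".join(
            "_" + tok for tok in body if tok.isidentifier())
        # Two alternatives: empty (terminates the recursion), or the
        # list itself followed by one more occurrence of the body.
        rules = [[], [name] + body]
        return name, rules

    name, rules = unroll_list(["assignment", ";", "white_space"])
    print(name)   # list_assignment_white_space
    print(rules)  # [[], ['list_assignment_white_space', 'assignment', ';', 'white_space']]

The real code additionally wraps the recursive alternative in
'(' ... ')' grammar tokens, and the reordering in the last hunk runs
--unroll-lists before --unroll-alternatives, presumably so that the
alternatives introduced by this rewrite still get unrolled.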