From a3a8313ce84377101d8fa75fd0ae362a7c67d88b Mon Sep 17 00:00:00 2001
From: Jan Lindemann
Date: Thu, 2 Nov 2017 13:47:19 +0100
Subject: [PATCH] grammar.py and friends: Make list parsing run through

For the first time, parsing completes without a syntax error. No usable
AST is produced yet, though: strings are not returned from the lexer,
and AST lists aren't really lists.

TEXT:="Hello world!"; had to be excluded from the example, because I
don't see how it could be parsed with the given syntax. There's a
special sequence "all visible characters", but any lexer regex I could
think of will also match the types defining "alphabetic character" and
return the respective tokens (e.g. T_A), or vice versa, depending on
the order of the rules in the lexer input file. I suppose the only
sensible way to handle this is to define "all visible characters" by
defining tokens for the missing characters, and then to use them
alongside T_A ... T_Z or their derived types.

Signed-off-by: Jan Lindemann
---
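Note (commentary, not part of the commit message): the ordering problem
described above can be sketched in a few lines of Python. The token
names and patterns below are made up for illustration, and flex's real
matching rule ("longest match wins, rule order breaks ties") is reduced
to "first listed rule that matches", which is equivalent for
single-character patterns:

    import re

    # Two hypothetical orderings of the same single-character rules.
    specific_first = [("T_A", r"[Aa]"), ("T_ANY_CHAR", r"[ -~]")]
    catchall_first = [("T_ANY_CHAR", r"[ -~]"), ("T_A", r"[Aa]")]

    def first_token(rules, text):
        # Return the name of the first rule whose regex matches.
        for name, pattern in rules:
            if re.match(pattern, text):
                return name
        return None

    print(first_token(specific_first, "A"))  # T_A: 'A' never lexes as "any character"
    print(first_token(catchall_first, "A"))  # T_ANY_CHAR: 'A' never lexes as T_A

Whichever order is chosen, one of the overlapping token kinds becomes
unreachable, hence the suggestion to define tokens only for the
characters missing from T_A ... T_Z and to build "all visible
characters" from both sets.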
 make/generate-flex-bison.mk         | 20 +++++------
 test/grammar/Makefile               |  3 ++
 test/grammar/generate.conf          | 10 ++++++
 test/grammar/grammartest-input.ebnf |  2 +-
 test/grammar/grammartest.code       |  1 -
 tools/python/jwutils/grammar.py     | 55 +++++++++++++++++++----------
 6 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/make/generate-flex-bison.mk b/make/generate-flex-bison.mk
index 5f05d7f..1d34848 100644
--- a/make/generate-flex-bison.mk
+++ b/make/generate-flex-bison.mk
@@ -34,16 +34,16 @@ GENERATE ?= python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL
 	--fix-extensions $(FIX_EXTENSIONS) \
 	--unroll-lists \
 	--unroll-options \
-        --unroll-alternatives \
-        --replace-whitespace \
-        $(OPT_CHECK_SYMBOLS) \
-        $(OPT_CONFIG_FILE) \
-        --trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
-        --cut-symbols=$(shell echo $(CUT_SYMBOLS) | sed 's/ */,/g') \
-        --irrelevant-symbols=$(shell echo $(IRRELEVANT_SYMBOLS) | sed 's/ */,/g') \
-        --namespace=$(NAMESPACE_IN_GENERATED) \
-        --includes=$(shell echo $(INCLUDED_BY_GENERATED) | sed 's/ */,/g') \
-        $(CREATE_EXTRA_ARGS)
+	--unroll-alternatives \
+	--replace-whitespace \
+	$(OPT_CHECK_SYMBOLS) \
+	$(OPT_CONFIG_FILE) \
+	--trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
+	--cut-symbols=$(shell echo $(CUT_SYMBOLS) | sed 's/ */,/g') \
+	--irrelevant-symbols=$(shell echo $(IRRELEVANT_SYMBOLS) | sed 's/ */,/g') \
+	--namespace=$(NAMESPACE_IN_GENERATED) \
+	--includes=$(shell echo $(INCLUDED_BY_GENERATED) | sed 's/ */,/g') \
+	$(CREATE_EXTRA_ARGS)
 include $(TOPDIR)/make/proj.mk
 include $(MODDIR)/make/flex-bison.mk
 include $(MODDIR)/make/py-defs.mk
diff --git a/test/grammar/Makefile b/test/grammar/Makefile
index 9d687e8..8c57a37 100644
--- a/test/grammar/Makefile
+++ b/test/grammar/Makefile
@@ -1,10 +1,13 @@
 TOPDIR = ../..
+-include local.mk
+
 EXE_ARGS ?= grammartest.code
 PREREQ_BUILD += ytools
 
 FB_NAME = grammartest
 NAMESPACE_IN_GENERATED = gt
 GENERATE_CONFIG_FILE = generate.conf
+IRRELEVANT_SYMBOLS ?= white_space
 
 include $(TOPDIR)/make/proj.mk
 include $(TOPDIR)/make/generate-flex-bison.mk
diff --git a/test/grammar/generate.conf b/test/grammar/generate.conf
index 6e5dfa0..1f63307 100644
--- a/test/grammar/generate.conf
+++ b/test/grammar/generate.conf
@@ -1,8 +1,18 @@
 [symbols]
+
 	[white_space[
+		type = token
 		regex = "[ \n\t\r]+"
 	]
+
+	[all_characters[
+		type = non-terminal
+		regex = "[[:print:]]"
+		#lex_as = yytext[0]
+	]
+
 	[test[
+		type = token
 		dings = bums
 		regex = "bumsdings"
 	]
diff --git a/test/grammar/grammartest-input.ebnf b/test/grammar/grammartest-input.ebnf
index acbc89e..e06d923 100644
--- a/test/grammar/grammartest-input.ebnf
+++ b/test/grammar/grammartest-input.ebnf
@@ -2,7 +2,7 @@
 program = 'PROGRAM', white space, identifier, white space,
           'BEGIN', white space,
           { assignment, ";", white space },
-          'END.' ;
+          'END.', [ white space ];
 identifier = alphabetic character, { alphabetic character | digit } ;
 number = [ "-" ], digit, { digit } ;
 string = '"' , { all characters }, '"' ;
diff --git a/test/grammar/grammartest.code b/test/grammar/grammartest.code
index 2545cbd..b6f8c4d 100644
--- a/test/grammar/grammartest.code
+++ b/test/grammar/grammartest.code
@@ -6,5 +6,4 @@ BEGIN
 	C:=A;
 	D123:=B34A;
 	BABOON:=GIRAFFE;
-	TEXT:="Hello world!";
 END.
diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py
index 1567521..39f6cf8 100644
--- a/tools/python/jwutils/grammar.py
+++ b/tools/python/jwutils/grammar.py
@@ -34,6 +34,9 @@
 mode_keep = "keep"
 mode_discard = "discard"
 fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
+c_token = "token"
+c_non_terminal = "non-terminal"
+
 member_prefix = ''
 
 special_terminals = {
@@ -638,23 +641,25 @@ def grammar_unroll_lists(grammar):
             if c.token == '}':
                 if len(listrule) == 0:
                     raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
-                name = ""
                 delpos = []
+                name = "list"
                 for i, rule in enumerate(listrule):
                     if rule.token in delimiters:
                         delpos.append(i)
                         continue
                     if rule.tp != t_target_lang:
                         continue
-                    name += tok2name(rule.token) + "_"
+                    name += "_" + tok2name(rule.token)
 
                 # not really: there are lists without delimiters, too
                 #if len(delpos) != 1:
                 #    p.dump(ERR)
                 #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
-                name = name + "my_list"
                 newrule.append(RuleComp(name, t_target_lang))
+                listrule.insert(0, RuleComp('(', t_grammar))
+                listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
+                listrule.append(RuleComp(')', t_grammar))
 
                 p = Symbol(name, rules=[[], listrule])
                 #p = Symbol(name)
                 #p.rules = [ [], listrule ]
@@ -1065,7 +1070,9 @@
         out += ' ' * indent + ' ;\n'
     return out
 
-def tokens_from_config(conf):
+def symbols_from_config(conf, types = None):
+    if types == None or types == "all":
+        types = [ c_token, c_non_terminal ]
     r = set()
     if conf is None:
         return r
@@ -1073,7 +1080,7 @@
     if symbols is None:
         return r
     for k, v in symbols.iteritems():
-        if v.get('regex') is not None:
+        if v["type"].value() in types:
             r.add(k)
     return r
 
@@ -1094,7 +1101,7 @@
     indent = '\t' * (spaces / 8)
 
     conf = opts['config']
-    tokens = tokens_from_config(conf)
+    conf_tokens = symbols_from_config(conf, [ c_token, c_non_terminal ])
 
     out = ""
 
@@ -1144,7 +1151,7 @@ def grammar_create_y(grammar, opts):
 
     types = grammar_get_types(grammar)
     for t in types.keys():
-        if t in tokens:
+        if conf is not None and t in conf['symbols'].keys():
             continue
         out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
     out += '\n'
@@ -1183,14 +1190,15 @@
             out += format_token(p.sym, t) +'\n'
 
     # tokens from config
-    for k, t in conf['symbols'].iteritems():
-        slog(NOTICE, "adding token", k)
-        out += format_token(k, "blah") + '\n'
+    if conf is not None:
+        for k, t in conf['symbols'].iteritems():
+            slog(NOTICE, "adding token", k)
+            out += format_token(k, "blah") + '\n'
 
     # types
     out += '\n'
     for t, p in grammar.iteritems():
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
             continue
         if p.tp == p_ruleset:
             out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
@@ -1212,7 +1220,7 @@
             continue
         if p.tp == p_special:
             continue
-        if p.sym in conf['symbols'].keys():
+        if conf is not None and p.sym in conf['symbols'].keys():
             continue
 
         slog(INFO, "creating production for symbol", p.str())
@@ -1240,10 +1248,12 @@
                 tokens.append(c.token)
             idx = 0
             for c in rule:
-                n += 1
                 if c.tp == t_grammar:
                     s.update(c.token, 0)
                     continue
+                if c.token in tokens:
+                    continue
+                n += 1
                 p = grammar[c.token]
                 #if is_terminal(c.token) is not None:
                 #    continue
@@ -1327,8 +1337,15 @@
         assert p.term[-1] in [ '"', "'" ], p.term
         out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'
 
-    for k, v in conf['symbols'].iteritems():
-        out += v['regex'].value() + ' { slog(PRI_NOTICE, "found regex ' + k + '"); return ' + k + '; }\n'
+    if conf is not None:
+        for k, v in conf['symbols'].iteritems():
+            lex_as = v.get('lex_as')
+            if lex_as is not None:
+                retval = lex_as.value()
+            else:
+                retval = k
+            regex = v['regex'].value()
+            out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + k + '"); return ' + retval + '; }\n'
 
     #out += textwrap.dedent("""\
     #
@@ -1464,7 +1481,7 @@
 def grammar_create_h(grammar, opts):
     out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
     ns = opts['namespace']
-    tokens = tokens_from_config(opts['config'])
+    tokens = symbols_from_config(opts['config'], "all")
 
     if ns is not None:
         out += 'namespace ' + ns + '{\n\n'
@@ -1508,6 +1525,8 @@
             if rule.count(m) > 1:
                 idx += 1
                 suffix = '_' + str(idx)
+            if m in tokens:
+                continue
             p = grammar[m]
             out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
         out += '\n\t};'
@@ -1575,10 +1594,10 @@ class GrammarCmd(jwutils.Cmd):
         if args.fix_extensions not in fix_extensions_mode:
             raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
         grammar = grammar_fix_extensions(grammar, args.fix_extensions)
-        if args.unroll_alternatives:
-            grammar = grammar_unroll_alternatives(grammar)
         if args.unroll_lists:
             grammar = grammar_unroll_lists(grammar)
+        if args.unroll_alternatives:
+            grammar = grammar_unroll_alternatives(grammar)
         if args.unroll_options:
             grammar = grammar_unroll_options(grammar)
         if len(args.check_symbols):
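
P.S. (commentary, not part of the patch): the grammar_unroll_lists()
change above makes the generated list symbol refer to itself, i.e. it
turns an EBNF repetition '{ body }' into a left-recursive production
that bison can iterate. A much-simplified sketch of that rewrite, using
plain strings instead of the real Symbol/RuleComp objects:

    def unroll_list(body):
        # Name the list symbol after its non-delimiter members, roughly
        # as the patched code does ("list" plus "_<token>" per member).
        name = "list" + "".join(
            "_" + tok for tok in body if tok.isidentifier())
        # Two alternatives: empty (terminates the recursion), or the
        # list itself followed by one more occurrence of the body.
        rules = [[], [name] + body]
        return name, rules

    name, rules = unroll_list(["assignment", ";", "white_space"])
    print(name)   # list_assignment_white_space
    print(rules)  # [[], ['list_assignment_white_space', 'assignment', ';', 'white_space']]

The real code additionally wraps the recursive alternative in
'(' ... ')' grammar tokens, and the reordering in the last hunk runs
--unroll-lists before --unroll-alternatives, presumably so that the
alternatives introduced by this rewrite still get unrolled.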