grammar.py and friends: Make list parsing run through

For the first time, parsing no longer aborts with a syntax error. No usable AST
is produced yet, strings are not returned from the lexer, and the AST lists
aren't really lists yet.

TEXT:="Hello world!"; had to be excluded from the example, because I don't see
how it could be parsed with the given syntax. There is a special sequence
"all visible characters", but any lexer regex I could think of for it also
matches the types that define "alphabetic character" and returns the
respective tokens (e.g. T_A), or vice versa, depending on the order of the
rules in the lexer input file. I suppose the only sensible way to handle this
is to define "all visible characters" by adding tokens for the characters that
are still missing, and then to use them alongside T_A ... T_Z or their derived
types.
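
As a minimal sketch (plain Python with hypothetical token names, not the
generated flex lexer), this is the ordering problem in isolation: with a
first-match rule table, whichever pattern is listed first claims the
character, so a broad "all visible characters" rule and a per-character rule
like T_A can only shadow one another.

import re

# Hypothetical rule tables; the order mimics the order of rules in the lexer input file.
broad_first  = [("ALL_CHARACTERS", r"[ -~]"), ("T_A", r"A")]
narrow_first = [("T_A", r"A"), ("ALL_CHARACTERS", r"[ -~]")]

def first_match(rules, text):
    # Return the token of the first regex matching at the start of text
    # (simplified; flex picks the longest match, then the earlier rule).
    for token, pattern in rules:
        if re.match(pattern, text):
            return token
    return None

print(first_match(broad_first, "A"))   # ALL_CHARACTERS shadows T_A
print(first_match(narrow_first, "A"))  # T_A shadows ALL_CHARACTERS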

Signed-off-by: Jan Lindemann <jan@janware.com>
Jan Lindemann 2017-11-02 13:47:19 +01:00
commit a3a8313ce8
6 changed files with 61 additions and 30 deletions

View file

@@ -1,10 +1,13 @@
 TOPDIR = ../..
+-include local.mk
 EXE_ARGS ?= grammartest.code
 PREREQ_BUILD += ytools
 FB_NAME = grammartest
 NAMESPACE_IN_GENERATED = gt
 GENERATE_CONFIG_FILE = generate.conf
+IRRELEVANT_SYMBOLS ?= white_space
 include $(TOPDIR)/make/proj.mk
 include $(TOPDIR)/make/generate-flex-bison.mk

View file

@@ -1,8 +1,18 @@
 [symbols]
 [white_space[
+type = token
 regex = "[ \n\t\r]+"
 ]
+[all_characters[
+type = non-terminal
+regex = "[[:print:]]"
+#lex_as = yytext[0]
+]
 [test[
+type = token
 dings = bums
 regex = "bumsdings"
 ]

View file

@@ -2,7 +2,7 @@
 program = 'PROGRAM', white space, identifier, white space,
 'BEGIN', white space,
 { assignment, ";", white space },
-'END.' ;
+'END.', [ white space ];
 identifier = alphabetic character, { alphabetic character | digit } ;
 number = [ "-" ], digit, { digit } ;
 string = '"' , { all characters }, '"' ;

View file

@@ -6,5 +6,4 @@ BEGIN
 C:=A;
 D123:=B34A;
 BABOON:=GIRAFFE;
-TEXT:="Hello world!";
 END.

View file

@@ -34,6 +34,9 @@ mode_keep = "keep"
 mode_discard = "discard"
 fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
+c_token = "token"
+c_non_terminal = "non-terminal"
 member_prefix = ''
 special_terminals = {
@@ -638,23 +641,25 @@ def grammar_unroll_lists(grammar):
 if c.token == '}':
 if len(listrule) == 0:
 raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
-name = ""
 delpos = []
+name = "list"
 for i, rule in enumerate(listrule):
 if rule.token in delimiters:
 delpos.append(i)
 continue
 if rule.tp != t_target_lang:
 continue
-name += tok2name(rule.token) + "_"
+name += "_" + tok2name(rule.token)
 # not really: there are lists without delimiters, too
 #if len(delpos) != 1:
 # p.dump(ERR)
 # raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
-name = name + "my_list"
 newrule.append(RuleComp(name, t_target_lang))
+listrule.insert(0, RuleComp('(', t_grammar))
+listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
+listrule.append(RuleComp(')', t_grammar))
 p = Symbol(name, rules=[[], listrule])
 #p = Symbol(name)
 #p.rules = [ [], listrule ]
@@ -1065,7 +1070,9 @@ def grammar_create_ebnf(grammar, opts):
 out += ' ' * indent + ' ;\n'
 return out
-def tokens_from_config(conf):
+def symbols_from_config(conf, types = None):
+if types == None or types == "all":
+types = [ c_token, c_non_terminal ]
 r = set()
 if conf is None:
 return r
@@ -1073,7 +1080,7 @@ def tokens_from_config(conf):
 if symbols is None:
 return r
 for k, v in symbols.iteritems():
-if v.get('regex') is not None:
+if v["type"].value() in types:
 r.add(k)
 return r
@@ -1094,7 +1101,7 @@ def grammar_create_y(grammar, opts):
 indent = '\t' * (spaces / 8)
 conf = opts['config']
-tokens = tokens_from_config(conf)
+conf_tokens = symbols_from_config(conf, [ c_token, c_non_terminal ])
 out = ""
@@ -1144,7 +1151,7 @@ def grammar_create_y(grammar, opts):
 types = grammar_get_types(grammar)
 for t in types.keys():
-if t in tokens:
+if conf is not None and t in conf['symbols'].keys():
 continue
 out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
 out += '\n'
@@ -1183,6 +1190,7 @@ def grammar_create_y(grammar, opts):
 out += format_token(p.sym, t) +'\n'
 # tokens from config
+if conf is not None:
 for k, t in conf['symbols'].iteritems():
 slog(NOTICE, "adding token", k)
 out += format_token(k, "blah") + '\n'
@@ -1190,7 +1198,7 @@ def grammar_create_y(grammar, opts):
 # types
 out += '\n'
 for t, p in grammar.iteritems():
-if p.sym in conf['symbols'].keys():
+if conf is not None and p.sym in conf['symbols'].keys():
 continue
 if p.tp == p_ruleset:
 out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
@@ -1212,7 +1220,7 @@ def grammar_create_y(grammar, opts):
 continue
 if p.tp == p_special:
 continue
-if p.sym in conf['symbols'].keys():
+if conf is not None and p.sym in conf['symbols'].keys():
 continue
 slog(INFO, "creating production for symbol", p.str())
@@ -1240,10 +1248,12 @@ def grammar_create_y(grammar, opts):
 tokens.append(c.token)
 idx = 0
 for c in rule:
-n += 1
 if c.tp == t_grammar:
 s.update(c.token, 0)
 continue
+if c.token in tokens:
+continue
+n += 1
 p = grammar[c.token]
 #if is_terminal(c.token) is not None:
 # continue
@@ -1327,8 +1337,15 @@ def grammar_create_l(grammar, opts):
 assert p.term[-1] in [ '"', "'" ], p.term
 out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'
+if conf is not None:
 for k, v in conf['symbols'].iteritems():
-out += v['regex'].value() + ' { slog(PRI_NOTICE, "found regex ' + k + '"); return ' + k + '; }\n'
+lex_as = v.get('lex_as')
+if lex_as is not None:
+retval = lex_as.value()
+else:
+retval = k
+regex = v['regex'].value()
+out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + k + '"); return ' + retval + '; }\n'
 #out += textwrap.dedent("""\
 #
@@ -1464,7 +1481,7 @@ def grammar_create_l(grammar, opts):
 def grammar_create_h(grammar, opts):
 out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
 ns = opts['namespace']
-tokens = tokens_from_config(opts['config'])
+tokens = symbols_from_config(opts['config'], "all")
 if ns is not None:
 out += 'namespace ' + ns + '{\n\n'
@@ -1508,6 +1525,8 @@ def grammar_create_h(grammar, opts):
 if rule.count(m) > 1:
 idx += 1
 suffix = '_' + str(idx)
+if m in tokens:
+continue
 p = grammar[m]
 out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
 out += '\n\t};'
@@ -1575,10 +1594,10 @@ class GrammarCmd(jwutils.Cmd):
 if args.fix_extensions not in fix_extensions_mode:
 raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
 grammar = grammar_fix_extensions(grammar, args.fix_extensions)
-if args.unroll_alternatives:
-grammar = grammar_unroll_alternatives(grammar)
 if args.unroll_lists:
 grammar = grammar_unroll_lists(grammar)
+if args.unroll_alternatives:
+grammar = grammar_unroll_alternatives(grammar)
 if args.unroll_options:
 grammar = grammar_unroll_options(grammar)
 if len(args.check_symbols):