mirror of
ssh://git.janware.com/srv/git/janware/proj/jw-python
synced 2026-01-15 09:53:32 +01:00
grammar.py and friends: Make list parsing run through
This is the first time parsing doesn't error out with a syntax error. No usable AST is produced yet: strings are not returned from the lexer, and AST lists aren't really lists. TEXT:="Hello world!"; had to be excluded from the example, because I don't see how it could be parsed with the given syntax. There's a special sequence "all visible characters", but any lexer regex I could think of will also match the types defining "alphabetic character" and return the respective tokens (e.g. T_A), or vice versa, depending on the order in the lexer input file. I suppose the only sensible way to handle this is to define "all visible characters" by defining tokens for the missing characters, and then use them along with T_A ... T_Z or their derived types. Signed-off-by: Jan Lindemann <jan@janware.com>
This commit is contained in:
parent
6ca2eeef61
commit
a3a8313ce8
6 changed files with 61 additions and 30 deletions
|
|
@ -1,10 +1,13 @@
|
||||||
TOPDIR = ../..
|
TOPDIR = ../..
|
||||||
|
|
||||||
|
-include local.mk
|
||||||
|
|
||||||
EXE_ARGS ?= grammartest.code
|
EXE_ARGS ?= grammartest.code
|
||||||
PREREQ_BUILD += ytools
|
PREREQ_BUILD += ytools
|
||||||
FB_NAME = grammartest
|
FB_NAME = grammartest
|
||||||
NAMESPACE_IN_GENERATED = gt
|
NAMESPACE_IN_GENERATED = gt
|
||||||
GENERATE_CONFIG_FILE = generate.conf
|
GENERATE_CONFIG_FILE = generate.conf
|
||||||
|
IRRELEVANT_SYMBOLS ?= white_space
|
||||||
|
|
||||||
include $(TOPDIR)/make/proj.mk
|
include $(TOPDIR)/make/proj.mk
|
||||||
include $(TOPDIR)/make/generate-flex-bison.mk
|
include $(TOPDIR)/make/generate-flex-bison.mk
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,18 @@
|
||||||
[symbols]
|
[symbols]
|
||||||
|
|
||||||
[white_space[
|
[white_space[
|
||||||
|
type = token
|
||||||
regex = "[ \n\t\r]+"
|
regex = "[ \n\t\r]+"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[all_characters[
|
||||||
|
type = non-terminal
|
||||||
|
regex = "[[:print:]]"
|
||||||
|
#lex_as = yytext[0]
|
||||||
|
]
|
||||||
|
|
||||||
[test[
|
[test[
|
||||||
|
type = token
|
||||||
dings = bums
|
dings = bums
|
||||||
regex = "bumsdings"
|
regex = "bumsdings"
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
program = 'PROGRAM', white space, identifier, white space,
|
program = 'PROGRAM', white space, identifier, white space,
|
||||||
'BEGIN', white space,
|
'BEGIN', white space,
|
||||||
{ assignment, ";", white space },
|
{ assignment, ";", white space },
|
||||||
'END.' ;
|
'END.', [ white space ];
|
||||||
identifier = alphabetic character, { alphabetic character | digit } ;
|
identifier = alphabetic character, { alphabetic character | digit } ;
|
||||||
number = [ "-" ], digit, { digit } ;
|
number = [ "-" ], digit, { digit } ;
|
||||||
string = '"' , { all characters }, '"' ;
|
string = '"' , { all characters }, '"' ;
|
||||||
|
|
|
||||||
|
|
@ -6,5 +6,4 @@ BEGIN
|
||||||
C:=A;
|
C:=A;
|
||||||
D123:=B34A;
|
D123:=B34A;
|
||||||
BABOON:=GIRAFFE;
|
BABOON:=GIRAFFE;
|
||||||
TEXT:="Hello world!";
|
|
||||||
END.
|
END.
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,9 @@ mode_keep = "keep"
|
||||||
mode_discard = "discard"
|
mode_discard = "discard"
|
||||||
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
|
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
|
||||||
|
|
||||||
|
c_token = "token"
|
||||||
|
c_non_terminal = "non-terminal"
|
||||||
|
|
||||||
member_prefix = ''
|
member_prefix = ''
|
||||||
|
|
||||||
special_terminals = {
|
special_terminals = {
|
||||||
|
|
@ -638,23 +641,25 @@ def grammar_unroll_lists(grammar):
|
||||||
if c.token == '}':
|
if c.token == '}':
|
||||||
if len(listrule) == 0:
|
if len(listrule) == 0:
|
||||||
raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
|
raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
|
||||||
name = ""
|
|
||||||
delpos = []
|
delpos = []
|
||||||
|
name = "list"
|
||||||
for i, rule in enumerate(listrule):
|
for i, rule in enumerate(listrule):
|
||||||
if rule.token in delimiters:
|
if rule.token in delimiters:
|
||||||
delpos.append(i)
|
delpos.append(i)
|
||||||
continue
|
continue
|
||||||
if rule.tp != t_target_lang:
|
if rule.tp != t_target_lang:
|
||||||
continue
|
continue
|
||||||
name += tok2name(rule.token) + "_"
|
name += "_" + tok2name(rule.token)
|
||||||
|
|
||||||
# not really: there are lists without delimiters, too
|
# not really: there are lists without delimiters, too
|
||||||
#if len(delpos) != 1:
|
#if len(delpos) != 1:
|
||||||
# p.dump(ERR)
|
# p.dump(ERR)
|
||||||
# raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
|
# raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
|
||||||
|
|
||||||
name = name + "my_list"
|
|
||||||
newrule.append(RuleComp(name, t_target_lang))
|
newrule.append(RuleComp(name, t_target_lang))
|
||||||
|
listrule.insert(0, RuleComp('(', t_grammar))
|
||||||
|
listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
|
||||||
|
listrule.append(RuleComp(')', t_grammar))
|
||||||
p = Symbol(name, rules=[[], listrule])
|
p = Symbol(name, rules=[[], listrule])
|
||||||
#p = Symbol(name)
|
#p = Symbol(name)
|
||||||
#p.rules = [ [], listrule ]
|
#p.rules = [ [], listrule ]
|
||||||
|
|
@ -1065,7 +1070,9 @@ def grammar_create_ebnf(grammar, opts):
|
||||||
out += ' ' * indent + ' ;\n'
|
out += ' ' * indent + ' ;\n'
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def tokens_from_config(conf):
|
def symbols_from_config(conf, types = None):
|
||||||
|
if types == None or types == "all":
|
||||||
|
types = [ c_token, c_non_terminal ]
|
||||||
r = set()
|
r = set()
|
||||||
if conf is None:
|
if conf is None:
|
||||||
return r
|
return r
|
||||||
|
|
@ -1073,7 +1080,7 @@ def tokens_from_config(conf):
|
||||||
if symbols is None:
|
if symbols is None:
|
||||||
return r
|
return r
|
||||||
for k, v in symbols.iteritems():
|
for k, v in symbols.iteritems():
|
||||||
if v.get('regex') is not None:
|
if v["type"].value() in types:
|
||||||
r.add(k)
|
r.add(k)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
@ -1094,7 +1101,7 @@ def grammar_create_y(grammar, opts):
|
||||||
indent = '\t' * (spaces / 8)
|
indent = '\t' * (spaces / 8)
|
||||||
|
|
||||||
conf = opts['config']
|
conf = opts['config']
|
||||||
tokens = tokens_from_config(conf)
|
conf_tokens = symbols_from_config(conf, [ c_token, c_non_terminal ])
|
||||||
|
|
||||||
out = ""
|
out = ""
|
||||||
|
|
||||||
|
|
@ -1144,7 +1151,7 @@ def grammar_create_y(grammar, opts):
|
||||||
|
|
||||||
types = grammar_get_types(grammar)
|
types = grammar_get_types(grammar)
|
||||||
for t in types.keys():
|
for t in types.keys():
|
||||||
if t in tokens:
|
if conf is not None and t in conf['symbols'].keys():
|
||||||
continue
|
continue
|
||||||
out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
|
out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
|
||||||
out += '\n'
|
out += '\n'
|
||||||
|
|
@ -1183,6 +1190,7 @@ def grammar_create_y(grammar, opts):
|
||||||
out += format_token(p.sym, t) +'\n'
|
out += format_token(p.sym, t) +'\n'
|
||||||
|
|
||||||
# tokens from config
|
# tokens from config
|
||||||
|
if conf is not None:
|
||||||
for k, t in conf['symbols'].iteritems():
|
for k, t in conf['symbols'].iteritems():
|
||||||
slog(NOTICE, "adding token", k)
|
slog(NOTICE, "adding token", k)
|
||||||
out += format_token(k, "blah") + '\n'
|
out += format_token(k, "blah") + '\n'
|
||||||
|
|
@ -1190,7 +1198,7 @@ def grammar_create_y(grammar, opts):
|
||||||
# types
|
# types
|
||||||
out += '\n'
|
out += '\n'
|
||||||
for t, p in grammar.iteritems():
|
for t, p in grammar.iteritems():
|
||||||
if p.sym in conf['symbols'].keys():
|
if conf is not None and p.sym in conf['symbols'].keys():
|
||||||
continue
|
continue
|
||||||
if p.tp == p_ruleset:
|
if p.tp == p_ruleset:
|
||||||
out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
|
out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
|
||||||
|
|
@ -1212,7 +1220,7 @@ def grammar_create_y(grammar, opts):
|
||||||
continue
|
continue
|
||||||
if p.tp == p_special:
|
if p.tp == p_special:
|
||||||
continue
|
continue
|
||||||
if p.sym in conf['symbols'].keys():
|
if conf is not None and p.sym in conf['symbols'].keys():
|
||||||
continue
|
continue
|
||||||
slog(INFO, "creating production for symbol", p.str())
|
slog(INFO, "creating production for symbol", p.str())
|
||||||
|
|
||||||
|
|
@ -1240,10 +1248,12 @@ def grammar_create_y(grammar, opts):
|
||||||
tokens.append(c.token)
|
tokens.append(c.token)
|
||||||
idx = 0
|
idx = 0
|
||||||
for c in rule:
|
for c in rule:
|
||||||
n += 1
|
|
||||||
if c.tp == t_grammar:
|
if c.tp == t_grammar:
|
||||||
s.update(c.token, 0)
|
s.update(c.token, 0)
|
||||||
continue
|
continue
|
||||||
|
if c.token in tokens:
|
||||||
|
continue
|
||||||
|
n += 1
|
||||||
p = grammar[c.token]
|
p = grammar[c.token]
|
||||||
#if is_terminal(c.token) is not None:
|
#if is_terminal(c.token) is not None:
|
||||||
# continue
|
# continue
|
||||||
|
|
@ -1327,8 +1337,15 @@ def grammar_create_l(grammar, opts):
|
||||||
assert p.term[-1] in [ '"', "'" ], p.term
|
assert p.term[-1] in [ '"', "'" ], p.term
|
||||||
out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'
|
out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'
|
||||||
|
|
||||||
|
if conf is not None:
|
||||||
for k, v in conf['symbols'].iteritems():
|
for k, v in conf['symbols'].iteritems():
|
||||||
out += v['regex'].value() + ' { slog(PRI_NOTICE, "found regex ' + k + '"); return ' + k + '; }\n'
|
lex_as = v.get('lex_as')
|
||||||
|
if lex_as is not None:
|
||||||
|
retval = lex_as.value()
|
||||||
|
else:
|
||||||
|
retval = k
|
||||||
|
regex = v['regex'].value()
|
||||||
|
out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + k + '"); return ' + retval + '; }\n'
|
||||||
|
|
||||||
#out += textwrap.dedent("""\
|
#out += textwrap.dedent("""\
|
||||||
#
|
#
|
||||||
|
|
@ -1464,7 +1481,7 @@ def grammar_create_l(grammar, opts):
|
||||||
def grammar_create_h(grammar, opts):
|
def grammar_create_h(grammar, opts):
|
||||||
out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
|
out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
|
||||||
ns = opts['namespace']
|
ns = opts['namespace']
|
||||||
tokens = tokens_from_config(opts['config'])
|
tokens = symbols_from_config(opts['config'], "all")
|
||||||
|
|
||||||
if ns is not None:
|
if ns is not None:
|
||||||
out += 'namespace ' + ns + '{\n\n'
|
out += 'namespace ' + ns + '{\n\n'
|
||||||
|
|
@ -1508,6 +1525,8 @@ def grammar_create_h(grammar, opts):
|
||||||
if rule.count(m) > 1:
|
if rule.count(m) > 1:
|
||||||
idx += 1
|
idx += 1
|
||||||
suffix = '_' + str(idx)
|
suffix = '_' + str(idx)
|
||||||
|
if m in tokens:
|
||||||
|
continue
|
||||||
p = grammar[m]
|
p = grammar[m]
|
||||||
out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
|
out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
|
||||||
out += '\n\t};'
|
out += '\n\t};'
|
||||||
|
|
@ -1575,10 +1594,10 @@ class GrammarCmd(jwutils.Cmd):
|
||||||
if args.fix_extensions not in fix_extensions_mode:
|
if args.fix_extensions not in fix_extensions_mode:
|
||||||
raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
|
raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
|
||||||
grammar = grammar_fix_extensions(grammar, args.fix_extensions)
|
grammar = grammar_fix_extensions(grammar, args.fix_extensions)
|
||||||
if args.unroll_alternatives:
|
|
||||||
grammar = grammar_unroll_alternatives(grammar)
|
|
||||||
if args.unroll_lists:
|
if args.unroll_lists:
|
||||||
grammar = grammar_unroll_lists(grammar)
|
grammar = grammar_unroll_lists(grammar)
|
||||||
|
if args.unroll_alternatives:
|
||||||
|
grammar = grammar_unroll_alternatives(grammar)
|
||||||
if args.unroll_options:
|
if args.unroll_options:
|
||||||
grammar = grammar_unroll_options(grammar)
|
grammar = grammar_unroll_options(grammar)
|
||||||
if len(args.check_symbols):
|
if len(args.check_symbols):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue