grammar.py: Add grammar_parse_ebnf_tokens()

Add grammar_parse_ebnf_tokens(), to be used by external grammar parsers
and grammar_parse_ebnf()
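
A rough usage sketch (illustrative only; the jwutils.grammar module path is
the one used by the grammartest driver below, and the sample EBNF line is
taken from the test grammar):

    import jwutils.grammar as g

    ebnf = 'number = [ "-" ], digit, { digit } ;'
    # grammar_tokenize_ebnf() splits the text into (token, line) pairs.
    tokens = g.grammar_tokenize_ebnf(ebnf)
    # grammar_parse_ebnf_tokens() builds an OrderedDict of Symbol objects
    # keyed by production name; an external parser can feed it its own
    # token stream in the same (token, line) format.
    grammar = g.grammar_parse_ebnf_tokens(tokens)

grammar_parse_ebnf() itself is now just grammar_tokenize_ebnf() followed by
grammar_parse_ebnf_tokens().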

Signed-off-by: Jan Lindemann <jan@janware.com>
Author: Jan Lindemann
Date:   2017-10-28 19:16:00 +02:00
Commit: 432d78cdc5
4 changed files with 216 additions and 91 deletions


@@ -1,5 +1,8 @@
TOPDIR = ../..
USE_PROJECT_LIB = true
MEMBERS += local.a($(OBJ))
GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h
# These types are meant to be cut off the tree and turned into hand coded flex


@@ -47,12 +47,6 @@ class GrammarCmd(jwutils.grammar.GrammarCmd):
with open(args.input, 'r') as infile:
contents = infile.read()
grammar = jwutils.grammar.grammar_parse_ebnf(contents)
slog(INFO, "grammar size is", len(grammar))
for t in grammar.keys():
slog(INFO, "key =", t)
slog(INFO, "grammar size is", len(grammar))
jwutils.grammar.dump_grammar(INFO, grammar)
grammar = super(GrammarCmd, self).processGrammar(args, grammar)
self._run(args, grammar)


@@ -5,7 +5,7 @@
'END.' ;
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters - '"' }, '"' ;
string = '"' , { all characters }, '"' ;
assignment = identifier , ":=" , ( number | identifier | string ) ;
alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
| "H" | "I" | "J" | "K" | "L" | "M" | "N"


@@ -22,6 +22,7 @@ p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"
mode_unroll = "unroll"
mode_concat = "concat"
@@ -104,6 +105,11 @@ def cleanup_token(tok):
tok = '"' + tok[1:-1] + '"'
return tok
def tok2ctype(tok):
if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?' ]:
return t_grammar
return t_target_lang
def is_terminal(tok):
size = len(tok)
if size < 2:
@@ -174,13 +180,24 @@ def format_yacc_rule(rule):
r += tok2sym(c.token) + ' '
return r[:-1]
class SourceElement:
def __init__(self, token, line):
self.token = token
self.line = line
class RuleComp:
def __init__(self, token, tp):
def __init__(self, token, tp = None, line=-1):
assert(token is not None)
# assert(token != '|')
self.token = token
if tp is None:
tp = tok2ctype(token)
self.tp = tp
slog(INFO, "creating rule component >" + self.str() + "<")
assert(token != "{ assignment")
self.line = line
def __eq__(self, rhs):
if self.token != rhs.token:
@@ -205,27 +222,54 @@ class RuleComp:
class State:
def __init__(self):
self.curly = 0
self.square = 0
self.reset()
def reset(self):
self.curly = 0
self.square = 0
self.ext = 0
self.group = 0
self.in_comment = False
self.in_special = False
self.production = None
self.rule = []
self.rules = []
def optional(self):
return self.square != 0 or self.curly != 0
def update(self, tok):
if tok == '[':
self.square += 1
elif tok == ']':
self.square -= 1
elif tok == '{':
self.curly += 1
elif tok == '}':
self.curly -= 1
if self.curly < 0 or self.square < 0:
raise Exception("Unbalanced BNF bracket", tok)
def update(self, tok, line):
if not self.in_comment:
if tok == '[':
self.square += 1
elif tok == ']':
self.square -= 1
elif tok == '{':
self.curly += 1
elif tok == '}':
self.curly -= 1
elif tok == '(':
self.group += 1
elif tok == ')':
self.group -= 1
elif tok == '<':
self.ext += 1
elif tok == '>':
self.ext -= 1
elif tok == '?':
self.in_special = not self.in_special
elif tok == '(*':
self.in_comment = True
elif tok == '*)':
raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
else:
if tok == '(*':
raise Exception("Nested EBNF comment", tok, "in line", line)
elif tok == '*)':
self.in_comment = False
if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
raise Exception("Unbalanced BNF bracket", tok, "in line", line)
return self.optional()
def in_list(self):
@@ -234,9 +278,25 @@ class State:
def in_option(self):
return self.square > 0
def in_group(self):
return self.group > 0
def in_ext(self):
return self.ext > 0
def in_something(self):
if self.square > 0 or self.curly > 0 or self.group > 0 or self.ext > 0 or self.in_comment or self.in_special:
return True
return False
class Symbol:
def __init__(self, token, tp = p_ruleset, rules = None):
def __init__(self, token, tp = None, rules = None):
if tp == None:
if is_terminal(token) is not None:
tp = p_terminal
else:
tp = p_ruleset
self.tp = tp
self.token = token
self.name = tok2name(token)
@@ -261,22 +321,27 @@ class Symbol:
self.term = None
self.regex = tok2regex(self.token)
self.is_lexical_element = False
self.datatype = "std::string"
elif tp == p_lexical:
assert(len(self.rules) == 0)
self.datatype = 'std::string'
elif tp == p_special or tp == p_lexical:
if len(self.rules):
self.dump(ERR)
raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
self.term = None
self.regex = tok2regex(self.token)
self.regex = None
self.is_lexical_element = True
self.datatype = None
self.datatype = 'std::string'
elif tp == p_terminal:
assert(len(self.rules) == 0)
if len(self.rules):
slog(ERR, "rules = ", self.rules)
self.dump(ERR)
raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
self.term = self.token
self.regex = tok2regex(self.token)
self.is_lexical_element = False
self.datatype = None
else:
self.dump()
raise Exception("Tried to set production to unknown type", tp)
raise Exception("Tried to set symbol to unknown type", tp)
self.tp = tp
def str(self):
@@ -309,71 +374,133 @@ def split_list_by(l_, tok):
l = copy.deepcopy(l_)
return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
def split_list_by_regex(l_, regex):
l = copy.deepcopy(l_)
return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]
def grammar_parse_ebnf(content_):
# remove comments
def grammar_tokenize_ebnf(content):
r = []
c = ''
l = 0
in_comment = False
quoted = None
raw_tokens = re.split("([, ])", content_)
tokens = []
for t in raw_tokens:
t = t.strip()
if not len(t):
continue
if quoted:
if t == quoted: # FIXME: check backslash before
quoted = None
elif in_comment:
if t == '*)':
in_comment = False
continue
elif t == '(*':
in_comment = True
continue
elif t in [ '"', "'" ]:
quoted = t
tokens.append(t)
in_quote = None
for line in content.splitlines(True):
end = len(line) - 1
l += 1
tok = ''
p = -1
while p < end:
p += 1
if p < end and in_quote == None:
cand = line[p:p+2]
if cand == '(*':
if in_comment:
raise Exception("Syntax error in line", l, ": spurious comment closure")
in_comment = True
p += 1
continue
elif cand == '*)':
if not in_comment:
raise Exception("Syntax error in line", l, ": spurious comment opener")
in_comment = False
p += 1
continue
if in_comment:
continue
c = line[p]
if c in [ '"', "'" ]:
if in_quote is None:
in_quote = c
else:
if in_quote == c:
in_quote = None
if in_quote is not None:
tok += c
continue
if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
tok = tok.strip()
if len(tok):
r.append((tok, l))
tok = ''
if not c.isspace():
r.append((c, l))
continue
tok += c
tok = tok.strip()
if len(tok):
r.append((tok, l))
return r
def grammar_add_symbol(grammar, tok, rules):
assert(tok is not None)
if tok in grammar.keys():
s = grammar[tok]
else:
s = Symbol(tok, rules=rules)
grammar[tok] = s
if rules is not None:
slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
for rule in rules:
if not rule in s.rules:
s.rules.append(rule)
grammar[tok] = s
def grammar_parse_ebnf_tokens(tokens):
grammar = OrderedDict()
raw_productions = split_list_by(tokens, ';')
#slog(INFO, "raw_productions =", raw_productions)
for raw_production in raw_productions:
#slog(INFO, "raw_production =", '@'.join(raw_production))
raw_lhs_rhs = split_list_by(raw_production, '=')
#slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
assert(len(raw_lhs_rhs) == 2)
lhs = ' '.join(raw_lhs_rhs[0])
p = Symbol(lhs)
raw_rules = split_list_by(raw_lhs_rhs[1], '|')
#slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
for raw_rule in raw_rules:
slog(INFO, "raw_rule =", raw_rule)
rule_tokens = split_list_by_regex(raw_rule, ',{}\(\)\[\]')
#slog(INFO, "rule_tokens =", rule_tokens)
rule = []
for raw_tok in rule_tokens:
tok = cleanup_token(' '.join(raw_tok))
tp = t_target_lang
if is_terminal(tok) is not None:
if not tok in grammar.keys():
litp = Symbol(tok, p_terminal)
slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
grammar[tok] = litp
tp = t_target_lang
elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
tp = t_grammar
rule.append(RuleComp(tok, tp))
p.rules.append(rule)
slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
grammar[lhs] = p
state = State()
lhs = None
last = None
ruleset = []
rule = []
terminals = []
specials = []
for tok, line in tokens:
try:
state.update(tok, line)
if tok == '=':
lhs = last
continue
last = tok
if tok == ';':
ruleset.append(rule)
grammar_add_symbol(grammar, lhs, ruleset)
ruleset = []
rule = []
lhs = None
continue
if tok == ',':
continue
if tok == '|' and not state.in_something():
ruleset.append(rule)
rule = []
continue
if is_terminal(tok) and tok not in terminals:
terminals.append(tok)
elif state.in_special and tok not in specials:
specials.append(tok)
if lhs is not None:
rule.append(RuleComp(tok, line=line))
except Exception as err:
for t in tokens:
slog(ERR, t)
slog(ERR, "Unexpected error in line", line, ":", str(err))
raise
for s in terminals:
grammar_add_symbol(grammar, s, None)
grammar[s].set_type(p_terminal)
for s in specials:
grammar_add_symbol(grammar, s, None)
grammar[s].set_type(p_special)
return grammar
def grammar_parse_ebnf(content_):
tokens = grammar_tokenize_ebnf(content_)
grammar = grammar_parse_ebnf_tokens(tokens)
return grammar
def grammar_get_types(grammar):
types = dict()
for t, p in grammar.iteritems():
@@ -427,9 +554,9 @@ def grammar_fix_extensions(grammar, mode):
prefix = prefix[1:]
slog(INFO, "Found prefix", prefix)
if mode == mode_keep:
newrule.append(RuleComp('<', t_grammar))
newrule.append(RuleComp('<'))
newrule.append(RuleComp(prefix, t_target_lang))
newrule.append(RuleComp('>', t_grammar))
newrule.append(RuleComp('>'))
newrule.append(c)
elif mode == mode_discard:
prefix = ''
@@ -464,8 +591,9 @@ def grammar_unroll_lists(grammar):
listrule = []
prefix = None
s = State()
slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
for c in rule:
s.update(c.token)
s.update(c.token, c.line)
if c.token == '{':
continue
if c.token == '}':
@@ -614,7 +742,7 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None,
s = State()
for c in rule:
slog(INFO, indent, "testing token", c.token)
if c.tp == t_grammar and s.update(c.token):
if c.tp == t_grammar and s.update(c.token, 0):
continue
if c.tp != t_target_lang:
slog(INFO, indent, " token", c.token, "is not a VHDL token")
@@ -942,7 +1070,7 @@ def create_yacc(grammar):
for c in rule:
n += 1
if c.tp == t_grammar:
s.update(c.token)
s.update(c.token, 0)
continue
p = grammar[c.token]
#if is_terminal(c.token) is not None:
@@ -1015,8 +1143,8 @@ def create_lex(grammar):
for t, p in grammar.iteritems():
if p.term is not None:
# \. { return T_DOT; }
assert(p.term[0] == '"')
assert(p.term[-1] == '"')
assert p.term[0] in [ '"', "'" ], p.term
assert p.term[-1] in [ '"', "'" ], p.term
out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'
out += textwrap.dedent("""\