Mirror of ssh://git.janware.com/srv/git/janware/proj/jw-python, synced 2026-01-15 01:52:56 +01:00
grammar.py: Add grammar_parse_ebnf_tokens()
Add grammar_parse_ebnf_tokens(), to be used by external grammar parsers and by grammar_parse_ebnf().

Signed-off-by: Jan Lindemann <jan@janware.com>
parent 860f7d8cab
commit 432d78cdc5
4 changed files with 216 additions and 91 deletions
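
The change splits the old one-shot grammar_parse_ebnf() into a tokenizer, grammar_tokenize_ebnf(), and a token-level parser, grammar_parse_ebnf_tokens(), keeping grammar_parse_ebnf() as a thin wrapper over the two. A minimal usage sketch of the new entry points follows; it is not part of the commit, the EBNF line is taken from the test grammar in the diff, and the module is assumed to be importable as jwutils.grammar, as the GrammarCmd tool below does.

    # Illustrative sketch only: exercises the new split API added by this commit.
    import jwutils.grammar as jwgrammar  # module path as used by the GrammarCmd tool

    ebnf = 'number = [ "-" ], digit, { digit } ;\n'

    # Existing callers keep the one-step convenience wrapper:
    grammar = jwgrammar.grammar_parse_ebnf(ebnf)

    # External grammar parsers can now work at the token level: the tokenizer
    # yields (token, line) pairs, which may be inspected or rewritten before
    # the grammar (an OrderedDict of Symbol objects) is built from them.
    tokens = jwgrammar.grammar_tokenize_ebnf(ebnf)
    filtered = [(tok, line) for tok, line in tokens]  # hypothetical pre-processing step
    grammar = jwgrammar.grammar_parse_ebnf_tokens(filtered)

    symbol_names = list(grammar.keys())  # productions and terminals keyed by token
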
@@ -1,5 +1,8 @@
TOPDIR = ../..

USE_PROJECT_LIB = true
MEMBERS += local.a($(OBJ))

GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h

# These types are meant to be cut off the tree and turned into hand coded flex

@@ -47,12 +47,6 @@ class GrammarCmd(jwutils.grammar.GrammarCmd):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = jwutils.grammar.grammar_parse_ebnf(contents)

        slog(INFO, "grammar size is", len(grammar))
        for t in grammar.keys():
            slog(INFO, "key =", t)
        slog(INFO, "grammar size is", len(grammar))
        jwutils.grammar.dump_grammar(INFO, grammar)
        grammar = super(GrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)

@@ -5,7 +5,7 @@
'END.' ;
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters - '"' }, '"' ;
string = '"' , { all characters }, '"' ;
assignment = identifier , ":=" , ( number | identifier | string ) ;
alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
                     | "H" | "I" | "J" | "K" | "L" | "M" | "N"

@@ -22,6 +22,7 @@ p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"

mode_unroll = "unroll"
mode_concat = "concat"

@@ -104,6 +105,11 @@ def cleanup_token(tok):
        tok = '"' + tok[1:-1] + '"'
    return tok

def tok2ctype(tok):
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?' ]:
        return t_grammar
    return t_target_lang

def is_terminal(tok):
    size = len(tok)
    if size < 2:

@@ -174,13 +180,24 @@ def format_yacc_rule(rule):
        r += tok2sym(c.token) + ' '
    return r[:-1]

class SourceElement:

    def __init__(self, token, line):
        self.token = token
        self.line = line

class RuleComp:

    def __init__(self, token, tp):
    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        if self.token != rhs.token:

@@ -205,27 +222,54 @@ class RuleComp:
class State:

    def __init__(self):
        self.curly = 0
        self.square = 0
        self.reset()

    def reset(self):
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok):
        if tok == '[':
            self.square += 1
        elif tok == ']':
            self.square -= 1
        elif tok == '{':
            self.curly += 1
        elif tok == '}':
            self.curly -= 1
        if self.curly < 0 or self.square < 0:
            raise Exception("Unbalanced BNF bracket", tok)
    def update(self, tok, line):
        if not self.in_comment:
            if tok == '[':
                self.square += 1
            elif tok == ']':
                self.square -= 1
            elif tok == '{':
                self.curly += 1
            elif tok == '}':
                self.curly -= 1
            elif tok == '(':
                self.group += 1
            elif tok == ')':
                self.group -= 1
            elif tok == '<':
                self.ext += 1
            elif tok == '>':
                self.ext -= 1
            elif tok == '?':
                self.in_special = not self.in_special
            elif tok == '(*':
                self.in_comment = True
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                self.in_comment = False

        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):

@@ -234,9 +278,25 @@ class State:
    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        if self.square > 0 or self.curly > 0 or self.group > 0 or self.ext > 0 or self.in_comment or self.in_special:
            return True
        return False

class Symbol:

    def __init__(self, token, tp = p_ruleset, rules = None):
    def __init__(self, token, tp = None, rules = None):
        if tp == None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)

@@ -261,22 +321,27 @@ class Symbol:
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = "std::string"
        elif tp == p_lexical:
            assert(len(self.rules) == 0)
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = tok2regex(self.token)
            self.regex = None
            self.is_lexical_element = True
            self.datatype = None
            self.datatype = 'std::string'
        elif tp == p_terminal:
            assert(len(self.rules) == 0)
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set production to unknown type", tp)
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):

@@ -309,71 +374,133 @@ def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]


def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]

def grammar_parse_ebnf(content_):

    # remove comments
def grammar_tokenize_ebnf(content):
    r = []
    c = ''
    l = 0
    in_comment = False
    quoted = None
    raw_tokens = re.split("([, ])", content_)
    tokens = []
    for t in raw_tokens:
        t = t.strip()
        if not len(t):
            continue
        if quoted:
            if t == quoted: # FIXME: check backslash before
                quoted = None
        elif in_comment:
            if t == '*)':
                in_comment = False
            continue
        elif t == '(*':
            in_comment = True
            continue
        elif t in [ '"', "'" ]:
            quoted = t
        tokens.append(t)
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote == None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c

        tok = tok.strip()
        if len(tok):
            r.append((tok, l))
    return r

def grammar_add_symbol(grammar, tok, rules):
    assert(tok is not None)
    if tok in grammar.keys():
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
    grammar[tok] = s

def grammar_parse_ebnf_tokens(tokens):
    grammar = OrderedDict()
    raw_productions = split_list_by(tokens, ';')
    #slog(INFO, "raw_productions =", raw_productions)
    for raw_production in raw_productions:
        #slog(INFO, "raw_production =", '@'.join(raw_production))
        raw_lhs_rhs = split_list_by(raw_production, '=')
        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
        assert(len(raw_lhs_rhs) == 2)
        lhs = ' '.join(raw_lhs_rhs[0])
        p = Symbol(lhs)
        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
        for raw_rule in raw_rules:
            slog(INFO, "raw_rule =", raw_rule)
            rule_tokens = split_list_by_regex(raw_rule, ',{}\(\)\[\]')
            #slog(INFO, "rule_tokens =", rule_tokens)
            rule = []
            for raw_tok in rule_tokens:
                tok = cleanup_token(' '.join(raw_tok))
                tp = t_target_lang
                if is_terminal(tok) is not None:
                    if not tok in grammar.keys():
                        litp = Symbol(tok, p_terminal)
                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
                        grammar[tok] = litp
                    tp = t_target_lang
                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
                    tp = t_grammar
                rule.append(RuleComp(tok, tp))
            p.rules.append(rule)
        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
        grammar[lhs] = p
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and not state.in_something():
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
            exit(1)
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)

    return grammar

def grammar_parse_ebnf(content_):
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar

def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():

@@ -427,9 +554,9 @@ def grammar_fix_extensions(grammar, mode):
            prefix = prefix[1:]
            slog(INFO, "Found prefix", prefix)
            if mode == mode_keep:
                newrule.append(RuleComp('<', t_grammar))
                newrule.append(RuleComp('<'))
                newrule.append(RuleComp(prefix, t_target_lang))
                newrule.append(RuleComp('>', t_grammar))
                newrule.append(RuleComp('>'))
                newrule.append(c)
            elif mode == mode_discard:
                prefix = ''

@@ -464,8 +591,9 @@ def grammar_unroll_lists(grammar):
        listrule = []
        prefix = None
        s = State()
        slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
        for c in rule:
            s.update(c.token)
            s.update(c.token, c.line)
            if c.token == '{':
                continue
            if c.token == '}':

@@ -614,7 +742,7 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None,
    s = State()
    for c in rule:
        slog(INFO, indent, "testing token", c.token)
        if c.tp == t_grammar and s.update(c.token):
        if c.tp == t_grammar and s.update(c.token, 0):
            continue
        if c.tp != t_target_lang:
            slog(INFO, indent, " token", c.token, "is not a VHDL token")

@@ -942,7 +1070,7 @@ def create_yacc(grammar):
    for c in rule:
        n += 1
        if c.tp == t_grammar:
            s.update(c.token)
            s.update(c.token, 0)
            continue
        p = grammar[c.token]
        #if is_terminal(c.token) is not None:

@@ -1015,8 +1143,8 @@ def create_lex(grammar):
    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert(p.term[0] == '"')
            assert(p.term[-1] == '"')
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    out += textwrap.dedent("""\
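
For reference, the token stream that grammar_parse_ebnf_tokens() consumes is a list of (token, line) pairs. The sketch below is a hand-derived illustration of what grammar_tokenize_ebnf() appears to emit for one rule from the test grammar, traced from the tokenizer hunk above rather than captured from the tool, so it should be verified against the real implementation.

    # Hand-derived illustration of the (token, line) pairs grammar_tokenize_ebnf()
    # appears to emit for a single EBNF rule.
    src = 'number = [ "-" ], digit, { digit } ;\n'
    # grammar_tokenize_ebnf(src) should yield approximately:
    #   [('number', 1), ('=', 1), ('[', 1), ('"-"', 1), (']', 1), (',', 1),
    #    ('digit', 1), (',', 1), ('{', 1), ('digit', 1), ('}', 1), (';', 1)]
    # Quoted literals keep their quotes, structural characters such as
    # '[', ']', '{', '}', ',' and ';' become single-character tokens, and every
    # token carries the 1-based number of the line it came from.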