mirror of ssh://git.janware.com/srv/git/janware/proj/jw-python (synced 2026-01-15 09:53:32 +01:00)
Continue implementation of grammar.py
Signed-off-by: Jan Lindemann <jan@janware.com>
parent 5b76358238
commit 40e6add5ad
6 changed files with 366 additions and 44 deletions

grammar.py

@@ -6,6 +6,8 @@ import sys
 import re
 import lxml.etree as ET
 import textwrap
+import itertools
+import copy
 from collections import OrderedDict
 from abc import abstractmethod
 
@@ -83,21 +85,34 @@ token_regexes = {
     "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
 }
 
+quotechars = [ '"', "'" ]
+
 def dump(obj):
     for c, v in obj.iteritems():
         slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))
 
+def dump_grammar(prio, grammar):
+    for t, p in grammar.iteritems():
+        p.dump(prio)
+
+def cleanup_token(tok):
+    tok = tok.strip()
+    if len(tok) == 0:
+        return None
+    if tok[0] == "'" and tok[-1] == "'":
+        tok = '"' + tok[1:-1] + '"'
+    return tok
+
 def is_terminal(tok):
-    if not tok.startswith('"'):
+    size = len(tok)
+    if size < 2:
         return None
-    if not tok.endswith('"'):
-        raise Exception('Token "' + tok + '" isn\'t entirely enclosed in quotes, ends with "' + tok[-1:] + '"')
+    first = tok[0]
+    last = tok[-1]
+    if (not first in quotechars) and (not last in quotechars):
+        return None
+    if first != last:
+        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
     return tok[1:-1]
 
 def tok2name(tok):
 
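(For reference, not part of the commit: a quick sketch of what the reworked helpers accept and return, assuming the module-level quotechars above.)

    assert cleanup_token("  'begin'  ") == '"begin"'  # single quotes normalized to double
    assert cleanup_token("   ") is None               # empty/whitespace-only tokens are dropped
    assert is_terminal('"begin"') == 'begin'          # symmetric quotes: terminal, quotes stripped
    assert is_terminal('identifier') is None          # unquoted token: treated as a nonterminal
    # is_terminal('"oops\'') raises: quotes present but not symmetric
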
@@ -129,6 +144,29 @@ def format_rule(rule):
 def format_rules(rules):
     return ', '.join(format_rule(rule) for rule in rules)
 
+def format_ebnf_rule(grammar, rule):
+    r = ""
+    last = None
+    for comp in rule:
+        if last is not None:
+            if comp.tp == t_grammar:
+                if last.tp == t_grammar:
+                    pass
+                else:
+                    if comp.token in [ '[', '(', '{', '<' ]:
+                        r += ','
+            else:
+                if last.tp == t_grammar:
+                    if comp.token in [ ']', ')', '}', '>' ]:
+                        r += ','
+                else:
+                    r += ','
+        r += ' ' + comp.token
+        last = comp
+    if len(r) == 0:
+        return r
+    return r.strip()
+
 def format_yacc_rule(rule):
     r = ''
     for c in rule:
 
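(For reference, not part of the commit: the comma placement the new format_ebnf_rule produces — a comma between consecutive target-language tokens and before an opening grammar bracket, but none just inside the brackets.)

    # rule components:  term  {  "+"  term  }    (the braces are grammar-type)
    # formatted result: term, { "+", term }
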
@@ -140,7 +178,7 @@ class RuleComp:
     def __init__(self, token, tp):
         self.token = token
         self.tp = tp
-        slog(INFO, "creating rule >" + self.str() + "<")
+        slog(INFO, "creating rule component >" + self.str() + "<")
 
     def __eq__(self, rhs):
         if self.token != rhs.token:
 
@@ -153,7 +191,14 @@ class RuleComp:
         return not self.__eq__(rhs)
 
     def str(self):
-        return "{" + self.tp + ": " + self.token + "}"
+        tp = 'u'
+        if self.tp == t_grammar:
+            tp = 'g'
+        elif self.tp == t_target_lang:
+            tp = 'l'
+        else:
+            tp = self.tp
+        return "{" + tp + ": " + self.token + "}"
 
 class State:
 
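(For reference, not part of the commit: the single-letter type tags the reworked str() yields.)

    RuleComp('{', t_grammar).str()            # -> "{g: {}"
    RuleComp('ident', t_target_lang).str()    # -> "{l: ident}"
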
@@ -192,13 +237,13 @@ class Symbol:
     def __init__(self, token, tp = p_ruleset, rules = None):
         self.tp = tp
         self.token = token
         self.name = tok2name(token)
         self.sym = tok2sym(token)
         self.term = None
+        self.regex = None
         self.is_lexical_element = False
         self.rules = []
         self.datatype = None
         if rules is not None:
             self.rules = rules
         self.set_type(tp)
 
@@ -244,17 +289,82 @@ class Symbol:
         return True
 
     def dump(self, prio = NOTICE, msg=""):
         slog(prio, ",----------------", msg)
         slog(prio, "| type =", self.tp)
         slog(prio, "| name =", self.name)
+        slog(prio, "| token =", self.token)
         slog(prio, "| sym =", self.sym)
         slog(prio, "| term =", self.term)
+        slog(prio, "| regex =", self.regex)
+        slog(prio, "| datatype =", self.datatype)
         slog(prio, "| is_lexical_element =", self.is_lexical_element)
         slog(prio, "| rules =", format_rules(self.rules))
         slog(prio, "`----------------", msg)
 
+def split_list_by(l_, tok):
+    l = copy.deepcopy(l_)
+    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
+
+def grammar_parse_ebnf(content_):
+
+    # remove comments
+    in_comment = False
+    quoted = None
+    raw_tokens = re.split("([, ])", content_)
+    tokens = []
+    for t in raw_tokens:
+        t = t.strip()
+        if not len(t):
+            continue
+        if quoted:
+            if t == quoted: # FIXME: check backslash before
+                quoted = None
+        elif in_comment:
+            if t == '*)':
+                in_comment = False
+            continue
+        elif t == '(*':
+            in_comment = True
+            continue
+        elif t in [ '"', "'" ]:
+            quoted = t
+        tokens.append(t)
+
+    grammar = OrderedDict()
+    raw_productions = split_list_by(tokens, ';')
+    #slog(INFO, "raw_productions =", raw_productions)
+    for raw_production in raw_productions:
+        #slog(INFO, "raw_production =", '@'.join(raw_production))
+        raw_lhs_rhs = split_list_by(raw_production, '=')
+        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
+        assert(len(raw_lhs_rhs) == 2)
+        lhs = ' '.join(raw_lhs_rhs[0])
+        p = Symbol(lhs)
+        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
+        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
+        for raw_rule in raw_rules:
+            #slog(INFO, "raw_rule =", raw_rule)
+            rule_tokens = split_list_by(raw_rule, ',')
+            #slog(INFO, "rule_tokens =", rule_tokens)
+            rule = []
+            for raw_tok in rule_tokens:
+                tok = cleanup_token(' '.join(raw_tok))
+                tp = t_target_lang
+                if is_terminal(tok) is not None:
+                    if not tok in grammar.keys():
+                        litp = Symbol(tok, p_terminal)
+                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
+                        grammar[tok] = litp
+                    tp = t_target_lang
+                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
+                    tp = t_grammar
+                rule.append(RuleComp(tok, tp))
+            p.rules.append(rule)
+        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
+        grammar[lhs] = p
+
+    dump_grammar(INFO, grammar)
+    return grammar
+
 def grammar_get_types(grammar):
     types = dict()
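
(For reference, not part of the commit: split_list_by splits a token list on a separator and drops the separator itself; runs of separators yield no empty groups. grammar_parse_ebnf applies it in stages — productions on ';', left/right-hand side on '=', alternatives on '|', rule components on ','.)

    split_list_by(['a', '=', 'b', '|', 'c'], '|')  # -> [['a', '=', 'b'], ['c']]
    split_list_by([';', ';'], ';')                 # -> []
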
@@ -292,10 +402,10 @@ def grammar_fix_extensions(grammar, mode):
             prefix = ""
             paren = 0
             for c in rule:
-                if c.tp != t_target_lang:
-                    if c.token == '(':
+                if c.tp == t_grammar and c.token in ['<', '>']:
+                    if c.token == '<':
                         paren += 1
-                    elif c.token == ')':
+                    elif c.token == '>':
                         paren -= 1
                         if paren <= 1: # don't add first level of control chars
                             continue
 
@@ -309,9 +419,9 @@ def grammar_fix_extensions(grammar, mode):
                 prefix = prefix[1:]
                 slog(INFO, "Found prefix", prefix)
                 if mode == mode_keep:
-                    newrule.append(RuleComp('(', t_grammar))
+                    newrule.append(RuleComp('<', t_grammar))
                     newrule.append(RuleComp(prefix, t_target_lang))
-                    newrule.append(RuleComp(')', t_grammar))
+                    newrule.append(RuleComp('>', t_grammar))
                     newrule.append(c)
                 elif mode == mode_discard:
                     prefix = ''
 
@@ -362,8 +472,12 @@ def grammar_unroll_lists(grammar):
             if rule.tp != t_target_lang:
                 continue
             name += tok2name(rule.token) + "_"
-        if len(delpos) != 1:
-            raise Exception("need exactly one delimiter in list rule:", ' '.join(listrule))
+
+        # not really: there are lists without delimiters, too
+        #if len(delpos) != 1:
+        #    p.dump(ERR)
+        #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
+
         name = name + "my_list"
         newrule.append(RuleComp(name, t_target_lang))
         p = Symbol(name, rules=[[], listrule])
 
@@ -447,15 +561,6 @@ def grammar_unroll_options(grammar):
         grammar[tok].rules = rules_unroll_options(p.rules)
     return grammar
 
-def format_ebnf_rule(grammar, rule):
-    r = ""
-    for comp in rule:
-        if comp.tp == t_grammar:
-            r = r + " " + comp.token
-            continue
-        r = r + " " + comp.token
-    return r.strip()
-
 def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
     if checked is None:
         checked = set()
 
@@ -538,14 +643,14 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
     slog(INFO, indent, "returning", r, "for token", tok)
     return r
 
-def grammar_check(grammar, selements = None):
-    if selements is None:
-        selements = []
+def grammar_check(grammar, check_symbols = None):
     terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
     orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
     lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
     elements = set()
-    if len(selements) == 0:
+    if check_symbols is None:
+        check_symbols = []
+    if len(check_symbols) == 0:
         for tok, p in grammar.iteritems():
             if p.is_lexical_element:
                 elements.add(tok)
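
(For reference, not part of the commit: the None default kept here is the usual guard against Python's shared-mutable-default pitfall, which a list default would trigger.)

    def broken(acc=[]):   # the default list is created once and shared by all calls
        acc.append(1)
        return acc

    broken()              # -> [1]
    broken()              # -> [1, 1]  -- state leaks across calls
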
@@ -555,9 +660,9 @@ def grammar_check(grammar, selements = None):
             if c.tp == t_grammar:
                 continue
             elements.add(c.token)
-        selements = sorted(list(elements))
+        check_symbols = sorted(list(elements))
     found = dict()
-    for tok in selements:
+    for tok in check_symbols:
         slog(INFO, "======= checking", tok)
         rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
         if rr == sys.maxint:
 
@@ -683,14 +788,18 @@ def grammar_trim_symbols(grammar, symbols):
 
 def create_ebnf(grammar):
     indent = 40
     slog(INFO, "creating ebnf from grammar of size", len(grammar))
     out = ''
     for t, p in grammar.iteritems():
         slog(INFO, "formatting rule", t)
         if not len(p.rules):
             slog(INFO, "ignoring " + t + " (has no rules)\n")
             continue
-        out = t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0])
+        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
         for rule in p.rules[1:]:
-            out += "\n" + ' ' * indent + " | " + format_ebnf_rule(grammar, rule)
-    return out + "\n"
+            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
+        out += ' ' * indent + ' ;\n'
+    return out
 
 def create_yacc(grammar):
     indent = ' ' * 40
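
(For reference, not part of the commit: the output shape the create_ebnf fix produces — every line newline-terminated, alternatives aligned under the first rule, and each production closed with ';'. Production names are hypothetical and spacing approximate.)

    expression                               = term, { "+", term }
                                             | term
                                             ;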