jw-python/tools/python/jwutils/grammar.py
Jan Lindemann 16ce7abd93 grammar.py and friends: Make test/grammar compile and start
Doesn't successfully parse grammartest.code yet; it errors out with a
syntax error on whitespace. But at least it compiles and starts.

Signed-off-by: Jan Lindemann <jan@janware.com>
2017-10-30 13:05:22 +01:00


#!/usr/bin/python
# -*- coding: utf-8 -*-
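# EBNF grammar toolkit: parses an ISO-style EBNF grammar, normalizes it
# (prefix extensions, list/option/alternative unrolling, whitespace-free
# symbol names, trimming/cutting of symbols) and generates Bison (.y),
# Flex (.l), C++ header (.h) or cleaned-up EBNF output via the "create"
# and "check" subcommands at the bottom of this file.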
import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod
import os.path
import jwutils
from jwutils import misc  # assumed home of the misc.pad() helper used by the generators below
from jwutils.log import *
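# Classification tags: t_* mark rule components as EBNF meta-syntax vs.
# target-language tokens, p_* mark symbol kinds (ruleset, terminal, literal,
# lexical element, special sequence), and mode_* select how "< prefix >"
# EBNF extensions are folded into the grammar.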
t_grammar = "grammar"
t_target_lang = "target"
p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"
mode_unroll = "unroll"
mode_concat = "concat"
mode_keep = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
member_prefix = ''
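# Punctuation terminals mapped to readable names; tok2sym() turns these into
# parser token symbols, e.g. "<=" becomes T_LTE.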
special_terminals = {
"`" : "BACKTICK",
"^" : "CARET",
"<" : "LT",
"<<" : "LEFT_SHIFT",
"<=" : "LTE",
"<=>" : "SPACE_SHIP",
"<>" : "NE",
"=" : "EQ",
"=>" : "EG",
">" : "GT",
">=" : "GE",
">>" : "RIGHT_SHIFT",
"|" : "PIPE",
"_" : "UNDERSCORE",
"," : "COMMA",
";" : "SEMICOLON",
":" : "COLON",
":=" : "DEFINE",
"?" : "QM",
"?<" : "QM_LT",
"?<=" : "QM_LE",
"?=" : "QM_EQ",
"?>" : "QM_GT",
"?>=" : "QM_GE",
"??" : "QM_QM",
"?/=" : "QM_DIV_EQ",
"/" : "DIV",
"/=" : "DIV_EQ",
"." : "DOT",
"\"" : "DQUOTE",
"'" : "QUOTE",
"(" : "LPAREN",
")" : "RPAREN",
"[" : "LBRACKET",
"]" : "RBRACKET",
"@" : "AT",
"*" : "ASTERISK",
"**" : "DASTERISK",
"\\" : "BACKSLASH",
"&" : "AMPERSAND",
"#" : "NUMBER_SIGN",
"+" : "PLUS",
"-" : "MINUS"
}
token_regexes = {
"PSL_Property_Declaration" : "property[ \t]+[^;]+;",
"PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;",
"PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;",
"PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
"PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}
quotechars = [ '"', "'" ]
def dump(obj):
for c, v in obj.iteritems():
slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))
def dump_grammar(prio, grammar):
caller = get_caller_pos()
for t, p in grammar.iteritems():
p.dump(prio, caller=caller)
def cleanup_token(tok):
tok = tok.strip()
if len(tok) == 0:
return None
if tok[0] == "'" and tok[-1] == "'":
tok = '"' + tok[1:-1] + '"'
return tok
def tok2ctype(tok):
if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]:
return t_grammar
return t_target_lang
def is_terminal(tok):
size = len(tok)
if size < 2:
return None
first = tok[0]
last = tok[-1]
if (not first in quotechars) and (not last in quotechars):
return None
if first != last:
raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
return tok[1:-1]
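# tok2name()/tok2sym() derive a readable name and (for terminals) a T_* parser
# symbol from a token; tok2regex() yields the lexer pattern, using the
# hand-written regexes in token_regexes for the PSL constructs.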
def tok2name(tok):
tok = cleanup_token(tok)
term = is_terminal(tok)
if term is not None:
if term in special_terminals.keys():
return special_terminals[term]
return term
return tok
def tok2sym(tok):
tok = cleanup_token(tok)
term = is_terminal(tok)
if term is not None:
if term in special_terminals.keys():
return "T_" + special_terminals[term].upper()
return "T_" + re.sub('[^a-zA-Z0-9]', '_', term).upper()
return tok
def tok2regex(tok):
if tok in token_regexes.keys():
return token_regexes[tok]
return re.escape(tok)
def format_rule(rule):
return ' '.join(c.str() for c in rule)
def format_rules(rules):
return ', '.join(format_rule(rule) for rule in rules)
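# Render one rule back as EBNF text, inserting the separating commas between
# payload tokens and around brackets but not between two meta-characters.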
def format_ebnf_rule(grammar, rule):
r = ""
last = None
for comp in rule:
if last is not None:
if comp.tp == t_grammar:
if last.tp == t_grammar:
pass
else:
if comp.token in [ '[', '(', '{', '<' ]:
r += ','
else:
if last.tp == t_grammar:
if last.token in [ ']', ')', '}', '>' ]: # was comp.token, which can never be a grammar char in this branch
r += ','
else:
r += ','
r += ' ' + comp.token
last = comp
if len(r) == 0:
return r
return r.strip()
def format_yacc_rule(rule):
r = ''
for c in rule:
if c.tp != t_target_lang:
slog(DEBUG, "ignoring non-target-language token", c.token, "in rule")
continue
r += tok2sym(c.token) + ' '
return r[:-1]
class SourceElement:
def __init__(self, token, line):
self.token = token
self.line = line
class RuleComp:
def __init__(self, token, tp = None, line=-1):
assert(token is not None)
# assert(token != '|')
self.token = token
if tp is None:
tp = tok2ctype(token)
self.tp = tp
slog(INFO, "creating rule component >" + self.str() + "<")
assert(token != "{ assignment")
self.line = line
def __eq__(self, rhs):
if self.token != rhs.token:
return False
if self.tp != rhs.tp:
return False
return True
def __ne__(self, rhs):
return not self.__eq__(rhs)
def str(self):
tp = 'u'
if self.tp == t_grammar:
tp = 'g'
elif self.tp == t_target_lang:
tp = 'l'
else:
tp = self.tp
return "{" + tp + ": " + self.token + "}"
class State:
def __init__(self):
self.__pair_square = ['[', ']']
self.__pair_curly = ['{', '}']
self.__pair_ext = ['<', '>']
self.__pair_group = ['(', ')']
self.__pair_comment = ['(*', '*)']
self.__pair_special = ['?', '?']
self.reset()
def reset(self):
self.curly = 0
self.square = 0
self.ext = 0
self.group = 0
self.in_comment = False
self.in_special = False
self.production = None
self.rule = []
self.rules = []
self.things = []
def optional(self):
return self.square != 0 or self.curly != 0
def update(self, tok, line):
if not self.in_comment:
if tok == '[':
self.square += 1
self.things.append(self.__pair_square)
elif tok == ']':
self.square -= 1
assert(self.things.pop() == self.__pair_square)
elif tok == '{':
self.curly += 1
self.things.append(self.__pair_curly)
elif tok == '}':
self.curly -= 1
assert(self.things.pop() == self.__pair_curly)
elif tok == '(':
self.group += 1
self.things.append(self.__pair_group)
elif tok == ')':
self.group -= 1
assert(self.things.pop() == self.__pair_group)
elif tok == '<':
self.ext += 1
self.things.append(self.__pair_ext)
elif tok == '>':
self.ext -= 1
assert(self.things.pop() == self.__pair_ext)
elif tok == '?':
if not self.in_special:
self.in_special = True
self.things.append(self.__pair_special)
else:
self.in_special = False
assert(self.things.pop() == self.__pair_special)
elif tok == '(*':
self.in_comment = True
self.things.append(self.__pair_comment)
elif tok == '*)':
raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
else:
if tok == '(*':
raise Exception("Nested EBNF comment", tok, "in line", line)
elif tok == '*)':
assert(self.things.pop() == self.__pair_comment)
self.in_comment = False
if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
raise Exception("Unbalanced BNF bracket", tok, "in line", line)
return self.optional()
def in_list(self):
return self.curly > 0
def in_option(self):
return self.square > 0
def in_group(self):
return self.group > 0
def in_ext(self):
return self.ext > 0
def in_something(self):
if len(self.things) == 0:
return None
return self.things[-1]
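# A grammar symbol: a production (ruleset), terminal, literal, lexical element
# or special sequence, together with the derived metadata (token, name, yacc
# symbol, lexer regex, C++ datatype) the generators need.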
class Symbol:
def __init__(self, token, tp = None, rules = None):
self.reset(token, tp, rules)
self.set_is_payload(True)
def reset(self, token, tp = None, rules = None):
if tp is None:
if is_terminal(token) is not None:
tp = p_terminal
else:
tp = p_ruleset
self.tp = tp
self.token = token
self.name = tok2name(token)
self.sym = tok2sym(token)
self.term = None
self.regex = None
self.is_lexical_element = False
self.rules = []
self.datatype = None
if rules is not None:
self.rules = rules
self.set_type(tp)
def set_is_payload(self, onoff):
self.is_payload = onoff
def set_type(self, tp):
if tp == p_ruleset:
self.term = None
self.regex = None
self.is_lexical_element = False
self.datatype = self.token + '_t'
elif tp == p_literal:
assert(len(self.rules) == 0)
self.term = None
self.regex = tok2regex(self.token)
self.is_lexical_element = False
self.datatype = 'std::string'
elif tp == p_special or tp == p_lexical:
if len(self.rules):
self.dump(ERR)
raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
self.term = None
self.regex = None
self.is_lexical_element = True
self.datatype = 'std::string'
elif tp == p_terminal:
if len(self.rules):
slog(ERR, "rules = ", self.rules)
self.dump(ERR)
raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
self.term = self.token
self.regex = tok2regex(self.token)
self.is_lexical_element = False
self.datatype = None
else:
self.dump()
raise Exception("Tried to set symbol to unknown type", tp)
self.tp = tp
def str(self):
r = self.name + ' = ' + format_rules(self.rules)
return r
def equals(self, rhs):
for k, v in self.__dict__.iteritems():
if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]:
slog(WARNING, k, self.__dict__[k], rhs.__dict__[k])
return False
return True
def dump(self, prio = NOTICE, msg="", caller=None):
if caller is None:
caller = get_caller_pos(1)
slog(prio, ",----------------", msg, caller=caller)
slog(prio, "| type =", self.tp, caller=caller)
slog(prio, "| name =", self.name, caller=caller)
slog(prio, "| token =", self.token, caller=caller)
slog(prio, "| sym =", self.sym, caller=caller)
slog(prio, "| term =", self.term, caller=caller)
slog(prio, "| regex =", self.regex, caller=caller)
slog(prio, "| datatype =", self.datatype, caller=caller)
slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
slog(prio, "| rules =", format_rules(self.rules), caller=caller)
slog(prio, "`----------------", msg, caller=caller)
def split_list_by(l_, tok):
l = copy.deepcopy(l_)
return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
def split_list_by_regex(l_, regex):
l = copy.deepcopy(l_)
return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]
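# Split EBNF source into (token, line) pairs: quoted strings are kept intact,
# (* ... *) comments are stripped, and the EBNF punctuation characters are
# emitted as individual tokens.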
def grammar_tokenize_ebnf(content):
r = []
c = ''
l = 0
in_comment = False
in_quote = None
for line in content.splitlines(True):
end = len(line) - 1
l += 1
tok = ''
p = -1
while p < end:
p += 1
if p < end and in_quote is None:
cand = line[p:p+2]
if cand == '(*':
if in_comment:
raise Exception("Syntax error in line", l, ": spurious comment closure")
in_comment = True
p += 1
continue
elif cand == '*)':
if not in_comment:
raise Exception("Syntax error in line", l, ": spurious comment opener")
in_comment = False
p += 1
continue
if in_comment:
continue
c = line[p]
if c in [ '"', "'" ]:
if in_quote is None:
in_quote = c
else:
if in_quote == c:
in_quote = None
if in_quote is not None:
tok += c
continue
if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
tok = tok.strip()
if len(tok):
r.append((tok, l))
tok = ''
if not c.isspace():
r.append((c, l))
continue
tok += c
tok = tok.strip()
if len(tok):
r.append((tok, l))
return r
def grammar_add_symbol(grammar, tok, rules):
assert(tok is not None)
if tok in grammar.keys():
s = grammar[tok]
else:
s = Symbol(tok, rules=rules)
grammar[tok] = s
if rules is not None:
slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
for rule in rules:
if not rule in s.rules:
s.rules.append(rule)
grammar[tok] = s
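# Build an OrderedDict of Symbol objects from the token stream: "lhs = ... ;"
# definitions, with a top-level '|' starting an alternative rule; quoted
# tokens are registered as terminals, ?...? contents as special sequences.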
def grammar_parse_ebnf_tokens(tokens):
grammar = OrderedDict()
state = State()
lhs = None
last = None
ruleset = []
rule = []
terminals = []
specials = []
for tok, line in tokens:
try:
state.update(tok, line)
if tok == '=':
lhs = last
continue
last = tok
if tok == ';':
ruleset.append(rule)
grammar_add_symbol(grammar, lhs, ruleset)
ruleset = []
rule = []
lhs = None
continue
if tok == ',':
continue
if tok == '|' and state.in_something() is None:
ruleset.append(rule)
rule = []
continue
if is_terminal(tok) and tok not in terminals:
terminals.append(tok)
elif state.in_special and tok not in specials:
specials.append(tok)
if lhs is not None:
rule.append(RuleComp(tok, line=line))
except Exception as err:
for t in tokens:
slog(ERR, t)
slog(ERR, "Unexpected error in line", line, ":", str(err))
raise
for s in terminals:
grammar_add_symbol(grammar, s, None)
grammar[s].set_type(p_terminal)
for s in specials:
slog(INFO, "found special sequence symbol", s)
grammar_add_symbol(grammar, s, None)
grammar[s].set_type(p_special)
return grammar
def grammar_parse_ebnf(content_):
tokens = grammar_tokenize_ebnf(content_)
grammar = grammar_parse_ebnf_tokens(tokens)
return grammar
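# For every production, collect per rule the non-terminal, payload-carrying
# member symbols; the result drives the %union/%type declarations in the .y
# file and the struct definitions in the generated header.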
def grammar_get_types(grammar):
types = dict()
for t, p in grammar.iteritems():
if not len(p.rules):
continue
if p.term is not None:
continue
ruleno = 1
rules = []
for rule in p.rules:
members = []
for c in rule:
if c.tp != t_target_lang:
continue
if not c.token in grammar.keys():
p.dump(ERR)
raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
pp = grammar[c.token]
if pp.tp is p_terminal:
continue
if not pp.is_payload:
continue
members.append(tok2sym(c.token))
if True or len(members):
rules.append(members)
if t in types.keys():
raise Exception("Tried to add type", t, "twice")
types[t] = rules
return types
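# Handle "< prefix > token" EBNF extensions according to the selected mode:
# keep them verbatim, discard the prefix, concatenate it onto the following
# token, or (unroll) additionally add a production for the concatenated name.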
def grammar_fix_extensions(grammar, mode):
for tok, p in grammar.items(): # items(), not iteritems(): mode_unroll may add symbols while iterating
newrules = []
for rule in p.rules:
newrule = []
prefix = ""
paren = 0
for c in rule:
if c.tp == t_grammar and c.token in ['<', '>']:
if c.token == '<':
paren += 1
elif c.token == '>':
paren -= 1
if paren <= 1: # don't add first level of control chars
continue
newrule.append(c)
continue
if paren > 0:
assert(len(c.token) != 0)
prefix += '_' + c.token
continue
if len(prefix) > 0:
prefix = prefix[1:]
slog(INFO, "Found prefix", prefix)
if mode == mode_keep:
newrule.append(RuleComp('<'))
newrule.append(RuleComp(prefix, t_target_lang))
newrule.append(RuleComp('>'))
newrule.append(c)
elif mode == mode_discard:
prefix = ''
continue
elif mode in [ mode_unroll, mode_concat ]:
combined = RuleComp(c.token, c.tp)
combined.token = prefix + c.token
prefix = ''
newrule.append(combined)
slog(INFO, "Appended new rule return value", combined.token)
if mode == mode_unroll:
if combined.token in grammar.keys():
continue
grammar[combined.token] = Symbol(combined.token, rules=[[c]])
else:
raise Exception("Invalid prefix mode", mode)
prefix = ''
continue
newrule.append(c)
if len(prefix): # undigested prefix, since it was the last
newrule.append(RuleComp(prefix[1:], t_target_lang))
newrules.append(newrule)
grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only
return grammar # TODO: not sure if this is necessary
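# Replace each "{ ... }" repetition by a reference to a helper production
# (named after its payload tokens plus "my_list") whose rules are the empty
# rule and the list body.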
def grammar_unroll_lists(grammar):
delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic
for tok, p in grammar.items(): # items(), not iteritems(): new list productions are added while iterating
newrules = []
for rule in p.rules:
newrule = []
listrule = []
prefix = None
s = State()
slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
for c in rule:
s.update(c.token, c.line)
if c.token == '{':
continue
if c.token == '}':
if len(listrule) == 0:
raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
name = ""
delpos = []
for i, lc in enumerate(listrule): # 'lc', not 'rule': don't shadow the enclosing loop variable
if lc.token in delimiters:
delpos.append(i)
continue
if lc.tp != t_target_lang:
continue
name += tok2name(lc.token) + "_"
# not really: there are lists without delimiters, too
#if len(delpos) != 1:
# p.dump(ERR)
# raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
name = name + "my_list"
newrule.append(RuleComp(name, t_target_lang))
p = Symbol(name, rules=[[], listrule])
#p = Symbol(name)
#p.rules = [ [], listrule ]
listrule = []
if name not in grammar.keys():
grammar[name] = p
continue
if not p.equals(grammar[name]):
grammar[name].dump(ERR, "old list production")
p.dump(ERR, "new list production")
raise Exception("List production expands to already taken name", name)
continue
if s.in_list():
listrule.append(c)
continue
newrule.append(c)
newrules.append(newrule)
grammar[tok].rules = newrules
return grammar
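# Expand "[ ... ]" options by emitting each rule once without and once with
# the optional part; nested options are handled by recursing until no option
# is left.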
def rules_unroll_options(rules):
r = []
found = False
slog(DEBUG, "unrolling", format_rules(rules))
for rule in rules:
square = 0
option = []
newrule = []
for i, c in enumerate(rule):
if c.tp == t_grammar:
if c.token == '[':
square += 1
elif c.token == ']':
square -= 1
if square == 1:
continue
if square >= 1:
option.append(c)
continue
slog(DEBUG, "square =", square)
assert(square == 0)
n = len(option)
if n == 0:
newrule.append(c)
continue
# first without option
replaced = newrule[:]
tail = rule[i+1:len(rule)]
slog(DEBUG, "i = ", i)
slog(DEBUG, "n = ", n)
slog(DEBUG, "rule = ", format_rule(rule))
slog(DEBUG, "tail = ", format_rule(tail))
slog(DEBUG, ",-------------------------")
slog(DEBUG, "head = ", format_rule(replaced))
replaced.extend(tail)
slog(DEBUG, "head + tail = ", format_rule(replaced))
r.append(replaced)
# then with option inserted
for unrolled in rules_unroll_options([ option ]):
replaced = newrule[:]
slog(DEBUG, ",-------------------------")
slog(DEBUG, "head = ", format_rule(replaced))
slog(DEBUG, "unrolled = ", format_rule(unrolled))
replaced.extend(unrolled)
slog(DEBUG, "head + unrolled =", format_rule(replaced))
replaced.extend(tail)
slog(DEBUG, "head + unrolled + tail =", format_rule(replaced))
r.append(replaced)
found = True
break
if not found:
r.append(newrule)
if found:
return rules_unroll_options(r)
return r
def grammar_unroll_options(grammar):
for tok, p in grammar.iteritems():
grammar[tok].rules = rules_unroll_options(p.rules)
return grammar
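# Expand "a | b" alternatives that occur inside a container ((), [], {}, <>)
# into one rule per branch; a bare '|' at this point is an error, since
# top-level alternatives were already split while parsing.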
def rules_unroll_alternatives(rules):
r = []
found = False
slog(INFO, "unrolling alternatives in", format_rules(rules))
sep = RuleComp('|')
for rule in rules:
if not sep in rule:
r.append(rule)
continue
found = True
state = State()
end = len(rule) - 1
first = last = -1
for i, c in enumerate(rule):
state.update(c.token, line=c.line)
if c.token != '|' or c.tp != t_grammar:
slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i)
continue
slog(INFO, "found token at position", i)
container = state.in_something()
slog(INFO, "thing delimiters are", container)
if container is None:
raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule))
first = last = -1
k = i - 1
while k >= 0:
prev = rule[k]
slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0])
if prev.token == container[0]:
first = k
break
k -= 1
if first == -1:
raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule))
k = i
while k <= end:
nxt = rule[k]
slog(INFO, "comparing token", rule[k].token, "at position", k, "against closer", container[1])
if nxt.token == container[1]:
last = k
break
k += 1
if last == i:
raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule))
break # found what I wanted
assert(first > 0)
assert(last > 0)
assert(last <= end)
head = rule[0:first]
mid = rule[first+1:last]
tail = rule[last+1:end]
slog(INFO, "first =", first, "last =", last, "end =", end)
slog(INFO, "head = ", format_rule(head))
slog(INFO, "mid = ", format_rule(mid))
slog(INFO, "tail = ", format_rule(tail))
for m in split_list_by(mid, sep):
unrolled_rule = head + m + tail
r.append(unrolled_rule)
#if found:
# return rules_unroll_alternatives(r)
return r
def grammar_unroll_alternatives(grammar):
for tok, p in grammar.iteritems():
grammar[tok].rules = rules_unroll_alternatives(p.rules)
return grammar
def grammar_replace_whitespace(grammar):
r = OrderedDict()
for tok, s in grammar.iteritems():
newrules = []
for rule in s.rules:
newrule = []
for c in rule:
newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line)
newrule.append(newc)
newrules.append(newrule)
newtok = tok.replace(' ', '_')
s.reset(newtok, tp=s.tp, rules=newrules)
r[newtok] = s
slog(INFO, "added symbol", newtok)
return r
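# step_out() recursively measures how many derivation steps it takes to reach
# a terminal starting from a symbol; sys.maxint means there is no way out,
# i.e. the symbol can never derive a finite sentence. grammar_check() runs
# this over all (or the requested) symbols.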
def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
if checked is None:
checked = set()
if found is None:
found = dict()
indent = ' ' * depth * 2
if tok in found.keys():
slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
return found[tok]
slog(INFO, indent + " + " + tok)
indent = indent + " "
if tok in terminals:
found[tok] = 1
slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
return 1
if tok in orphans:
found[tok] = 1
slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
return 1
#if tok in lexicals:
# found[tok] = 1
# slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
# return 1
if tok in checked:
slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
return sys.maxint
slog(INFO, indent, "checked =", ' '.join(checked))
checked.add(tok)
if tok not in grammar.keys():
slog(ERR, "tried to validate unknown token \"" + tok + "\"")
return sys.maxint
p = grammar[tok]
r = sys.maxint
slog(INFO, indent, p.token, "has", len(p.rules), "rules")
only_optional = True
for rule in p.rules:
slog(INFO, indent, "testing rule", format_rule(rule))
if tok in [ c.token for c in rule ]:
continue
mn = sys.maxint
mx = 0
s = State()
for c in rule:
slog(INFO, indent, "testing token", c.token)
if c.tp == t_grammar and s.update(c.token, 0):
continue
if c.tp != t_target_lang:
slog(INFO, indent, " token", c.token, "is not a VHDL token")
continue
only_optional = False
# same "found" argument in next call?
rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
if rr == sys.maxint or rr is None:
slog(INFO, indent, " got error for token", c.token)
mn = sys.maxint
mx = 0
break
if rr > mx:
slog(INFO, indent, " adjusting mx to", rr)
mx = rr
if rr < mn:
slog(INFO, indent, " adjusting mn to", rr)
mn = rr
if mn == sys.maxint or mx == 0: # unusable as escape route
slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule))
continue
slog(INFO, indent, "after checking all rules, mx is", mx)
if mx < r:
slog(INFO, indent, "setting return value to max", mx)
r = mx
if only_optional:
slog(INFO, indent, tok, "has only optional rules, accepting")
r = 0
if r != sys.maxint:
r += 1
slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
found[tok] = r
slog(INFO, indent, "returning", r, "for token", tok)
return r
def grammar_check(grammar, check_symbols = None):
terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
elements = set()
if check_symbols is None:
check_symbols = []
if len(check_symbols) == 0:
for tok, p in grammar.iteritems():
if p.is_lexical_element:
elements.add(tok)
continue
for rule in p.rules:
for c in rule:
if c.tp == t_grammar:
continue
elements.add(c.token)
check_symbols = sorted(list(elements))
found = dict()
for tok in check_symbols:
slog(INFO, "======= checking", tok)
rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
if rr == sys.maxint:
slog(ERR, "No way out for", tok)
exit(1)
if not tok in grammar.keys():
slog(ERR, "Token", tok, "has no production")
exit(1)
slog(INFO, tok, "->", str(rr))
def grammar_lhss_map(grammar):
r = dict()
for t in grammar.keys():
r[t] = set()
for t, p in grammar.iteritems():
for rule in p.rules:
for c in rule:
if c.tp == t_target_lang:
r[c.token].add(t)
return r
def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
lhss = dmap[rhs]
for lhs in lhss:
if lhs in buf:
continue
buf.add(lhs)
if lhs in stop:
slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion")
continue
if recursive:
do_grammar_lhss(dmap, stop, lhs, buf, recursive)
def grammar_lhss(dmap, stop, symbols, recursive = False):
r = set()
for s in symbols:
if s in r:
continue
do_grammar_lhss(dmap, stop, s, r, recursive)
return r
def do_grammar_rhss(grammar, stop, sym, buf):
p = grammar[sym]
for rule in p.rules:
for c in rule:
if c.tp != t_target_lang:
continue
if c.token in stop:
continue
if c.token in buf:
continue
buf.add(c.token)
do_grammar_rhss(grammar, stop, c.token, buf)
def grammar_rhss(grammar, stop, symbols):
r = set()
for s in symbols:
if s in r:
continue
do_grammar_rhss(grammar, stop, s, r)
return r
def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
if sym in stop:
return False
# Does this have to be recursive?
defined = grammar_lhss(dmap, stop, set([sym]))
slog(INFO, " symbol", sym, "defines:", ', '.join(defined))
if not len(defined):
return True
for d in defined:
if d in stop:
continue
if d in checked:
continue
checked.add(d)
if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
return True
return False
def do_grammar_unused(grammar, dmap, doomed):
r = set(doomed)
rhss = grammar_rhss(grammar, set(), doomed)
for rhs in rhss:
slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
slog(INFO, " symbol", rhs, "is not in use")
r.add(rhs)
slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
return r
def grammar_unused(grammar, dmap, doomed):
r = set(doomed)
while True:
unused = do_grammar_unused(grammar, dmap, r)
slog(INFO, "unused:", ', '.join(unused))
slog(INFO, "r: ", ', '.join(r))
if unused == r:
break
r |= unused
return r
# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
slog(INFO, "-------- removing symbols:", ', '.join(symbols))
dmap = grammar_lhss_map(grammar)
unused = grammar_unused(grammar, dmap, symbols)
for s in unused:
slog(INFO, " + removing symbol", s)
del grammar[s]
return grammar
# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
grammar_cut_symbols(grammar, symbols)
for s in symbols:
slog(INFO, " + adding empty production for symbol", s)
p = Symbol(s)
p.set_type(p_literal)
grammar[s] = p
return grammar
# flag symbols as non-payload
def grammar_irrelevant_symbols(grammar, symbols):
for s in symbols:
grammar[s].set_is_payload(False)
return grammar
def grammar_create_ebnf(grammar, opts):
indent = 40
slog(INFO, "creating ebnf from grammar of size", len(grammar))
out = ''
for t, p in grammar.iteritems():
slog(INFO, "formatting rule", t)
if not len(p.rules):
slog(INFO, "ignoring " + t + " (has no rules)\n")
continue
out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
for rule in p.rules[1:]:
out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
out += ' ' * indent + ' ;\n'
return out
def format_token(sym, tp):
return misc.pad('%token <' + sym + '>', 27) + misc.pad(sym, 20) + '/* ' + tp + ' */'
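# Emit the Bison (.y) input: %union/%type declarations derived from
# grammar_get_types(), one %token line per terminal/special/literal symbol,
# and one production per ruleset whose action fills the generated C++ struct.
# A production roughly comes out like this (illustration only, names invented):
#
#   expression:     T_LPAREN term T_RPAREN
#                   {
#                           $$->type = parser::expression::t_1;
#                           $$->data.r1.term = new term_t(*$2);
#                   }
#                   ;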
def grammar_create_y(grammar, opts):
indent = ' ' * 40
width = 0
for t, p in grammar.iteritems():
if p.term is not None:
continue
if len(t) > width:
width = len(t)
spaces = 0
while spaces < width:
spaces += 8
indent = '\t' * (spaces / 8)
out = ""
# preamble
out += textwrap.dedent("""\
%{
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdarg.h>
#include <vector>
#include <string>
""")
for f in opts['includes']:
out += '#include "' + f + '"' + '\n'
out += "\nusing namespace " + opts['namespace'] + ';\n'
out += textwrap.dedent("""\
using namespace std;
namespace {
typedef vector<const char *> wrap_t;
const wrap_t curly_braces{ "{", "}" };
const wrap_t round_braces{ "(", ")" };
}
#ifdef __cplusplus
// extern "C" {
#endif
%}
""")
# types
out += textwrap.dedent("""\
%union {
""")
types = grammar_get_types(grammar)
for t in types.keys():
out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
out += '\n'
out += textwrap.dedent("""\
}
""")
# yydecl
out += textwrap.dedent("""\
%{
// int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
YY_DECL;
%}
""")
# terminal tokens
out += '\n'
for t, p in grammar.iteritems():
if p.tp == p_terminal:
out += format_token(p.sym, t) +'\n'
# special tokens
out += '\n'
for t, p in grammar.iteritems():
if p.tp == p_special:
if p.token == '?': # TODO: why is this among the symbols anyway?
continue
out += format_token(p.sym, t) +'\n'
# regex tokens
out += '\n'
for t, p in grammar.iteritems():
if p.tp == p_literal:
out += format_token(p.sym, t) +'\n'
# types
out += '\n'
for t, p in grammar.iteritems():
if p.tp == p_ruleset:
out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'
out += textwrap.dedent("""\
%define parse.error verbose
%define api.pure full
%param { struct context *context } { void *scanner }
""")
# productions
out += '\n%%\n\n'
for t, p in grammar.iteritems():
if not len(p.rules):
continue
if p.tp == p_terminal:
continue
if p.tp == p_special:
continue
slog(INFO, "creating production for symbol", p.str())
#if p.is_lexical_element is True:
# continue
if len(p.rules) == 0:
raise Exception("Symbol ", p.str(), "has no rules")
first = True
n_rule = 0
for rule in p.rules:
n_rule += 1
n = 0
s = State()
if first:
out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
first = False
else:
out += indent + "| " + format_yacc_rule(rule) + "\n"
out += indent + "{" + "\n"
out += indent + "\t" + "$$->type = " + opts['namespace'] + '::' + t + "::t_" + str(n_rule) + ";\n"
tokens = []
for c in rule:
if c.tp == t_target_lang:
tokens.append(c.token)
idx = 0
for c in rule:
n += 1
if c.tp == t_grammar:
s.update(c.token, 0)
continue
p = grammar[c.token]
#if is_terminal(c.token) is not None:
# continue
if p.tp not in [ p_ruleset ]:
continue
if not p.is_payload:
continue
tp = tok2name(c.token)
suffix = ''
if tokens.count(c.token) > 1:
idx += 1
suffix = '_' + str(idx)
out += indent + "\t" + \
"$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
" = new " + p.datatype + "(*$" + str(n) + ");\n"
out += indent + "}" + "\n"
out += indent + ";\n\n"
# tail
out += '\n%%\n\n'
out += textwrap.dedent("""
#ifdef __cplusplus
// } /* extern "C" */
#endif
""")
return out + "\n"
def grammar_create_l(grammar, opts):
ignore = ""
out = textwrap.dedent("""\
%option reentrant
%option bison-bridge
%{
#include <slog.h>
""")
for f in opts['includes']:
out += '#include "' + f + '"' + '\n'
out += "\nusing namespace " + opts['namespace'] + ';\n'
#out += textwrap.dedent("""\
# /* This is too late in the Flex generated file to work. Still lots of
# * prototypes are spat into it above it, which end up with C++ linkage, of
# * course, causing the linkages to be inconsistent to the functions below this
# * extern "C". Only way I found was to use C++ is to use it on Bison only, and
# * have Flex use C instead. */
# #ifdef __cplusplus
# // extern "C" {
# #endif
# #ifdef _REMOVE_ME
# static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
# static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
# #endif
# %}
out += textwrap.dedent("""\
%}
%%
\\n { context->line++; }
""")
for t, p in grammar.iteritems():
if p.term is not None:
# \. { return T_DOT; }
assert p.term[0] in [ '"', "'" ], p.term
assert p.term[-1] in [ '"', "'" ], p.term
out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'
#out += textwrap.dedent("""\
#
# %{/* basic_identifier */%}
# %{/* extended_identifier */%}
# %{/* based_integer */%}
# %{/* bit_value */%}
# %{/* numeric_literal */%}
# %{/* enumeration_literal */%}
# %{/* string_literal */%}
# %{/* bit_string_literal */%}
# %{/* character_literal */%}
# %{/* graphic_character */%}
# %{/* basic_character */%}
# %{/* integer */%}
#
# """)
ignore += textwrap.dedent("""\
%{ /* not sure how to handle literals >> */ %}
\\"[ \\!#-~]*\\" |
\\'[0-1]\\' {
// get_string(yylval_param, yyscanner, 1);
/* Gets a string excluding " or ' */
int skip = 1;
int i;
for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
yytext[i] = 0;
YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
lv->txt=(char *)malloc(i+1);
strcpy(lv->txt, yytext+skip);
return STRING;
}
#[0-9a-f]*# {
// get_based_string(yylval_param, yyscanner, 1); /* skip leading # */
/* Gets a string excluding # */
int i;
int skip = 1;
for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
yytext[i] = 0;
YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
lv->txt = (char *)malloc(i+1);
strcpy(lv->txt, yytext + skip);
return BASED;
}
[a-zA-Z_$][a-zA-Z0-9_$.]* {
YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
lv->txt=(char *)malloc(strlen(yytext)+1);
strcpy(lv->txt, yytext);
return NAME;
}
[0-9]+ {
YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
sscanf(yytext, "%d", &lv->n);
return NATURAL;
}
""")
out += textwrap.dedent("""\
. {
return yytext[0];
}
%{/* not sure how to handle literals << */%}
%%
void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
{
struct yyguts_t *yyg =(struct yyguts_t*)scanner;
set_error(PRI_ERR, EINVAL, "%s at \\"%s\\" in line %d", msg, yytext, context->line);
}
int FB_SYM(wrap)(void *scanner)
{
return 1;
}
struct vp_scanner {
YY_BUFFER_STATE buf;
void *scanner;
char *str;
};
/* utilities which need to be placed here, because I can't find
* yylex_init() / _destroy() in any generated header file (??) */
struct vp_scanner *FB_SYM(init_scanner)(const char *str)
{
struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));
yylex_init(&r->scanner);
r->str = strdup(str);
r->buf = yy_scan_string(r->str, r->scanner);
FB_SYM(set_extra)(r, r->scanner);
// yyset_in(stdin, r->scanner);
// yyset_out(stdout, r->scanner);
return r;
}
void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner)
{
return scanner->scanner;
}
void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner)
{
free(scanner->str);
yy_delete_buffer(scanner->buf, scanner->scanner);
yylex_destroy(scanner->scanner);
free(scanner);
}
""")
# #ifdef __cplusplus
# // } // extern "C"
# #endif
#
# """)
return out
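# Emit the C++ header: forward declarations, typedefs, and one struct per
# production holding a type tag plus a union with one struct per rule.
# For a production  foo = bar, baz ;  the output looks roughly like this
# (illustration only):
#
#   struct foo {
#           struct r1_t {
#                   bar_t *bar;
#                   baz_t *baz;
#           };
#           enum { t_1, } type;
#           union { struct r1_t r1; } data;
#   };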
def grammar_create_h(grammar, opts):
out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
ns = opts['namespace']
if ns is not None:
out += 'namespace ' + ns + '{\n\n'
types = grammar_get_types(grammar)
# struct forward declarations
for t, members in types.iteritems():
if len(members):
out += '\nstruct ' + t + ';'
out += '\n'
# struct / non-struct typedefs
for t, members in types.iteritems():
if not len(members):
out += '\ntypedef const char ' + t + '_t;'
continue
out += '\ntypedef struct ' + t + ' ' + t + '_t;'
out += '\n'
# struct definitions
for t, rules in types.iteritems():
if not len(rules):
continue
out += '\n\nstruct ' + t + ' {\n'
# rule structs
n = 0
for rule in rules:
n += 1
idx = 0
out += '\n\tstruct ' + 'r' + str(n) + '_t {'
for m in rule:
suffix = ''
if rule.count(m) > 1:
idx += 1
suffix = '_' + str(idx)
p = grammar[m]
out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
out += '\n\t};'
# type enum
n = 0
out += '\n\n\tenum {'
for rule in rules:
n += 1
out += '\n\t\tt_' + str(n) + ','
out += '\n\t} type;'
out += '\n'
# data union
n = 0
out += '\n\tunion {'
for rule in rules:
n += 1
out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
out += '\n\t} data;'
# struct done
out += '\n};'
out += '\n'
if ns is not None:
out += '\n} /* namespace ' + ns + '*/'
out += textwrap.dedent("""\
struct vp_scanner;
struct vp_scanner *FB_SYM(init_scanner)(const char *str);
void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner);
void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner);
""")
out += '\n\n#endif /* #ifndef ' + opts['mip'] + ' */'
return out
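# Command-line layer on top of jwutils.Cmd: GrammarCmd holds the common
# grammar-processing options, CmdCreate parses the input EBNF and generates
# output chosen by the output file's extension (.ebnf/.y/.l/.h), CmdCheck
# only parses and validates.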
class GrammarCmd(jwutils.Cmd):
def __init__(self, name, help):
super(GrammarCmd, self).__init__(name, help=help)
def add_parser(self, parsers):
p = super(GrammarCmd, self).add_parser(parsers)
p.add_argument("input", help="input file")
p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False)
p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat)
p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False)
p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives', action='store_true', default=False)
p.add_argument('-w', '--replace-whitespace', help='replace white space in tokens by underscore characters', action='store_true', default=False)
p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='')
p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='')
p.add_argument('-r', '--irrelevant-symbols', help='exclude symbol from output payload', nargs='?', default='')
p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='')
return p
def processGrammar(self, args, grammar):
if args.fix_extensions not in fix_extensions_mode:
raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
grammar = grammar_fix_extensions(grammar, args.fix_extensions)
if args.unroll_alternatives:
grammar = grammar_unroll_alternatives(grammar)
if args.unroll_lists:
grammar = grammar_unroll_lists(grammar)
if args.unroll_options:
grammar = grammar_unroll_options(grammar)
if len(args.check_symbols):
check_symbols = []
if args.check_symbols == 'all':
args.check_symbols = ''
check_symbols = args.check_symbols.split(',') # comma-separated, as documented for --check-symbols
grammar_check(grammar, check_symbols)
if args.replace_whitespace:
grammar = grammar_replace_whitespace(grammar)
if len(args.trim_symbols):
grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(','))
if len(args.cut_symbols):
grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(','))
if len(args.irrelevant_symbols):
grammar = grammar_irrelevant_symbols(grammar, args.irrelevant_symbols.split(','))
return grammar
# ------------------------------------------------- TODO: clean this up >
class DerivedGrammarCmd(GrammarCmd):
def __init__(self, name, help):
super(DerivedGrammarCmd, self).__init__(name, help=help)
@abstractmethod
def _run(self, grammar):
pass
def _parse(self, contents):
return grammar_parse_ebnf(contents)
def add_parser(self, parsers):
p = super(DerivedGrammarCmd, self).add_parser(parsers)
return p
def run(self, args):
with open(args.input, 'r') as infile:
contents = infile.read()
grammar = self._parse(contents)
grammar = super(DerivedGrammarCmd, self).processGrammar(args, grammar)
self._run(args, grammar)
class CmdCreate(DerivedGrammarCmd):
def __init__(self):
super(CmdCreate, self).__init__("create", help="Create a file")
def add_parser(self, parsers):
p = super(CmdCreate, self).add_parser(parsers)
p.add_argument("output", help="output file")
p.add_argument('--namespace', help='namespace of generated AST', default='parser')
p.add_argument('--includes', help='list of header files to be #included in C/C++ implementation files', default='')
return p
def _run(self, args, grammar):
name, ext = os.path.splitext(args.output)
ext = ext.lstrip('.') # splitext() keeps the dot; the checks below expect plain 'h', 'y', 'l', ...
#cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
mip = None
if ext == 'h':
mip = args.namespace + re.sub(r'[-./]', '_', args.output).upper()
includes = args.includes.split(',')
# generated code breaks without this, not sure why
if ext == 'l':
tmp = []
for f in includes:
if not re.match('.*lex\..*\.h', f):
tmp.append(f)
includes = tmp
cmd = getattr(sys.modules[__name__], 'grammar_create_' + ext)
opts = {
"namespace" : args.namespace,
"includes" : includes,
"mip" : mip
}
out = cmd(grammar, opts)
print(out)
class CmdCheck(DerivedGrammarCmd):
def __init__(self):
super(CmdCheck, self).__init__("check", help="Check grammar")
def add_parser(self, parsers):
p = super(CmdCheck, self).add_parser(parsers)
return p
def _run(self, args, grammar):
pass