mirror of ssh://git.janware.com/srv/git/janware/proj/jw-python
synced 2026-01-15 01:52:56 +01:00
#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod

import jwutils

from jwutils.log import *

t_grammar = "grammar"
t_target_lang = "target"

p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"

mode_unroll = "unroll"
mode_concat = "concat"
mode_keep = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

member_prefix = ''

special_terminals = {
    "`" : "BACKTICK",
    "^" : "CARET",
    "<" : "LT",
    "<<" : "LEFT_SHIFT",
    "<=" : "LTE",
    "<=>" : "SPACE_SHIP",
    "<>" : "NE",
    "=" : "EQ",
    "=>" : "EG",
    ">" : "GT",
    ">=" : "GE",
    ">>" : "RIGHT_SHIFT",
    "|" : "PIPE",
    "_" : "UNDERSCORE",
    "," : "COMMA",
    ";" : "SEMICOLON",
    ":" : "COLON",
    ":=" : "DEFINE",
    "?" : "QM",
    "?<" : "QM_LT",
    "?<=" : "QM_LE",
    "?=" : "QM_EQ",
    "?>" : "QM_GT",
    "?>=" : "QM_GE",
    "??" : "QM_QM",
    "?/=" : "QM_DIV_EQ",
    "/" : "DIV",
    "/=" : "DIV_EQ",
    "." : "DOT",
    "\"" : "DQUOTE",
    "'" : "QUOTE",
    "(" : "LPAREN",
    ")" : "RPAREN",
    "[" : "LBRACKET",
    "]" : "RBRACKET",
    "@" : "AT",
    "*" : "ASTERISK",
    "**" : "DASTERISK",
    "\\" : "BACKSLASH",
    "&" : "AMPERSAND",
    "#" : "NUMBER_SIGN",
    "+" : "PLUS",
    "-" : "MINUS"
}

token_regexes = {
    "PSL_Property_Declaration" : "property[ \t]+[^;]+;",
    "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;",
    "PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;",
    "PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
    "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]

def dump(obj):
    for c, v in obj.iteritems():
        slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))

def dump_grammar(prio, grammar):
    caller = get_caller_pos()
    for t, p in grammar.iteritems():
        p.dump(prio, caller=caller)

def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok

def is_terminal(tok):
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]

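# Usage sketch (illustrative) for cleanup_token()/is_terminal():
#   cleanup_token(" 'begin' ")  -> '"begin"'   (single quotes normalized to double)
#   is_terminal('"begin"')      -> 'begin'     (quoted token: the bare text is returned)
#   is_terminal('identifier')   -> None        (unquoted tokens are not terminals)
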
def tok2name(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
            return special_terminals[term]
        return term
    return tok

def tok2sym(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
            return "T_" + special_terminals[term].upper()
        return "T_" + term.upper()
    return tok

def tok2regex(tok):
    if tok in token_regexes.keys():
        return token_regexes[tok]
    return re.escape(tok)

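# Mapping sketch (illustrative), based on special_terminals and token_regexes above:
#   tok2name('"<="')  -> 'LTE'     tok2sym('"<="')  -> 'T_LTE'
#   tok2name('"and"') -> 'and'     tok2sym('"and"') -> 'T_AND'
#   tok2name('rule')  -> 'rule'    tok2sym('rule')  -> 'rule'   (non-terminals pass through)
#   tok2regex('PSL_Directive') returns the table entry; any other token is re.escape()d.
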
def format_rule(rule):
    return ' '.join(c.str() for c in rule)

def format_rules(rules):
    return ', '.join(format_rule(rule) for rule in rules)

def format_ebnf_rule(grammar, rule):
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    if comp.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                    r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()

def format_yacc_rule(rule):
    r = ''
    for c in rule:
        r += tok2sym(c.token) + ' '
    return r[:-1]

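# Output sketch (illustrative): format_yacc_rule() on a rule whose component
# tokens are  expression '"+"' term  yields the string "expression T_PLUS term",
# since quoted terminals are mapped through tok2sym() and the rest pass through.
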
class RuleComp:

    def __init__(self, token, tp):
        self.token = token
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")

    def __eq__(self, rhs):
        if self.token != rhs.token:
            return False
        if self.tp != rhs.tp:
            return False
        return True

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def str(self):
        tp = 'u'
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"

class State:

    def __init__(self):
        self.curly = 0
        self.square = 0

    def reset(self):
        self.curly = 0
        self.square = 0

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok):
        if tok == '[':
            self.square += 1
        elif tok == ']':
            self.square -= 1
        elif tok == '{':
            self.curly += 1
        elif tok == '}':
            self.curly -= 1
        if self.curly < 0 or self.square < 0:
            raise Exception("Unbalanced BNF bracket", tok)
        return self.optional()

    def in_list(self):
        return self.curly > 0

    def in_option(self):
        return self.square > 0

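# Tracking sketch (illustrative): feeding the tokens '{', 'x', '}' through
# State.update() moves curly from 0 to 1 and back to 0, so in_list() is True
# only while the braced list is open; a surplus ']' or '}' raises the
# "Unbalanced BNF bracket" exception.
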
class Symbol:

    def __init__(self, token, tp = p_ruleset, rules = None):
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)

    def set_type(self, tp):
        if tp == p_ruleset:
            self.term = None
            self.regex = None
            self.is_lexical_element = False
            self.datatype = self.token + '_t'
        elif tp == p_literal:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = "std::string"
        elif tp == p_lexical:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = True
            self.datatype = None
        elif tp == p_terminal:
            assert(len(self.rules) == 0)
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set production to unknown type", tp)
        self.tp = tp

    def str(self):
        r = self.name + ' = ' + format_rules(self.rules)
        return r

    def equals(self, rhs):
        for k, v in self.__dict__.iteritems():
            if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]:
                slog(WARNING, k, self.__dict__[k], rhs.__dict__[k])
                return False
        return True

    def dump(self, prio = NOTICE, msg="", caller=None):
        if caller is None:
            caller = get_caller_pos(1)
        slog(prio, ",----------------", msg, caller=caller)
        slog(prio, "| type =", self.tp, caller=caller)
        slog(prio, "| name =", self.name, caller=caller)
        slog(prio, "| token =", self.token, caller=caller)
        slog(prio, "| sym =", self.sym, caller=caller)
        slog(prio, "| term =", self.term, caller=caller)
        slog(prio, "| regex =", self.regex, caller=caller)
        slog(prio, "| datatype =", self.datatype, caller=caller)
        slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
        slog(prio, "| rules =", format_rules(self.rules), caller=caller)
        slog(prio, "`----------------", msg, caller=caller)

def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]


def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]

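# Splitting sketch (illustrative): with tokens = ['a', '=', 'b', '|', 'c'],
# split_list_by(tokens, '|') yields [['a', '=', 'b'], ['c']]; the separator is
# dropped and the input list is left untouched (it is deep-copied first).
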
def grammar_parse_ebnf(content_):

    # remove comments
    in_comment = False
    quoted = None
    raw_tokens = re.split("([, ])", content_)
    tokens = []
    for t in raw_tokens:
        t = t.strip()
        if not len(t):
            continue
        if quoted:
            if t == quoted: # FIXME: check backslash before
                quoted = None
        elif in_comment:
            if t == '*)':
                in_comment = False
            continue
        elif t == '(*':
            in_comment = True
            continue
        elif t in [ '"', "'" ]:
            quoted = t
        tokens.append(t)

    grammar = OrderedDict()
    raw_productions = split_list_by(tokens, ';')
    #slog(INFO, "raw_productions =", raw_productions)
    for raw_production in raw_productions:
        #slog(INFO, "raw_production =", '@'.join(raw_production))
        raw_lhs_rhs = split_list_by(raw_production, '=')
        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
        assert(len(raw_lhs_rhs) == 2)
        lhs = ' '.join(raw_lhs_rhs[0])
        p = Symbol(lhs)
        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
        for raw_rule in raw_rules:
            slog(INFO, "raw_rule =", raw_rule)
            rule_tokens = split_list_by_regex(raw_rule, ',{}\(\)\[\]')
            #slog(INFO, "rule_tokens =", rule_tokens)
            rule = []
            for raw_tok in rule_tokens:
                tok = cleanup_token(' '.join(raw_tok))
                tp = t_target_lang
                if is_terminal(tok) is not None:
                    if not tok in grammar.keys():
                        litp = Symbol(tok, p_terminal)
                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
                        grammar[tok] = litp
                    tp = t_target_lang
                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
                    tp = t_grammar
                rule.append(RuleComp(tok, tp))
            p.rules.append(rule)
        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
        grammar[lhs] = p

    return grammar

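# Result sketch (illustrative): grammar_parse_ebnf() returns an OrderedDict
# mapping each left-hand side (and each quoted terminal found on a right-hand
# side) to a Symbol; e.g. for the production "design_file = design_unit ;" the
# result holds a Symbol keyed 'design_file' with a single one-component rule.
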
def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        ruleno = 1
        rules = []
        for rule in p.rules:
            members = []
            for c in rule:
                if c.tp != t_target_lang:
                    continue
                if not c.token in grammar.keys():
                    p.dump(ERR)
                    raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
                pp = grammar[c.token]
                if pp.tp is p_terminal:
                    continue
                members.append(tok2sym(c.token))
            if True or len(members):
                rules.append(members)
        if t in types.keys():
            raise Exception("Tried to add type", t, "twice")
        types[t] = rules
    return types

def grammar_fix_extensions(grammar, mode):
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            prefix = ""
            paren = 0
            for c in rule:
                if c.tp == t_grammar and c.token in ['<', '>']:
                    if c.token == '<':
                        paren += 1
                    elif c.token == '>':
                        paren -= 1
                    if paren <= 1: # don't add first level of control chars
                        continue
                    newrule.append(c)
                    continue
                if paren > 0:
                    assert(len(c.token) != 0)
                    prefix += '_' + c.token
                    continue
                if len(prefix) > 0:
                    prefix = prefix[1:]
                    slog(INFO, "Found prefix", prefix)
                    if mode == mode_keep:
                        newrule.append(RuleComp('<', t_grammar))
                        newrule.append(RuleComp(prefix, t_target_lang))
                        newrule.append(RuleComp('>', t_grammar))
                        newrule.append(c)
                    elif mode == mode_discard:
                        prefix = ''
                        continue
                    elif mode in [ mode_unroll, mode_concat ]:
                        combined = RuleComp(c.token, c.tp)
                        combined.token = prefix + c.token
                        prefix = ''
                        newrule.append(combined)
                        slog(INFO, "Appended new rule return value", combined.token)
                        if mode == mode_unroll:
                            if combined.token in grammar.keys():
                                continue
                            grammar[combined.token] = Symbol(combined.token, rules=[[c]])
                    else:
                        raise Exception("Invalid prefix mode", mode)
                    prefix = ''
                    continue
                newrule.append(c)
            if len(prefix): # undigested prefix, since it was the last
                newrule.append(RuleComp(prefix[1:], t_target_lang))
            newrules.append(newrule)
        grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only
    return grammar # TODO: not sure if this is necessary

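# Mode sketch (illustrative) for grammar_fix_extensions(): in a rule fragment
# "< label > statement" the tokens between '<' and '>' are collected as a
# prefix for the following symbol.  mode_concat prepends the prefix to that
# symbol's token, mode_unroll additionally adds a production for the combined
# token, mode_keep re-emits the "< prefix >" wrapper in front of the symbol,
# and mode_discard drops the prefix together with the symbol that follows it.
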
def grammar_unroll_lists(grammar):
    delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            listrule = []
            prefix = None
            s = State()
            for c in rule:
                s.update(c.token)
                if c.token == '{':
                    continue
                if c.token == '}':
                    if len(listrule) == 0:
                        raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
                    name = ""
                    delpos = []
                    for i, rule in enumerate(listrule):
                        if rule.token in delimiters:
                            delpos.append(i)
                            continue
                        if rule.tp != t_target_lang:
                            continue
                        name += tok2name(rule.token) + "_"

                    # not really: there are lists without delimiters, too
                    #if len(delpos) != 1:
                    #    p.dump(ERR)
                    #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))

                    name = name + "my_list"
                    newrule.append(RuleComp(name, t_target_lang))
                    p = Symbol(name, rules=[[], listrule])
                    #p = Symbol(name)
                    #p.rules = [ [], listrule ]
                    listrule = []
                    if name not in grammar.keys():
                        grammar[name] = p
                        continue
                    if not p.equals(grammar[name]):
                        grammar[name].dump(ERR, "old list production")
                        p.dump(ERR, "new list production")
                        raise Exception("List production expands to already taken name", name)
                    continue
                if s.in_list():
                    listrule.append(c)
                    continue
                newrule.append(c)
            newrules.append(newrule)
        grammar[tok].rules = newrules
    return grammar

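# Unrolling sketch (illustrative) for grammar_unroll_lists(): when a rule
# contains a "{ ... }" repetition, the braced part is replaced by a reference
# to a generated list symbol (named from its member tokens plus "my_list"),
# and that symbol is added to the grammar with two rules: an empty rule and
# the original list body.
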
def rules_unroll_options(rules):
    r = []
    found = False
    slog(DEBUG, "unrolling", format_rules(rules))
    for rule in rules:
        square = 0
        option = []
        newrule = []
        for i, c in enumerate(rule):
            if c.tp == t_grammar:
                if c.token == '[':
                    square += 1
                elif c.token == ']':
                    square -= 1
                if square == 1:
                    continue
            if square >= 1:
                option.append(c)
                continue
            slog(DEBUG, "square =", square)
            assert(square == 0)
            n = len(option)
            if n == 0:
                newrule.append(c)
                continue
            # first without option
            replaced = newrule[:]
            tail = rule[i+1:len(rule)]
            slog(DEBUG, "i = ", i)
            slog(DEBUG, "n = ", n)
            slog(DEBUG, "rule = ", format_rule(rule))
            slog(DEBUG, "tail = ", format_rule(tail))
            slog(DEBUG, ",-------------------------")
            slog(DEBUG, "head = ", format_rule(replaced))
            replaced.extend(tail)
            slog(DEBUG, "head + tail = ", format_rule(replaced))
            r.append(replaced)
            # then with option inserted
            for unrolled in rules_unroll_options([ option ]):
                replaced = newrule[:]
                slog(DEBUG, ",-------------------------")
                slog(DEBUG, "head = ", format_rule(replaced))
                slog(DEBUG, "unrolled = ", format_rule(unrolled))
                replaced.extend(unrolled)
                slog(DEBUG, "head + unrolled =", format_rule(replaced))
                replaced.extend(tail)
                slog(DEBUG, "head + unrolled + tail =", format_rule(replaced))
                r.append(replaced)
            found = True
            break
        if not found:
            r.append(newrule)
    if found:
        return rules_unroll_options(r)
    return r

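# Unrolling sketch (illustrative) for rules_unroll_options(): a rule with the
# component tokens  a [ b ] c  expands into the two bracket-free rules
# "a c" and "a b c"; remaining options are handled by re-running the
# expansion until no '[' ... ']' group is left.
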
def grammar_unroll_options(grammar):
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_options(p.rules)
    return grammar

def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    if checked is None:
        checked = set()
    if found is None:
        found = dict()
    indent = ' ' * depth * 2
    if tok in found.keys():
        slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
        return found[tok]
    slog(INFO, indent + " + " + tok)
    indent = indent + " "
    if tok in terminals:
        found[tok] = 1
        slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
        return 1
    if tok in orphans:
        found[tok] = 1
        slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
        return 1
    #if tok in lexicals:
    #    found[tok] = 1
    #    slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
    #    return 1
    if tok in checked:
        slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
        return sys.maxint

    slog(INFO, indent, "checked =", ' '.join(checked))
    checked.add(tok)
    if tok not in grammar.keys():
        slog(ERR, "tried to validate unknown token \"" + tok + "\"")
        return sys.maxint
    p = grammar[tok]
    r = sys.maxint
    slog(INFO, indent, p.token, "has", len(p.rules), "rules")
    only_optional = True
    for rule in p.rules:
        slog(INFO, indent, "testing rule", format_rule(rule))
        if tok in [ c.token for c in rule ]:
            continue
        mn = sys.maxint
        mx = 0
        s = State()
        for c in rule:
            slog(INFO, indent, "testing token", c.token)
            if c.tp == t_grammar and s.update(c.token):
                continue
            if c.tp != t_target_lang:
                slog(INFO, indent, " token", c.token, "is not a VHDL token")
                continue
            only_optional = False
            # same "found" argument in next call?
            rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
            slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
            if rr == sys.maxint or rr is None:
                slog(INFO, indent, " got error for token", c.token)
                mn = sys.maxint
                mx = 0
                break
            if rr > mx:
                slog(INFO, indent, " adjusting mx to", rr)
                mx = rr
            if rr < mn:
                slog(INFO, indent, " adjusting mn to", rr)
                mn = rr
        if mn == sys.maxint or mx == 0: # unusable as escape route
            slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule))
            continue
        slog(INFO, indent, "after checking all rules, mx is", mx)
        if mx < r:
            slog(INFO, indent, "setting return value to max", mx)
            r = mx
    if only_optional:
        slog(INFO, indent, tok, "has only optional rules, accepting")
        r = 0
    if r != sys.maxint:
        r += 1
        slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
        found[tok] = r
    slog(INFO, indent, "returning", r, "for token", tok)
    return r

def grammar_check(grammar, check_symbols = None):
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
                continue
            for rule in p.rules:
                for c in rule:
                    if c.tp == t_grammar:
                        continue
                    elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
            slog(ERR, "No way out for", tok)
            sys.exit(1)
        if not tok in grammar.keys():
            slog(ERR, "Token", tok, "has no production")
            sys.exit(1)
        slog(INFO, tok, "->", str(rr))

def grammar_lhss_map(grammar):
    r = dict()
    for t in grammar.keys():
        r[t] = set()
    for t, p in grammar.iteritems():
        for rule in p.rules:
            for c in rule:
                if c.tp == t_target_lang:
                    r[c.token].add(t)
    return r

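# Map sketch (illustrative) for grammar_lhss_map(): with productions "a = c ;"
# and "b = c ;" (and c defined elsewhere in the grammar) the returned dict maps
# 'c' -> {'a', 'b'}: for every symbol, the set of left-hand sides whose rules
# mention it.
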
def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
    lhss = dmap[rhs]
    for lhs in lhss:
        if lhs in buf:
            continue
        buf.add(lhs)
        if lhs in stop:
            slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion")
            continue
        if recursive:
            do_grammar_lhss(dmap, stop, lhs, buf, recursive)

def grammar_lhss(dmap, stop, symbols, recursive = False):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_lhss(dmap, stop, s, r, recursive)
    return r

def do_grammar_rhss(grammar, stop, sym, buf):
    p = grammar[sym]
    for rule in p.rules:
        for c in rule:
            if c.tp != t_target_lang:
                continue
            if c.token in stop:
                continue
            if c.token in buf:
                continue
            buf.add(c.token)
            do_grammar_rhss(grammar, stop, c.token, buf)

def grammar_rhss(grammar, stop, symbols):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_rhss(grammar, stop, s, r)
    return r

def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
    if sym in stop:
        return False
    # Does this have to be recursive?
    defined = grammar_lhss(dmap, stop, set([sym]))
    slog(INFO, " symbol", sym, "defines:", ', '.join(defined))
    if not len(defined):
        return True
    for d in defined:
        if d in stop:
            continue
        if d in checked:
            continue
        checked.add(d)
        if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
            return True
    return False

def do_grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    rhss = grammar_rhss(grammar, set(), doomed)
    for rhs in rhss:
        slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
        if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
            slog(INFO, " symbol", rhs, "is not in use")
            r.add(rhs)
        slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
    return r

def grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    while True:
        unused = do_grammar_unused(grammar, dmap, r)
        slog(INFO, "unused:", ', '.join(unused))
        slog(INFO, "r: ", ', '.join(r))
        if unused == r:
            break
        r |= unused
    return r

# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
    slog(INFO, "-------- removing symbols:", ', '.join(symbols))
    dmap = grammar_lhss_map(grammar)
    unused = grammar_unused(grammar, dmap, symbols)
    for s in unused:
        slog(INFO, " + removing symbol", s)
        del grammar[s]
    return grammar

# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
    grammar_cut_symbols(grammar, symbols)
    for s in symbols:
        slog(INFO, " + adding empty production for symbol", s)
        p = Symbol(s)
        p.set_type(p_literal)
        grammar[s] = p

    return grammar

def create_ebnf(grammar):
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
        out += ' ' * indent + ' ;\n'
    return out

def create_yacc(grammar):
    indent = ' ' * 40
    width = 0
    for t, p in grammar.iteritems():
        if p.term is not None:
            continue
        if len(t) > width:
            width = len(t)
    spaces = 0
    while spaces < width:
        spaces += 8
    indent = '\t' * (spaces / 8)

    out = ""

    # preamble
    out += textwrap.dedent("""\
%{
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdarg.h>

#include <vector>
#include <string>

#include "include/defs.h"
#include "include/vhdl2017.h"
#include "include/lex.vhdl2017.h"
#include "include/vhdl2017.tab.h"

using namespace std;
using namespace v2017;

namespace {

typedef vector<const char *> wrap_t;
const wrap_t curly_braces{ "{", "}" };
const wrap_t round_braces{ "(", ")" };

}

#ifdef __cplusplus
// extern "C" {
#endif

%}

""")

    # types
    out += textwrap.dedent("""\
%union {
""")

    types = grammar_get_types(grammar)
    for t in types.keys():
        out += '\n\tv2017::' + t + '_t *' + t + ';'
    out += '\n'

    out += textwrap.dedent("""\
}

""")

    # yydecl
    out += textwrap.dedent("""\
%{
// int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
YY_DECL;
%}
""")

    # terminal tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_terminal:
            #out += '%token <String> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'

    # regex tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_literal:
            #out += '%token <String> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'

    # types
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_ruleset:
            out += '%type <' + tok2sym(p.token) + '> ' + t + (40 - len(t)) * ' ' + '/* ' + t + ' */' +'\n'

    out += textwrap.dedent("""\

%define parse.error verbose
%define api.pure full
%param { struct context *context } { void *scanner }
""")

    # productions
    out += '\n%%\n\n'
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        #if p.is_lexical_element is True:
        #    continue
        if len(p.rules) == 0:
            raise Exception("Symbol ", p.str(), "has no rules")
        first = True
        n_rule = 0
        for rule in p.rules:
            n_rule += 1
            n = 0
            s = State()
            if first:
                out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
                first = False
            else:
                out += indent + "| " + format_yacc_rule(rule) + "\n"
            out += indent + "{" + "\n"
            out += indent + "\t" + "$$->type = v2017::" + t + "::t_" + str(n_rule) + ";\n"
            tokens = []
            for c in rule:
                if c.tp == t_target_lang:
                    tokens.append(c.token)
            idx = 0
            for c in rule:
                n += 1
                if c.tp == t_grammar:
                    s.update(c.token)
                    continue
                p = grammar[c.token]
                #if is_terminal(c.token) is not None:
                #    continue
                if p.tp not in [ p_ruleset ]:
                    continue
                tp = tok2name(c.token)
                suffix = ''
                if tokens.count(c.token) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                out += indent + "\t" + \
                    "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
                    " = new " + p.datatype + "(*$" + str(n) + ");\n"
            out += indent + "}" + "\n"
        out += indent + ";\n\n"

    # tail
    out += '\n%%\n\n'

    out += textwrap.dedent("""
#ifdef __cplusplus
// } /* extern "C" */
#endif
""")

    return out + "\n"

def create_lex(grammar):

    ignore = ""

    out = textwrap.dedent("""\
%option reentrant
%option bison-bridge

%{
#include <slog.h>

#include "include/defs.h"
#include "include/vhdl2017.h"

// #include "include/lex.vhdl2017.h"
#include "include/vhdl2017.tab.h"

using namespace v2017;

/* This is too late in the Flex generated file to work. Still lots of
 * prototypes are spat into it above it, which end up with C++ linkage, of
 * course, causing the linkages to be inconsistent to the functions below this
 * extern "C". Only way I found was to use C++ is to use it on Bison only, and
 * have Flex use C instead. */
#ifdef __cplusplus
// extern "C" {
#endif

#ifdef _REMOVE_ME
static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
#endif

%}

%%

\\n { context->line++; }

""")

    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert(p.term[0] == '"')
            assert(p.term[-1] == '"')
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    out += textwrap.dedent("""\

%{/* basic_identifier */%}
%{/* extended_identifier */%}
%{/* based_integer */%}
%{/* bit_value */%}
%{/* numeric_literal */%}
%{/* enumeration_literal */%}
%{/* string_literal */%}
%{/* bit_string_literal */%}
%{/* character_literal */%}
%{/* graphic_character */%}
%{/* basic_character */%}
%{/* integer */%}

""")

    ignore += textwrap.dedent("""\

%{ /* not sure how to handle literals >> */ %}
\\"[ \\!#-~]*\\" |
\\'[0-1]\\' {
    // get_string(yylval_param, yyscanner, 1);
    /* Gets a string excluding " or ' */
    int skip = 1;
    int i;

    for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
    yytext[i] = 0;
    YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
    lv->txt=(char *)malloc(i+1);
    strcpy(lv->txt, yytext+skip);

    return STRING;
}

#[0-9a-f]*# {
    // get_based_string(yylval_param, yyscanner, 1); /* skip leading # */
    /* Gets a string excluding # */
    int i;
    int skip = 1;

    for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
    yytext[i] = 0;
    YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
    lv->txt = (char *)malloc(i+1);
    strcpy(lv->txt, yytext + skip);

    return BASED;
}

[a-zA-Z_$][a-zA-Z0-9_$.]* {
    YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
    lv->txt=(char *)malloc(strlen(yytext)+1);
    strcpy(lv->txt, yytext);
    return NAME;
}

[0-9]+ {
    YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
    sscanf(yytext, "%d", &lv->n);
    return NATURAL;
}

""")

    out += textwrap.dedent("""\
. {
    return yytext[0];
}

%{/* not sure how to handle literals << */%}

%%

void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
{
    struct yyguts_t *yyg =(struct yyguts_t*)scanner;
    // vp_log(context->vp, VP_LOG_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->lineno);
    slog(PRI_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->line);
}

int FB_SYM(wrap)(void *scanner)
{
    return 1;
}

struct vp_scanner {
    YY_BUFFER_STATE buf;
    void *scanner;
    char *str;
};

/* utilities which need to be placed here, because I can't find
 * yylex_init() / _destroy() in any generated header file (??) */
struct vp_scanner *vhdl_default_init_scanner(const char *str)
{
    struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));

    yylex_init(&r->scanner);
    r->str = strdup(str);
    r->buf = yy_scan_string(r->str, r->scanner);
    FB_SYM(set_extra)(r, r->scanner);
    // yyset_in(stdin, r->scanner);
    // yyset_out(stdout, r->scanner);
    return r;
}

void *vhdl_default_scanner_get_data(const struct vp_scanner *scanner)
{
    return scanner->scanner;
}

void vhdl_default_cleanup_scanner(struct vp_scanner *scanner)
{
    free(scanner->str);
    yy_delete_buffer(scanner->buf, scanner->scanner);
    yylex_destroy(scanner->scanner);
    free(scanner);
}

#ifdef __cplusplus
// } // extern "C"
#endif

""")

    return out

def create_header(grammar, mip, namespace = None):
    out = "#ifndef " + mip + '\n#define ' + mip + '\n\n'
    if namespace is not None:
        out += 'namespace ' + namespace + '{\n\n'

    types = grammar_get_types(grammar)

    # struct forward declarations
    for t, members in types.iteritems():
        if len(members):
            out += '\nstruct ' + t + ';'
    out += '\n'

    # struct / non-struct typedefs
    for t, members in types.iteritems():
        if not len(members):
            out += '\ntypedef const char ' + t + '_t;'
            continue
        out += '\ntypedef struct ' + t + ' ' + t + '_t;'
    out += '\n'

    # struct definitions
    for t, rules in types.iteritems():
        if not len(rules):
            continue
        out += '\n\nstruct ' + t + ' {\n'

        # rule structs
        n = 0
        for rule in rules:
            n += 1
            idx = 0
            out += '\n\tstruct ' + 'r' + str(n) + '_t {'
            for m in rule:
                suffix = ''
                if rule.count(m) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                p = grammar[m]
                out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
            out += '\n\t};'

        # type enum
        n = 0
        out += '\n\n\tenum {'
        for rule in rules:
            n += 1
            out += '\n\t\tt_' + str(n) + ','
        out += '\n\t} type;'
        out += '\n'

        # data union
        n = 0
        out += '\n\tunion {'
        for rule in rules:
            n += 1
            out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
        out += '\n\t} data;'

        # struct done
        out += '\n};'

    out += '\n'

    if namespace is not None:
        out += '\n} /* namespace ' + namespace + '*/'
    out += '\n#endif /* #ifndef ' + mip + ' */'

    return out

class GrammarCmd(jwutils.Cmd):

    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        p.add_argument("input", help="input file")
        p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False)
        p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat)
        p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False)
        p.add_argument('--check-symbols', help='check symbols, whitespace-separated or "all"', nargs='?', default='')
        p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='')
        p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='')
        return p

    def processGrammar(self, args, grammar):
        if args.fix_extensions not in fix_extensions_mode:
            raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
        grammar = grammar_fix_extensions(grammar, args.fix_extensions)
        if args.unroll_lists:
            grammar = grammar_unroll_lists(grammar)
        if args.unroll_options:
            grammar = grammar_unroll_options(grammar)
        if len(args.check_symbols):
            check_symbols = []
            if args.check_symbols == 'all':
                args.check_symbols = ''
            check_symbols = args.check_symbols.split()
            grammar_check(grammar, check_symbols)
        if len(args.trim_symbols):
            grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(','))
        if len(args.cut_symbols):
            grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(','))
        return grammar