From 5b7635823812755ce09b257c1823e3e48050fb87 Mon Sep 17 00:00:00 2001
From: Jan Lindemann
Date: Wed, 25 Oct 2017 12:41:51 +0200
Subject: [PATCH] Add grammar.py for generating compiler-compilers

grammar.py is meant as a compiler-compiler compiler: it takes a grammar
structure and emits flex and bison input files. It can also parse EBNF.

The code still contains remnants of VHDL-specific handling, because parsing
VHDL is what it was originally written for.

Signed-off-by: Jan Lindemann
---
 tools/python/jwutils/grammar.py | 1138 +++++++++++++++++++++++++++++++
 1 file changed, 1138 insertions(+)
 create mode 100644 tools/python/jwutils/grammar.py

diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py
new file mode 100644
index 0000000..e6420b9
--- /dev/null
+++ b/tools/python/jwutils/grammar.py
@@ -0,0 +1,1138 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+import re
+import lxml.etree as ET
+import textwrap
+from collections import OrderedDict
+from abc import abstractmethod
+
+import jwutils
+
+from jwutils.log import *
+
+t_grammar = "grammar"
+t_target_lang = "target"
+
+p_ruleset = "ruleset"
+p_terminal = "term"
+p_literal = "literal"
+p_lexical = "lexical"
+
+mode_unroll = "unroll"
+mode_concat = "concat"
+mode_keep = "keep"
+mode_discard = "discard"
+fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
+
+member_prefix = ''
+
+special_terminals = {
+ "`" : "BACKTICK",
+ "^" : "CARET",
+ "<" : "LT",
+ "<<" : "LEFT_SHIFT",
+ "<=" : "LTE",
+ "<=>" : "SPACE_SHIP",
+ "<>" : "NE",
+ "=" : "EQ",
+ "=>" : "EG",
+ ">" : "GT",
+ ">=" : "GE",
+ ">>" : "RIGHT_SHIFT",
+ "|" : "PIPE",
+ "_" : "UNDERSCORE",
+ "," : "COMMA",
+ ";" : "SEMICOLON",
+ ":" : "COLON",
+ ":=" : "DEFINE",
+ "?" : "QM",
+ "?<" : "QM_LT",
+ "?<=" : "QM_LE",
+ "?=" : "QM_EQ",
+ "?>" : "QM_GT",
+ "?>=" : "QM_GE",
+ "??" : "QM_QM",
+ "?/=" : "QM_DIV_EQ",
+ "/" : "DIV",
+ "/=" : "DIV_EQ",
+ "."
: "DOT", + "\"" : "DQUOTE", + "'" : "QUOTE", + "(" : "LPAREN", + ")" : "RPAREN", + "[" : "LBRACKET", + "]" : "RBRACKET", + "@" : "AT", + "*" : "ASTERISK", + "**" : "DASTERISK", + "\\" : "BACKSLASH", + "&" : "AMPERSAND", + "#" : "NUMBER_SIGN", + "+" : "PLUS", + "-" : "MINUS" +} + +token_regexes = { + "PSL_Property_Declaration" : "property[ \t]+[^;]+;", + "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;", + "PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;", + "PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;", + "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}", +} + +def dump(obj): + for c, v in obj.iteritems(): + slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v))) + +def cleanup_token(tok): + tok = tok.strip() + if len(tok) == 0: + return None + return tok + +def is_terminal(tok): + if not tok.startswith('"'): + return None + if not tok.endswith('"'): + raise Exception('Token "' + tok + '" isn\'t entirely enclosed in quotes, ends with "' + tok[-1:] + '"') + return tok[1:-1] + +def tok2name(tok): + tok = cleanup_token(tok) + term = is_terminal(tok) + if term is not None: + if term in special_terminals.keys(): + return special_terminals[term] + return term + return tok + +def tok2sym(tok): + tok = cleanup_token(tok) + term = is_terminal(tok) + if term is not None: + if term in special_terminals.keys(): + return "T_" + special_terminals[term].upper() + return "T_" + term.upper() + return tok + +def tok2regex(tok): + if tok in token_regexes.keys(): + return token_regexes[tok] + return re.escape(tok) + +def format_rule(rule): + return ' '.join(c.str() for c in rule) + +def format_rules(rules): + return ', '.join(format_rule(rule) for rule in rules) + +def format_yacc_rule(rule): + r = '' + for c in rule: + r += tok2sym(c.token) + ' ' + return r[:-1] + +class RuleComp: + + def __init__(self, token, tp): + self.token = token + self.tp = tp + slog(INFO, "creating rule >" + self.str() + "<") + + def __eq__(self, rhs): + if self.token != rhs.token: + return False + if self.tp != rhs.tp: + return False + return True + + def __ne__(self, rhs): + return not self.__eq__(rhs) + + def str(self): + return "{" + self.tp + ": " + self.token + "}" + +class State: + + def __init__(self): + self.curly = 0 + self.square = 0 + + def reset(self): + self.curly = 0 + self.square = 0 + + def optional(self): + return self.square != 0 or self.curly != 0 + + def update(self, tok): + if tok == '[': + self.square += 1 + elif tok == ']': + self.square -= 1 + elif tok == '{': + self.curly += 1 + elif tok == '}': + self.curly -= 1 + if self.curly < 0 or self.square < 0: + raise Exception("Unbalanced BNF bracket", tok) + return self.optional() + + def in_list(self): + return self.curly > 0 + + def in_option(self): + return self.square > 0 + +class Symbol: + + def __init__(self, token, tp = p_ruleset, rules = None): + self.tp = tp + self.token = token + self.name = tok2name(token) + self.sym = tok2sym(token) + self.term = None + self.regex = None + self.is_lexical_element = False + self.rules = [] + self.datatype = None + if rules is not None: + self.rules = rules + self.set_type(tp) + + def set_type(self, tp): + if tp == p_ruleset: + self.term = None + self.regex = None + self.is_lexical_element = False + self.datatype = self.token + '_t' + elif tp == p_literal: + assert(len(self.rules) == 0) + self.term = None + self.regex = tok2regex(self.token) + self.is_lexical_element = False + self.datatype = "std::string" + elif tp == 
p_lexical: + assert(len(self.rules) == 0) + self.term = None + self.regex = tok2regex(self.token) + self.is_lexical_element = True + self.datatype = None + elif tp == p_terminal: + assert(len(self.rules) == 0) + self.term = self.token + self.regex = tok2regex(self.token) + self.is_lexical_element = False + self.datatype = None + else: + self.dump() + raise Exception("Tried to set production to unknown type", tp) + self.tp = tp + + def str(self): + r = self.name + ' = ' + format_rules(self.rules) + return r + + def equals(self, rhs): + for k, v in self.__dict__.iteritems(): + if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]: + slog(WARNING, k, self.__dict__[k], rhs.__dict__[k]) + return False + return True + + def dump(self, prio = NOTICE, msg=""): + slog(prio, ",----------------", msg) + slog(prio, "| type =", self.tp) + slog(prio, "| name =", self.name) + slog(prio, "| token =", self.token) + slog(prio, "| sym =", self.sym) + slog(prio, "| term =", self.term) + slog(prio, "| regex =", self.regex) + slog(prio, "| datatype =", self.datatype) + slog(prio, "| is_lexical_element =", self.is_lexical_element) + slog(prio, "| rules =", format_rules(self.rules)) + slog(prio, "`----------------", msg) + +def grammar_get_types(grammar): + types = dict() + for t, p in grammar.iteritems(): + if not len(p.rules): + continue + if p.term is not None: + continue + ruleno = 1 + rules = [] + for rule in p.rules: + members = [] + for c in rule: + if c.tp != t_target_lang: + continue + if not c.token in grammar.keys(): + p.dump(ERR) + raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule)) + pp = grammar[c.token] + if pp.tp is p_terminal: + continue + members.append(tok2sym(c.token)) + if True or len(members): + rules.append(members) + if t in types.keys(): + raise Exception("Tried to add type", t, "twice") + types[t] = rules + return types + +def grammar_fix_extensions(grammar, mode): + for tok, p in grammar.iteritems(): + newrules = [] + for rule in p.rules: + newrule = [] + prefix = "" + paren = 0 + for c in rule: + if c.tp != t_target_lang: + if c.token == '(': + paren += 1 + elif c.token == ')': + paren -= 1 + if paren <= 1: # don't add first level of control chars + continue + newrule.append(c) + continue + if paren > 0: + assert(len(c.token) != 0) + prefix += '_' + c.token + continue + if len(prefix) > 0: + prefix = prefix[1:] + slog(INFO, "Found prefix", prefix) + if mode == mode_keep: + newrule.append(RuleComp('(', t_grammar)) + newrule.append(RuleComp(prefix, t_target_lang)) + newrule.append(RuleComp(')', t_grammar)) + newrule.append(c) + elif mode == mode_discard: + prefix = '' + continue + elif mode in [ mode_unroll, mode_concat ]: + combined = RuleComp(c.token, c.tp) + combined.token = prefix + c.token + prefix = '' + newrule.append(combined) + slog(INFO, "Appended new rule return value", combined.token) + if mode == mode_unroll: + if combined.token in grammar.keys(): + continue + grammar[combined.token] = Symbol(combined.token, rules=[[c]]) + else: + raise Exception("Invalid prefix mode", mode) + prefix = '' + continue + newrule.append(c) + if len(prefix): # undigested prefix, since it was the last + newrule.append(RuleComp(prefix[1:], t_target_lang)) + newrules.append(newrule) + grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only + return grammar # TODO: not sure if this is necessary + +def grammar_unroll_lists(grammar): + delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function 
parameter to make it generic + for tok, p in grammar.iteritems(): + newrules = [] + for rule in p.rules: + newrule = [] + listrule = [] + prefix = None + s = State() + for c in rule: + s.update(c.token) + if c.token == '{': + continue + if c.token == '}': + if len(listrule) == 0: + raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule)) + name = "" + delpos = [] + for i, rule in enumerate(listrule): + if rule.token in delimiters: + delpos.append(i) + continue + if rule.tp != t_target_lang: + continue + name += tok2name(rule.token) + "_" + if len(delpos) != 1: + raise Exception("need exactly one delimiter in list rule:", ' '.join(listrule)) + name = name + "my_list" + newrule.append(RuleComp(name, t_target_lang)) + p = Symbol(name, rules=[[], listrule]) + #p = Symbol(name) + #p.rules = [ [], listrule ] + listrule = [] + if name not in grammar.keys(): + grammar[name] = p + continue + if not p.equals(grammar[name]): + p.dump(ERR, "old list production") + p.dump(ERR, "new list production") + raise Exception("List production expands to already taken name", name) + continue + if s.in_list(): + listrule.append(c) + continue + newrule.append(c) + newrules.append(newrule) + grammar[tok].rules = newrules + return grammar + +def rules_unroll_options(rules): + r = [] + found = False + slog(DEBUG, "unrolling", format_rules(rules)) + for rule in rules: + square = 0 + option = [] + newrule = [] + for i, c in enumerate(rule): + if c.tp == t_grammar: + if c.token == '[': + square += 1 + elif c.token == ']': + square -= 1 + if square == 1: + continue + if square >= 1: + option.append(c) + continue + slog(DEBUG, "square =", square) + assert(square == 0) + n = len(option) + if n == 0: + newrule.append(c) + continue + # first without option + replaced = newrule[:] + tail = rule[i+1:len(rule)] + slog(DEBUG, "i = ", i) + slog(DEBUG, "n = ", n) + slog(DEBUG, "rule = ", format_rule(rule)) + slog(DEBUG, "tail = ", format_rule(tail)) + slog(DEBUG, ",-------------------------") + slog(DEBUG, "head = ", format_rule(replaced)) + replaced.extend(tail) + slog(DEBUG, "head + tail = ", format_rule(replaced)) + r.append(replaced) + # then with option inserted + for unrolled in rules_unroll_options([ option ]): + replaced = newrule[:] + slog(DEBUG, ",-------------------------") + slog(DEBUG, "head = ", format_rule(replaced)) + slog(DEBUG, "unrolled = ", format_rule(unrolled)) + replaced.extend(unrolled) + slog(DEBUG, "head + unrolled =", format_rule(replaced)) + replaced.extend(tail) + slog(DEBUG, "head + unrolled + tail =", format_rule(replaced)) + r.append(replaced) + found = True + break + if not found: + r.append(newrule) + if found: + return rules_unroll_options(r) + return r + +def grammar_unroll_options(grammar): + for tok, p in grammar.iteritems(): + grammar[tok].rules = rules_unroll_options(p.rules) + return grammar + +def format_ebnf_rule(grammar, rule): + r = "" + for comp in rule: + if comp.tp == t_grammar: + r = r + " " + comp.token + continue + r = r + " " + comp.token + return r.strip() + +def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None): + if checked is None: + checked = set() + if found is None: + found = dict() + indent = ' ' * depth * 2 + if tok in found.keys(): + slog(INFO, indent + " + found cached", tok, "with depth", found[tok]) + return found[tok] + slog(INFO, indent + " + " + tok) + indent = indent + " " + if tok in terminals: + found[tok] = 1 + slog(INFO, indent + " + found terminal", tok, "with depth", found[tok]) + 
return 1 + if tok in orphans: + found[tok] = 1 + slog(INFO, indent + " + found orphan", tok, "with depth", found[tok]) + return 1 + #if tok in lexicals: + # found[tok] = 1 + # slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok]) + # return 1 + if tok in checked: + slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked)) + return sys.maxint + + slog(INFO, indent, "checked =", ' '.join(checked)) + checked.add(tok) + if tok not in grammar.keys(): + slog(ERR, "tried to validate unknown token \"" + tok + "\"") + return sys.maxint + p = grammar[tok] + r = sys.maxint + slog(INFO, indent, p.token, "has", len(p.rules), "rules") + only_optional = True + for rule in p.rules: + slog(INFO, indent, "testing rule", format_rule(rule)) + if tok in [ c.token for c in rule ]: + continue + mn = sys.maxint + mx = 0 + s = State() + for c in rule: + slog(INFO, indent, "testing token", c.token) + if c.tp == t_grammar and s.update(c.token): + continue + if c.tp != t_target_lang: + slog(INFO, indent, " token", c.token, "is not a VHDL token") + continue + only_optional = False + # same "found" argument in next call? + rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found) + slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx) + if rr == sys.maxint or rr is None: + slog(INFO, indent, " got error for token", c.token) + mn = sys.maxint + mx = 0 + break + if rr > mx: + slog(INFO, indent, " adjusting mx to", rr) + mx = rr + if rr < mn: + slog(INFO, indent, " adjusting mn to", rr) + mn = rr + if mn == sys.maxint or mx == 0: # unusable as escape route + slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule)) + continue + slog(INFO, indent, "after checking all rules, mx is", mx) + if mx < r: + slog(INFO, indent, "setting return value to max", mx) + r = mx + if only_optional: + slog(INFO, indent, tok, "has only optional rules, accepting") + r = 0 + if r != sys.maxint: + r += 1 + slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps") + found[tok] = r + slog(INFO, indent, "returning", r, "for token", tok) + return r + +def grammar_check(grammar, selements = None): + if selements is None: + selements = [] + terminals = {tok for tok, p in grammar.iteritems() if p.term is not None} + orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar} + lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True} + elements = set() + if len(selements) == 0: + for tok, p in grammar.iteritems(): + if p.is_lexical_element: + elements.add(tok) + continue + for rule in p.rules: + for c in rule: + if c.tp == t_grammar: + continue + elements.add(c.token) + selements = sorted(list(elements)) + found = dict() + for tok in selements: + slog(INFO, "======= checking", tok) + rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found) + if rr == sys.maxint: + slog(ERR, "No way out for", tok, "in production", p.str()) + exit(1) + if not tok in grammar.keys(): + slog(ERR, "Token", tok, "has no production") + exit(1) + slog(INFO, tok, "->", str(rr)) + +def grammar_lhss_map(grammar): + r = dict() + for t in grammar.keys(): + r[t] = set() + for t, p in grammar.iteritems(): + for rule in p.rules: + for c in rule: + if c.tp == t_target_lang: + r[c.token].add(t) + return r + +def do_grammar_lhss(dmap, stop, rhs, buf, recursive): + lhss = dmap[rhs] + for lhs in lhss: + if lhs in buf: + continue + buf.add(lhs) + if lhs in 
stop: + slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion") + continue + if recursive: + do_grammar_lhss(dmap, stop, lhs, buf, recursive) + +def grammar_lhss(dmap, stop, symbols, recursive = False): + r = set() + for s in symbols: + if s in r: + continue + do_grammar_lhss(dmap, stop, s, r, recursive) + return r + +def do_grammar_rhss(grammar, stop, sym, buf): + p = grammar[sym] + for rule in p.rules: + for c in rule: + if c.tp != t_target_lang: + continue + if c.token in stop: + continue + if c.token in buf: + continue + buf.add(c.token) + do_grammar_rhss(grammar, stop, c.token, buf) + +def grammar_rhss(grammar, stop, symbols): + r = set() + for s in symbols: + if s in r: + continue + do_grammar_rhss(grammar, stop, s, r) + return r + +def grammar_symbol_in_use(grammar, dmap, stop, checked, sym): + if sym in stop: + return False + # Does this have to be recursive? + defined = grammar_lhss(dmap, stop, set([sym])) + slog(INFO, " symbol", sym, "defines:", ', '.join(defined)) + if not len(defined): + return True + for d in defined: + if d in stop: + continue + if d in checked: + continue + checked.add(d) + if grammar_symbol_in_use(grammar, dmap, stop, checked, d): + return True + return False + +def do_grammar_unused(grammar, dmap, doomed): + r = set(doomed) + rhss = grammar_rhss(grammar, set(), doomed) + for rhs in rhss: + slog(INFO, "+++ checking if symbol", rhs, "is in use >>") + if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs): + slog(INFO, " symbol", rhs, "is not in use") + r.add(rhs) + slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<") + return r + +def grammar_unused(grammar, dmap, doomed): + r = set(doomed) + while True: + unused = do_grammar_unused(grammar, dmap, r) + slog(INFO, "unused:", ', '.join(unused)) + slog(INFO, "r: ", ', '.join(r)) + if unused == r: + break + r |= unused + return r + +# eradicate symbols from tree +def grammar_cut_symbols(grammar, symbols): + slog(INFO, "-------- removing symbols:", ', '.join(symbols)) + dmap = grammar_lhss_map(grammar) + unused = grammar_unused(grammar, dmap, symbols) + for s in unused: + slog(INFO, " + removing symbol", s) + del grammar[s] + return grammar + +# make symbol an empty literal production +def grammar_trim_symbols(grammar, symbols): + grammar_cut_symbols(grammar, symbols) + for s in symbols: + slog(INFO, " + adding empty production for symbol", s) + p = Symbol(s) + p.set_type(p_literal) + grammar[s] = p + + return grammar + +def create_ebnf(grammar): + indent = 40 + for t, p in grammar.iteritems(): + if not len(p.rules): + slog(INFO, "ignoring " + t + " (has no rules)\n") + continue + out = t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + for rule in p.rules[1:]: + out += "\n" + ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + return out + "\n" + +def create_yacc(grammar): + indent = ' ' * 40 + width = 0 + for t, p in grammar.iteritems(): + if p.term is not None: + continue + if len(t) > width: + width = len(t) + spaces = 0 + while spaces < width: + spaces += 8 + indent = '\t' * (spaces / 8) + + out = "" + + # preamble + out += textwrap.dedent("""\ + %{ + #include + #include + #include + #include + #include + + #include + #include + + #include "include/defs.h" + #include "include/vhdl2017.h" + #include "include/lex.vhdl2017.h" + #include "include/vhdl2017.tab.h" + + using namespace std; + using namespace v2017; + + namespace { + + typedef vector wrap_t; + const wrap_t curly_braces{ "{", "}" }; + const wrap_t round_braces{ "(", ")" }; + + } + + 
#ifdef __cplusplus + // extern "C" { + #endif + + %} + + """) + + # types + out += textwrap.dedent("""\ + %union { + """) + + types = grammar_get_types(grammar) + for t in types.keys(): + out += '\n\tv2017::' + t + '_t *' + t + ';' + out += '\n' + + out += textwrap.dedent("""\ + } + + """) + + # yydecl + out += textwrap.dedent("""\ + %{ + // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner); + YY_DECL; + %} + """) + + # terminal tokens + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_terminal: + #out += '%token ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + + # regex tokens + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_literal: + #out += '%token ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + + # types + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_ruleset: + out += '%type <' + tok2sym(p.token) + '> ' + t + (40 - len(t)) * ' ' + '/* ' + t + ' */' +'\n' + + out += textwrap.dedent("""\ + + %define parse.error verbose + %define api.pure full + %param { struct context *context } { void *scanner } + """) + + # productions + out += '\n%%\n\n' + for t, p in grammar.iteritems(): + if not len(p.rules): + continue + if p.term is not None: + continue + #if p.is_lexical_element is True: + # continue + if len(p.rules) == 0: + raise Exception("Symbol ", p.str(), "has no rules") + first = True + n_rule = 0 + for rule in p.rules: + n_rule += 1 + n = 0 + s = State() + if first: + out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n" + first = False + else: + out += indent + "| " + format_yacc_rule(rule) + "\n" + out += indent + "{" + "\n" + out += indent + "\t" + "$$->type = v2017::" + t + "::t_" + str(n_rule) + ";\n" + tokens = [] + for c in rule: + if c.tp == t_target_lang: + tokens.append(c.token) + idx = 0 + for c in rule: + n += 1 + if c.tp == t_grammar: + s.update(c.token) + continue + p = grammar[c.token] + #if is_terminal(c.token) is not None: + # continue + if p.tp not in [ p_ruleset ]: + continue + tp = tok2name(c.token) + suffix = '' + if tokens.count(c.token) > 1: + idx += 1 + suffix = '_' + str(idx) + out += indent + "\t" + \ + "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \ + " = new " + p.datatype + "(*$" + str(n) + ");\n" + out += indent + "}" + "\n" + out += indent + ";\n\n" + + # tail + out += '\n%%\n\n' + + out += textwrap.dedent(""" + #ifdef __cplusplus + // } /* extern "C" */ + #endif + """) + + return out + "\n" + +def create_lex(grammar): + + ignore = "" + + out = textwrap.dedent("""\ + %option reentrant + %option bison-bridge + + %{ + #include + + #include "include/defs.h" + #include "include/vhdl2017.h" + + // #include "include/lex.vhdl2017.h" + #include "include/vhdl2017.tab.h" + + using namespace v2017; + + /* This is too late in the Flex generated file to work. Still lots of + * prototypes are spat into it above it, which end up with C++ linkage, of + * course, causing the linkages to be inconsistent to the functions below this + * extern "C". Only way I found was to use C++ is to use it on Bison only, and + * have Flex use C instead. 
*/ + #ifdef __cplusplus + // extern "C" { + #endif + + #ifdef _REMOVE_ME + static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); + static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); + #endif + + %} + + %% + + \\n { context->line++; } + + """) + + for t, p in grammar.iteritems(): + if p.term is not None: + # \. { return T_DOT; } + assert(p.term[0] == '"') + assert(p.term[-1] == '"') + out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n' + + out += textwrap.dedent("""\ + + %{/* basic_identifier */%} + %{/* extended_identifier */%} + %{/* based_integer */%} + %{/* bit_value */%} + %{/* numeric_literal */%} + %{/* enumeration_literal */%} + %{/* string_literal */%} + %{/* bit_string_literal */%} + %{/* character_literal */%} + %{/* graphic_character */%} + %{/* basic_character */%} + %{/* integer */%} + + """) + + ignore += textwrap.dedent("""\ + + %{ /* not sure how to handle literals >> */ %} + \\"[ \\!#-~]*\\" | + \\'[0-1]\\' { + // get_string(yylval_param, yyscanner, 1); + /* Gets a string excluding " or ' */ + int skip = 1; + int i; + + for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++); + yytext[i] = 0; + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt=(char *)malloc(i+1); + strcpy(lv->txt, yytext+skip); + + return STRING; + } + + #[0-9a-f]*# { + // get_based_string(yylval_param, yyscanner, 1); /* skip leading # */ + /* Gets a string excluding # */ + int i; + int skip = 1; + + for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++); + yytext[i] = 0; + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt = (char *)malloc(i+1); + strcpy(lv->txt, yytext + skip); + + return BASED; + } + + [a-zA-Z_$][a-zA-Z0-9_$.]* { + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt=(char *)malloc(strlen(yytext)+1); + strcpy(lv->txt, yytext); + return NAME; + } + + [0-9]+ { + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + sscanf(yytext, "%d", &lv->n); + return NATURAL; + } + + """) + + out += textwrap.dedent("""\ + . { + return yytext[0]; + } + + %{/* not sure how to handle literals << */%} + + %% + + void FB_SYM(error)(struct context *context, void *scanner, const char *msg) + { + struct yyguts_t *yyg =(struct yyguts_t*)scanner; + // vp_log(context->vp, VP_LOG_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->lineno); + slog(PRI_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->line); + } + + int FB_SYM(wrap)(void *scanner) + { + return 1; + } + + struct vp_scanner { + YY_BUFFER_STATE buf; + void *scanner; + char *str; + }; + + /* utilities which need to be placed here, because I can't find + * yylex_init() / _destroy() in any generated header file (??) 
*/ + struct vp_scanner *vhdl_default_init_scanner(const char *str) + { + struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r)); + + yylex_init(&r->scanner); + r->str = strdup(str); + r->buf = yy_scan_string(r->str, r->scanner); + FB_SYM(set_extra)(r, r->scanner); + // yyset_in(stdin, r->scanner); + // yyset_out(stdout, r->scanner); + return r; + } + + void *vhdl_default_scanner_get_data(const struct vp_scanner *scanner) + { + return scanner->scanner; + } + + void vhdl_default_cleanup_scanner(struct vp_scanner *scanner) + { + free(scanner->str); + yy_delete_buffer(scanner->buf, scanner->scanner); + yylex_destroy(scanner->scanner); + free(scanner); + } + + #ifdef __cplusplus + // } // extern "C" + #endif + + """) + + return out + +def create_header(grammar, mip, namespace = None): + out = "#ifndef " + mip + '\n#define ' + mip + '\n\n' + if namespace is not None: + out += 'namespace ' + namespace + '{\n\n' + + types = grammar_get_types(grammar) + + # struct forward declarations + for t, members in types.iteritems(): + if len(members): + out += '\nstruct ' + t + ';' + out += '\n' + + # struct / non-struct typedefs + for t, members in types.iteritems(): + if not len(members): + out += '\ntypedef const char ' + t + '_t;' + continue + out += '\ntypedef struct ' + t + ' ' + t + '_t;' + out += '\n' + + # struct definitions + for t, rules in types.iteritems(): + if not len(rules): + continue + out += '\n\nstruct ' + t + ' {\n' + + # rule structs + n = 0 + for rule in rules: + n += 1 + idx = 0 + out += '\n\tstruct ' + 'r' + str(n) + '_t {' + for m in rule: + suffix = '' + if rule.count(m) > 1: + idx += 1 + suffix = '_' + str(idx) + p = grammar[m] + out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';' + out += '\n\t};' + + # type enum + n = 0 + out += '\n\n\tenum {' + for rule in rules: + n += 1 + out += '\n\t\tt_' + str(n) + ',' + out += '\n\t} type;' + out += '\n' + + # data union + n = 0 + out += '\n\tunion {' + for rule in rules: + n += 1 + out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';' + out += '\n\t} data;' + + # struct done + out += '\n};' + + out += '\n' + + if namespace is not None: + out += '\n} /* namespace ' + namespace + '*/' + out += '\n#endif /* #ifndef + ' + mip + ' */' + + return out + +class GrammarCmd(jwutils.Cmd): + + def __init__(self, name, help): + super(GrammarCmd, self).__init__(name, help=help) + + def add_parser(self, parsers): + p = super(GrammarCmd, self).add_parser(parsers) + p.add_argument("input", help="input file") + p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False) + p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat) + p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False) + p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='') + p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='') + p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='') + return p + + def processGrammar(self, args, grammar): + if args.fix_extensions not in fix_extensions_mode: + raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions") + grammar = grammar_fix_extensions(grammar, args.fix_extensions) + if args.unroll_lists: + grammar = grammar_unroll_lists(grammar) + if args.unroll_options: + grammar = 
grammar_unroll_options(grammar) + if len(args.check_symbols): + check_symbols = [] + if args.check_symbols == 'all': + args.check_symbols = '' + check_symbols = args.check_symbols.split() + grammar_check(grammar, check_symbols) + if len(args.trim_symbols): + grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(',')) + if len(args.cut_symbols): + grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(',')) + return grammar
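
A minimal usage sketch (illustrative only, not part of the patch): it assumes
Python 2, that the jwutils package and its dependencies are importable so the
module can be loaded as jwutils.grammar, and that the toy grammar and the
output file names are free to choose. It hand-builds a grammar from
Symbol/RuleComp objects and emits the bison and flex sources with
create_yacc() and create_lex():

    # Toy grammar: statement = identifier ";"
    from collections import OrderedDict
    from jwutils.grammar import (Symbol, RuleComp, t_target_lang,
                                 p_terminal, p_literal, create_yacc, create_lex)

    grammar = OrderedDict()
    # Terminal symbols keep their quoted token text, as create_lex() expects.
    grammar['";"'] = Symbol('";"', p_terminal)
    # Literals become regex-matched scanner tokens.
    grammar['identifier'] = Symbol('identifier', p_literal)
    # One ruleset production whose single rule references the two tokens above.
    grammar['statement'] = Symbol('statement', rules=[[
        RuleComp('identifier', t_target_lang),
        RuleComp('";"', t_target_lang),
    ]])

    # Output paths are placeholders.
    with open('statement.y', 'w') as f:
        f.write(create_yacc(grammar))
    with open('statement.l', 'w') as f:
        f.write(create_lex(grammar))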