From 5b7635823812755ce09b257c1823e3e48050fb87 Mon Sep 17 00:00:00 2001
From: Jan Lindemann
Date: Wed, 25 Oct 2017 12:41:51 +0200
Subject: [PATCH] Add grammar.py for generating compiler-compilers

grammar.py is meant as a compiler-compiler compiler: it takes a grammar
structure and emits flex and bison input files. It can also parse EBNF.

The code still contains remnants of VHDL-specific handling, because parsing
VHDL is what it was originally written for.

Signed-off-by: Jan Lindemann
---
 tools/python/jwutils/grammar.py | 1138 +++++++++++++++++++++++++++++++
 1 file changed, 1138 insertions(+)
 create mode 100644 tools/python/jwutils/grammar.py

diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py
new file mode 100644
index 0000000..e6420b9
--- /dev/null
+++ b/tools/python/jwutils/grammar.py
@@ -0,0 +1,1138 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+import re
+import lxml.etree as ET
+import textwrap
+from collections import OrderedDict
+from abc import abstractmethod
+
+import jwutils
+
+from jwutils.log import *
+
+t_grammar = "grammar"
+t_target_lang = "target"
+
+p_ruleset = "ruleset"
+p_terminal = "term"
+p_literal = "literal"
+p_lexical = "lexical"
+
+mode_unroll = "unroll"
+mode_concat = "concat"
+mode_keep = "keep"
+mode_discard = "discard"
+fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]
+
+member_prefix = ''
+
+special_terminals = {
+ "`" : "BACKTICK",
+ "^" : "CARET",
+ "<" : "LT",
+ "<<" : "LEFT_SHIFT",
+ "<=" : "LTE",
+ "<=>" : "SPACE_SHIP",
+ "<>" : "NE",
+ "=" : "EQ",
+ "=>" : "EG",
+ ">" : "GT",
+ ">=" : "GE",
+ ">>" : "RIGHT_SHIFT",
+ "|" : "PIPE",
+ "_" : "UNDERSCORE",
+ "," : "COMMA",
+ ";" : "SEMICOLON",
+ ":" : "COLON",
+ ":=" : "DEFINE",
+ "?" : "QM",
+ "?<" : "QM_LT",
+ "?<=" : "QM_LE",
+ "?=" : "QM_EQ",
+ "?>" : "QM_GT",
+ "?>=" : "QM_GE",
+ "??" : "QM_QM",
+ "?/=" : "QM_DIV_EQ",
+ "/" : "DIV",
+ "/=" : "DIV_EQ",
+ "."
: "DOT", + "\"" : "DQUOTE", + "'" : "QUOTE", + "(" : "LPAREN", + ")" : "RPAREN", + "[" : "LBRACKET", + "]" : "RBRACKET", + "@" : "AT", + "*" : "ASTERISK", + "**" : "DASTERISK", + "\\" : "BACKSLASH", + "&" : "AMPERSAND", + "#" : "NUMBER_SIGN", + "+" : "PLUS", + "-" : "MINUS" +} + +token_regexes = { + "PSL_Property_Declaration" : "property[ \t]+[^;]+;", + "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;", + "PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;", + "PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;", + "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}", +} + +def dump(obj): + for c, v in obj.iteritems(): + slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v))) + +def cleanup_token(tok): + tok = tok.strip() + if len(tok) == 0: + return None + return tok + +def is_terminal(tok): + if not tok.startswith('"'): + return None + if not tok.endswith('"'): + raise Exception('Token "' + tok + '" isn\'t entirely enclosed in quotes, ends with "' + tok[-1:] + '"') + return tok[1:-1] + +def tok2name(tok): + tok = cleanup_token(tok) + term = is_terminal(tok) + if term is not None: + if term in special_terminals.keys(): + return special_terminals[term] + return term + return tok + +def tok2sym(tok): + tok = cleanup_token(tok) + term = is_terminal(tok) + if term is not None: + if term in special_terminals.keys(): + return "T_" + special_terminals[term].upper() + return "T_" + term.upper() + return tok + +def tok2regex(tok): + if tok in token_regexes.keys(): + return token_regexes[tok] + return re.escape(tok) + +def format_rule(rule): + return ' '.join(c.str() for c in rule) + +def format_rules(rules): + return ', '.join(format_rule(rule) for rule in rules) + +def format_yacc_rule(rule): + r = '' + for c in rule: + r += tok2sym(c.token) + ' ' + return r[:-1] + +class RuleComp: + + def __init__(self, token, tp): + self.token = token + self.tp = tp + slog(INFO, "creating rule >" + self.str() + "<") + + def __eq__(self, rhs): + if self.token != rhs.token: + return False + if self.tp != rhs.tp: + return False + return True + + def __ne__(self, rhs): + return not self.__eq__(rhs) + + def str(self): + return "{" + self.tp + ": " + self.token + "}" + +class State: + + def __init__(self): + self.curly = 0 + self.square = 0 + + def reset(self): + self.curly = 0 + self.square = 0 + + def optional(self): + return self.square != 0 or self.curly != 0 + + def update(self, tok): + if tok == '[': + self.square += 1 + elif tok == ']': + self.square -= 1 + elif tok == '{': + self.curly += 1 + elif tok == '}': + self.curly -= 1 + if self.curly < 0 or self.square < 0: + raise Exception("Unbalanced BNF bracket", tok) + return self.optional() + + def in_list(self): + return self.curly > 0 + + def in_option(self): + return self.square > 0 + +class Symbol: + + def __init__(self, token, tp = p_ruleset, rules = None): + self.tp = tp + self.token = token + self.name = tok2name(token) + self.sym = tok2sym(token) + self.term = None + self.regex = None + self.is_lexical_element = False + self.rules = [] + self.datatype = None + if rules is not None: + self.rules = rules + self.set_type(tp) + + def set_type(self, tp): + if tp == p_ruleset: + self.term = None + self.regex = None + self.is_lexical_element = False + self.datatype = self.token + '_t' + elif tp == p_literal: + assert(len(self.rules) == 0) + self.term = None + self.regex = tok2regex(self.token) + self.is_lexical_element = False + self.datatype = "std::string" + elif tp == 
p_lexical: + assert(len(self.rules) == 0) + self.term = None + self.regex = tok2regex(self.token) + self.is_lexical_element = True + self.datatype = None + elif tp == p_terminal: + assert(len(self.rules) == 0) + self.term = self.token + self.regex = tok2regex(self.token) + self.is_lexical_element = False + self.datatype = None + else: + self.dump() + raise Exception("Tried to set production to unknown type", tp) + self.tp = tp + + def str(self): + r = self.name + ' = ' + format_rules(self.rules) + return r + + def equals(self, rhs): + for k, v in self.__dict__.iteritems(): + if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]: + slog(WARNING, k, self.__dict__[k], rhs.__dict__[k]) + return False + return True + + def dump(self, prio = NOTICE, msg=""): + slog(prio, ",----------------", msg) + slog(prio, "| type =", self.tp) + slog(prio, "| name =", self.name) + slog(prio, "| token =", self.token) + slog(prio, "| sym =", self.sym) + slog(prio, "| term =", self.term) + slog(prio, "| regex =", self.regex) + slog(prio, "| datatype =", self.datatype) + slog(prio, "| is_lexical_element =", self.is_lexical_element) + slog(prio, "| rules =", format_rules(self.rules)) + slog(prio, "`----------------", msg) + +def grammar_get_types(grammar): + types = dict() + for t, p in grammar.iteritems(): + if not len(p.rules): + continue + if p.term is not None: + continue + ruleno = 1 + rules = [] + for rule in p.rules: + members = [] + for c in rule: + if c.tp != t_target_lang: + continue + if not c.token in grammar.keys(): + p.dump(ERR) + raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule)) + pp = grammar[c.token] + if pp.tp is p_terminal: + continue + members.append(tok2sym(c.token)) + if True or len(members): + rules.append(members) + if t in types.keys(): + raise Exception("Tried to add type", t, "twice") + types[t] = rules + return types + +def grammar_fix_extensions(grammar, mode): + for tok, p in grammar.iteritems(): + newrules = [] + for rule in p.rules: + newrule = [] + prefix = "" + paren = 0 + for c in rule: + if c.tp != t_target_lang: + if c.token == '(': + paren += 1 + elif c.token == ')': + paren -= 1 + if paren <= 1: # don't add first level of control chars + continue + newrule.append(c) + continue + if paren > 0: + assert(len(c.token) != 0) + prefix += '_' + c.token + continue + if len(prefix) > 0: + prefix = prefix[1:] + slog(INFO, "Found prefix", prefix) + if mode == mode_keep: + newrule.append(RuleComp('(', t_grammar)) + newrule.append(RuleComp(prefix, t_target_lang)) + newrule.append(RuleComp(')', t_grammar)) + newrule.append(c) + elif mode == mode_discard: + prefix = '' + continue + elif mode in [ mode_unroll, mode_concat ]: + combined = RuleComp(c.token, c.tp) + combined.token = prefix + c.token + prefix = '' + newrule.append(combined) + slog(INFO, "Appended new rule return value", combined.token) + if mode == mode_unroll: + if combined.token in grammar.keys(): + continue + grammar[combined.token] = Symbol(combined.token, rules=[[c]]) + else: + raise Exception("Invalid prefix mode", mode) + prefix = '' + continue + newrule.append(c) + if len(prefix): # undigested prefix, since it was the last + newrule.append(RuleComp(prefix[1:], t_target_lang)) + newrules.append(newrule) + grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only + return grammar # TODO: not sure if this is necessary + +def grammar_unroll_lists(grammar): + delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function 
parameter to make it generic + for tok, p in grammar.iteritems(): + newrules = [] + for rule in p.rules: + newrule = [] + listrule = [] + prefix = None + s = State() + for c in rule: + s.update(c.token) + if c.token == '{': + continue + if c.token == '}': + if len(listrule) == 0: + raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule)) + name = "" + delpos = [] + for i, rule in enumerate(listrule): + if rule.token in delimiters: + delpos.append(i) + continue + if rule.tp != t_target_lang: + continue + name += tok2name(rule.token) + "_" + if len(delpos) != 1: + raise Exception("need exactly one delimiter in list rule:", ' '.join(listrule)) + name = name + "my_list" + newrule.append(RuleComp(name, t_target_lang)) + p = Symbol(name, rules=[[], listrule]) + #p = Symbol(name) + #p.rules = [ [], listrule ] + listrule = [] + if name not in grammar.keys(): + grammar[name] = p + continue + if not p.equals(grammar[name]): + p.dump(ERR, "old list production") + p.dump(ERR, "new list production") + raise Exception("List production expands to already taken name", name) + continue + if s.in_list(): + listrule.append(c) + continue + newrule.append(c) + newrules.append(newrule) + grammar[tok].rules = newrules + return grammar + +def rules_unroll_options(rules): + r = [] + found = False + slog(DEBUG, "unrolling", format_rules(rules)) + for rule in rules: + square = 0 + option = [] + newrule = [] + for i, c in enumerate(rule): + if c.tp == t_grammar: + if c.token == '[': + square += 1 + elif c.token == ']': + square -= 1 + if square == 1: + continue + if square >= 1: + option.append(c) + continue + slog(DEBUG, "square =", square) + assert(square == 0) + n = len(option) + if n == 0: + newrule.append(c) + continue + # first without option + replaced = newrule[:] + tail = rule[i+1:len(rule)] + slog(DEBUG, "i = ", i) + slog(DEBUG, "n = ", n) + slog(DEBUG, "rule = ", format_rule(rule)) + slog(DEBUG, "tail = ", format_rule(tail)) + slog(DEBUG, ",-------------------------") + slog(DEBUG, "head = ", format_rule(replaced)) + replaced.extend(tail) + slog(DEBUG, "head + tail = ", format_rule(replaced)) + r.append(replaced) + # then with option inserted + for unrolled in rules_unroll_options([ option ]): + replaced = newrule[:] + slog(DEBUG, ",-------------------------") + slog(DEBUG, "head = ", format_rule(replaced)) + slog(DEBUG, "unrolled = ", format_rule(unrolled)) + replaced.extend(unrolled) + slog(DEBUG, "head + unrolled =", format_rule(replaced)) + replaced.extend(tail) + slog(DEBUG, "head + unrolled + tail =", format_rule(replaced)) + r.append(replaced) + found = True + break + if not found: + r.append(newrule) + if found: + return rules_unroll_options(r) + return r + +def grammar_unroll_options(grammar): + for tok, p in grammar.iteritems(): + grammar[tok].rules = rules_unroll_options(p.rules) + return grammar + +def format_ebnf_rule(grammar, rule): + r = "" + for comp in rule: + if comp.tp == t_grammar: + r = r + " " + comp.token + continue + r = r + " " + comp.token + return r.strip() + +def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None): + if checked is None: + checked = set() + if found is None: + found = dict() + indent = ' ' * depth * 2 + if tok in found.keys(): + slog(INFO, indent + " + found cached", tok, "with depth", found[tok]) + return found[tok] + slog(INFO, indent + " + " + tok) + indent = indent + " " + if tok in terminals: + found[tok] = 1 + slog(INFO, indent + " + found terminal", tok, "with depth", found[tok]) + 
return 1 + if tok in orphans: + found[tok] = 1 + slog(INFO, indent + " + found orphan", tok, "with depth", found[tok]) + return 1 + #if tok in lexicals: + # found[tok] = 1 + # slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok]) + # return 1 + if tok in checked: + slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked)) + return sys.maxint + + slog(INFO, indent, "checked =", ' '.join(checked)) + checked.add(tok) + if tok not in grammar.keys(): + slog(ERR, "tried to validate unknown token \"" + tok + "\"") + return sys.maxint + p = grammar[tok] + r = sys.maxint + slog(INFO, indent, p.token, "has", len(p.rules), "rules") + only_optional = True + for rule in p.rules: + slog(INFO, indent, "testing rule", format_rule(rule)) + if tok in [ c.token for c in rule ]: + continue + mn = sys.maxint + mx = 0 + s = State() + for c in rule: + slog(INFO, indent, "testing token", c.token) + if c.tp == t_grammar and s.update(c.token): + continue + if c.tp != t_target_lang: + slog(INFO, indent, " token", c.token, "is not a VHDL token") + continue + only_optional = False + # same "found" argument in next call? + rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found) + slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx) + if rr == sys.maxint or rr is None: + slog(INFO, indent, " got error for token", c.token) + mn = sys.maxint + mx = 0 + break + if rr > mx: + slog(INFO, indent, " adjusting mx to", rr) + mx = rr + if rr < mn: + slog(INFO, indent, " adjusting mn to", rr) + mn = rr + if mn == sys.maxint or mx == 0: # unusable as escape route + slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule)) + continue + slog(INFO, indent, "after checking all rules, mx is", mx) + if mx < r: + slog(INFO, indent, "setting return value to max", mx) + r = mx + if only_optional: + slog(INFO, indent, tok, "has only optional rules, accepting") + r = 0 + if r != sys.maxint: + r += 1 + slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps") + found[tok] = r + slog(INFO, indent, "returning", r, "for token", tok) + return r + +def grammar_check(grammar, selements = None): + if selements is None: + selements = [] + terminals = {tok for tok, p in grammar.iteritems() if p.term is not None} + orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar} + lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True} + elements = set() + if len(selements) == 0: + for tok, p in grammar.iteritems(): + if p.is_lexical_element: + elements.add(tok) + continue + for rule in p.rules: + for c in rule: + if c.tp == t_grammar: + continue + elements.add(c.token) + selements = sorted(list(elements)) + found = dict() + for tok in selements: + slog(INFO, "======= checking", tok) + rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found) + if rr == sys.maxint: + slog(ERR, "No way out for", tok, "in production", p.str()) + exit(1) + if not tok in grammar.keys(): + slog(ERR, "Token", tok, "has no production") + exit(1) + slog(INFO, tok, "->", str(rr)) + +def grammar_lhss_map(grammar): + r = dict() + for t in grammar.keys(): + r[t] = set() + for t, p in grammar.iteritems(): + for rule in p.rules: + for c in rule: + if c.tp == t_target_lang: + r[c.token].add(t) + return r + +def do_grammar_lhss(dmap, stop, rhs, buf, recursive): + lhss = dmap[rhs] + for lhs in lhss: + if lhs in buf: + continue + buf.add(lhs) + if lhs in 
stop: + slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion") + continue + if recursive: + do_grammar_lhss(dmap, stop, lhs, buf, recursive) + +def grammar_lhss(dmap, stop, symbols, recursive = False): + r = set() + for s in symbols: + if s in r: + continue + do_grammar_lhss(dmap, stop, s, r, recursive) + return r + +def do_grammar_rhss(grammar, stop, sym, buf): + p = grammar[sym] + for rule in p.rules: + for c in rule: + if c.tp != t_target_lang: + continue + if c.token in stop: + continue + if c.token in buf: + continue + buf.add(c.token) + do_grammar_rhss(grammar, stop, c.token, buf) + +def grammar_rhss(grammar, stop, symbols): + r = set() + for s in symbols: + if s in r: + continue + do_grammar_rhss(grammar, stop, s, r) + return r + +def grammar_symbol_in_use(grammar, dmap, stop, checked, sym): + if sym in stop: + return False + # Does this have to be recursive? + defined = grammar_lhss(dmap, stop, set([sym])) + slog(INFO, " symbol", sym, "defines:", ', '.join(defined)) + if not len(defined): + return True + for d in defined: + if d in stop: + continue + if d in checked: + continue + checked.add(d) + if grammar_symbol_in_use(grammar, dmap, stop, checked, d): + return True + return False + +def do_grammar_unused(grammar, dmap, doomed): + r = set(doomed) + rhss = grammar_rhss(grammar, set(), doomed) + for rhs in rhss: + slog(INFO, "+++ checking if symbol", rhs, "is in use >>") + if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs): + slog(INFO, " symbol", rhs, "is not in use") + r.add(rhs) + slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<") + return r + +def grammar_unused(grammar, dmap, doomed): + r = set(doomed) + while True: + unused = do_grammar_unused(grammar, dmap, r) + slog(INFO, "unused:", ', '.join(unused)) + slog(INFO, "r: ", ', '.join(r)) + if unused == r: + break + r |= unused + return r + +# eradicate symbols from tree +def grammar_cut_symbols(grammar, symbols): + slog(INFO, "-------- removing symbols:", ', '.join(symbols)) + dmap = grammar_lhss_map(grammar) + unused = grammar_unused(grammar, dmap, symbols) + for s in unused: + slog(INFO, " + removing symbol", s) + del grammar[s] + return grammar + +# make symbol an empty literal production +def grammar_trim_symbols(grammar, symbols): + grammar_cut_symbols(grammar, symbols) + for s in symbols: + slog(INFO, " + adding empty production for symbol", s) + p = Symbol(s) + p.set_type(p_literal) + grammar[s] = p + + return grammar + +def create_ebnf(grammar): + indent = 40 + for t, p in grammar.iteritems(): + if not len(p.rules): + slog(INFO, "ignoring " + t + " (has no rules)\n") + continue + out = t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + for rule in p.rules[1:]: + out += "\n" + ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + return out + "\n" + +def create_yacc(grammar): + indent = ' ' * 40 + width = 0 + for t, p in grammar.iteritems(): + if p.term is not None: + continue + if len(t) > width: + width = len(t) + spaces = 0 + while spaces < width: + spaces += 8 + indent = '\t' * (spaces / 8) + + out = "" + + # preamble + out += textwrap.dedent("""\ + %{ + #include + #include + #include + #include + #include + + #include + #include + + #include "include/defs.h" + #include "include/vhdl2017.h" + #include "include/lex.vhdl2017.h" + #include "include/vhdl2017.tab.h" + + using namespace std; + using namespace v2017; + + namespace { + + typedef vector wrap_t; + const wrap_t curly_braces{ "{", "}" }; + const wrap_t round_braces{ "(", ")" }; + + } + + 
#ifdef __cplusplus + // extern "C" { + #endif + + %} + + """) + + # types + out += textwrap.dedent("""\ + %union { + """) + + types = grammar_get_types(grammar) + for t in types.keys(): + out += '\n\tv2017::' + t + '_t *' + t + ';' + out += '\n' + + out += textwrap.dedent("""\ + } + + """) + + # yydecl + out += textwrap.dedent("""\ + %{ + // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner); + YY_DECL; + %} + """) + + # terminal tokens + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_terminal: + #out += '%token ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + + # regex tokens + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_literal: + #out += '%token ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n' + + # types + out += '\n' + for t, p in grammar.iteritems(): + if p.tp == p_ruleset: + out += '%type <' + tok2sym(p.token) + '> ' + t + (40 - len(t)) * ' ' + '/* ' + t + ' */' +'\n' + + out += textwrap.dedent("""\ + + %define parse.error verbose + %define api.pure full + %param { struct context *context } { void *scanner } + """) + + # productions + out += '\n%%\n\n' + for t, p in grammar.iteritems(): + if not len(p.rules): + continue + if p.term is not None: + continue + #if p.is_lexical_element is True: + # continue + if len(p.rules) == 0: + raise Exception("Symbol ", p.str(), "has no rules") + first = True + n_rule = 0 + for rule in p.rules: + n_rule += 1 + n = 0 + s = State() + if first: + out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n" + first = False + else: + out += indent + "| " + format_yacc_rule(rule) + "\n" + out += indent + "{" + "\n" + out += indent + "\t" + "$$->type = v2017::" + t + "::t_" + str(n_rule) + ";\n" + tokens = [] + for c in rule: + if c.tp == t_target_lang: + tokens.append(c.token) + idx = 0 + for c in rule: + n += 1 + if c.tp == t_grammar: + s.update(c.token) + continue + p = grammar[c.token] + #if is_terminal(c.token) is not None: + # continue + if p.tp not in [ p_ruleset ]: + continue + tp = tok2name(c.token) + suffix = '' + if tokens.count(c.token) > 1: + idx += 1 + suffix = '_' + str(idx) + out += indent + "\t" + \ + "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \ + " = new " + p.datatype + "(*$" + str(n) + ");\n" + out += indent + "}" + "\n" + out += indent + ";\n\n" + + # tail + out += '\n%%\n\n' + + out += textwrap.dedent(""" + #ifdef __cplusplus + // } /* extern "C" */ + #endif + """) + + return out + "\n" + +def create_lex(grammar): + + ignore = "" + + out = textwrap.dedent("""\ + %option reentrant + %option bison-bridge + + %{ + #include + + #include "include/defs.h" + #include "include/vhdl2017.h" + + // #include "include/lex.vhdl2017.h" + #include "include/vhdl2017.tab.h" + + using namespace v2017; + + /* This is too late in the Flex generated file to work. Still lots of + * prototypes are spat into it above it, which end up with C++ linkage, of + * course, causing the linkages to be inconsistent to the functions below this + * extern "C". Only way I found was to use C++ is to use it on Bison only, and + * have Flex use C instead. 
*/ + #ifdef __cplusplus + // extern "C" { + #endif + + #ifdef _REMOVE_ME + static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); + static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); + #endif + + %} + + %% + + \\n { context->line++; } + + """) + + for t, p in grammar.iteritems(): + if p.term is not None: + # \. { return T_DOT; } + assert(p.term[0] == '"') + assert(p.term[-1] == '"') + out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n' + + out += textwrap.dedent("""\ + + %{/* basic_identifier */%} + %{/* extended_identifier */%} + %{/* based_integer */%} + %{/* bit_value */%} + %{/* numeric_literal */%} + %{/* enumeration_literal */%} + %{/* string_literal */%} + %{/* bit_string_literal */%} + %{/* character_literal */%} + %{/* graphic_character */%} + %{/* basic_character */%} + %{/* integer */%} + + """) + + ignore += textwrap.dedent("""\ + + %{ /* not sure how to handle literals >> */ %} + \\"[ \\!#-~]*\\" | + \\'[0-1]\\' { + // get_string(yylval_param, yyscanner, 1); + /* Gets a string excluding " or ' */ + int skip = 1; + int i; + + for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++); + yytext[i] = 0; + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt=(char *)malloc(i+1); + strcpy(lv->txt, yytext+skip); + + return STRING; + } + + #[0-9a-f]*# { + // get_based_string(yylval_param, yyscanner, 1); /* skip leading # */ + /* Gets a string excluding # */ + int i; + int skip = 1; + + for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++); + yytext[i] = 0; + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt = (char *)malloc(i+1); + strcpy(lv->txt, yytext + skip); + + return BASED; + } + + [a-zA-Z_$][a-zA-Z0-9_$.]* { + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + lv->txt=(char *)malloc(strlen(yytext)+1); + strcpy(lv->txt, yytext); + return NAME; + } + + [0-9]+ { + YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); + sscanf(yytext, "%d", &lv->n); + return NATURAL; + } + + """) + + out += textwrap.dedent("""\ + . { + return yytext[0]; + } + + %{/* not sure how to handle literals << */%} + + %% + + void FB_SYM(error)(struct context *context, void *scanner, const char *msg) + { + struct yyguts_t *yyg =(struct yyguts_t*)scanner; + // vp_log(context->vp, VP_LOG_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->lineno); + slog(PRI_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->line); + } + + int FB_SYM(wrap)(void *scanner) + { + return 1; + } + + struct vp_scanner { + YY_BUFFER_STATE buf; + void *scanner; + char *str; + }; + + /* utilities which need to be placed here, because I can't find + * yylex_init() / _destroy() in any generated header file (??) 
*/ + struct vp_scanner *vhdl_default_init_scanner(const char *str) + { + struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r)); + + yylex_init(&r->scanner); + r->str = strdup(str); + r->buf = yy_scan_string(r->str, r->scanner); + FB_SYM(set_extra)(r, r->scanner); + // yyset_in(stdin, r->scanner); + // yyset_out(stdout, r->scanner); + return r; + } + + void *vhdl_default_scanner_get_data(const struct vp_scanner *scanner) + { + return scanner->scanner; + } + + void vhdl_default_cleanup_scanner(struct vp_scanner *scanner) + { + free(scanner->str); + yy_delete_buffer(scanner->buf, scanner->scanner); + yylex_destroy(scanner->scanner); + free(scanner); + } + + #ifdef __cplusplus + // } // extern "C" + #endif + + """) + + return out + +def create_header(grammar, mip, namespace = None): + out = "#ifndef " + mip + '\n#define ' + mip + '\n\n' + if namespace is not None: + out += 'namespace ' + namespace + '{\n\n' + + types = grammar_get_types(grammar) + + # struct forward declarations + for t, members in types.iteritems(): + if len(members): + out += '\nstruct ' + t + ';' + out += '\n' + + # struct / non-struct typedefs + for t, members in types.iteritems(): + if not len(members): + out += '\ntypedef const char ' + t + '_t;' + continue + out += '\ntypedef struct ' + t + ' ' + t + '_t;' + out += '\n' + + # struct definitions + for t, rules in types.iteritems(): + if not len(rules): + continue + out += '\n\nstruct ' + t + ' {\n' + + # rule structs + n = 0 + for rule in rules: + n += 1 + idx = 0 + out += '\n\tstruct ' + 'r' + str(n) + '_t {' + for m in rule: + suffix = '' + if rule.count(m) > 1: + idx += 1 + suffix = '_' + str(idx) + p = grammar[m] + out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';' + out += '\n\t};' + + # type enum + n = 0 + out += '\n\n\tenum {' + for rule in rules: + n += 1 + out += '\n\t\tt_' + str(n) + ',' + out += '\n\t} type;' + out += '\n' + + # data union + n = 0 + out += '\n\tunion {' + for rule in rules: + n += 1 + out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';' + out += '\n\t} data;' + + # struct done + out += '\n};' + + out += '\n' + + if namespace is not None: + out += '\n} /* namespace ' + namespace + '*/' + out += '\n#endif /* #ifndef + ' + mip + ' */' + + return out + +class GrammarCmd(jwutils.Cmd): + + def __init__(self, name, help): + super(GrammarCmd, self).__init__(name, help=help) + + def add_parser(self, parsers): + p = super(GrammarCmd, self).add_parser(parsers) + p.add_argument("input", help="input file") + p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False) + p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat) + p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False) + p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='') + p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='') + p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='') + return p + + def processGrammar(self, args, grammar): + if args.fix_extensions not in fix_extensions_mode: + raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions") + grammar = grammar_fix_extensions(grammar, args.fix_extensions) + if args.unroll_lists: + grammar = grammar_unroll_lists(grammar) + if args.unroll_options: + grammar = 
grammar_unroll_options(grammar) + if len(args.check_symbols): + check_symbols = [] + if args.check_symbols == 'all': + args.check_symbols = '' + check_symbols = args.check_symbols.split() + grammar_check(grammar, check_symbols) + if len(args.trim_symbols): + grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(',')) + if len(args.cut_symbols): + grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(',')) + return grammar
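
A minimal usage sketch (illustrative only, not part of the patch): it assumes
Python 2, that the jwutils package and its dependencies are importable so the
module can be loaded as jwutils.grammar, and that the toy grammar and the
output file names are free to choose. It hand-builds a grammar from
Symbol/RuleComp objects and emits the bison and flex sources with
create_yacc() and create_lex():

    # Toy grammar: statement = identifier ";"
    from collections import OrderedDict
    from jwutils.grammar import (Symbol, RuleComp, t_target_lang,
                                 p_terminal, p_literal, create_yacc, create_lex)

    grammar = OrderedDict()
    # Terminal symbols keep their quoted token text, as create_lex() expects.
    grammar['";"'] = Symbol('";"', p_terminal)
    # Literals become regex-matched scanner tokens.
    grammar['identifier'] = Symbol('identifier', p_literal)
    # One ruleset production whose single rule references the two tokens above.
    grammar['statement'] = Symbol('statement', rules=[[
        RuleComp('identifier', t_target_lang),
        RuleComp('";"', t_target_lang),
    ]])

    # Output paths are placeholders.
    with open('statement.y', 'w') as f:
        f.write(create_yacc(grammar))
    with open('statement.l', 'w') as f:
        f.write(create_lex(grammar))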