jw-python/tools/python/jwutils/grammar.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod

import jwutils

from jwutils.log import *

t_grammar      = "grammar"
t_target_lang  = "target"

p_ruleset  = "ruleset"
p_terminal = "term"
p_literal  = "literal"
p_lexical  = "lexical"
p_special  = "special"

mode_unroll  = "unroll"
mode_concat  = "concat"
mode_keep    = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

member_prefix = ''

special_terminals = {
    "`"   : "BACKTICK",
    "^"   : "CARET",
    "<"   : "LT",
    "<<"  : "LEFT_SHIFT",
    "<="  : "LTE",
    "<=>" : "SPACE_SHIP",
    "<>"  : "NE",
    "="   : "EQ",
    "=>"  : "EG",
    ">"   : "GT",
    ">="  : "GE",
    ">>"  : "RIGHT_SHIFT",
    "|"   : "PIPE",
    "_"   : "UNDERSCORE",
    ","   : "COMMA",
    ";"   : "SEMICOLON",
    ":"   : "COLON",
    ":="  : "DEFINE",
    "?"   : "QM",
    "?<"  : "QM_LT",
    "?<=" : "QM_LE",
    "?="  : "QM_EQ",
    "?>"  : "QM_GT",
    "?>=" : "QM_GE",
    "??"  : "QM_QM",
    "?/=" : "QM_DIV_EQ",
    "/"   : "DIV",
    "/="  : "DIV_EQ",
    "."   : "DOT",
    "\""  : "DQUOTE",
    "'"   : "QUOTE",
    "("   : "LPAREN",
    ")"   : "RPAREN",
    "["   : "LBRACKET",
    "]"   : "RBRACKET",
    "@"   : "AT",
    "*"   : "ASTERISK",
    "**"  : "DASTERISK",
    "\\"  : "BACKSLASH",
    "&"   : "AMPERSAND",
    "#"   : "NUMBER_SIGN",
    "+"   : "PLUS",
    "-"   : "MINUS"
}

token_regexes = {
    "PSL_Property_Declaration"   : "property[ \t]+[^;]+;",
    "PSL_Sequence_Declaration"   : "sequence[ \t]+[^;]+;",
    "PSL_Clock_Declaration"      : "default[ \t]+clock[ \t]+[^;]+;",
    "PSL_Directive"              : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
    "PSL_Verification_Unit"      : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]

def dump(obj):
  for c, v in obj.iteritems():
    slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))

def dump_grammar(prio, grammar):
    caller = get_caller_pos()
    for t, p in grammar.iteritems():
        p.dump(prio, caller=caller)

def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok

def tok2ctype(tok):
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]:
        return t_grammar
    return t_target_lang

def is_terminal(tok):
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]

def tok2name(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
           return special_terminals[term]
        return term
    return tok

def tok2sym(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
           return "T_" + special_terminals[term].upper()
        return "T_" + term.upper()
    return tok

def tok2regex(tok):
    if tok in token_regexes.keys():
        return token_regexes[tok]
    return re.escape(tok)

def format_rule(rule):
    return ' '.join(c.str() for c in rule)

def format_rules(rules):
    return ', '.join(format_rule(rule) for rule in rules)

def format_ebnf_rule(grammar, rule):
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    if comp.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                        r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()

def format_yacc_rule(rule):
    r = ''
    for c in rule:
        r += tok2sym(c.token) + ' '
    return r[:-1]

class SourceElement:

    def __init__(self, token, line):
        self.token = token
        self.line = line

class RuleComp:

    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        if self.token != rhs.token:
            return False
        if self.tp != rhs.tp:
            return False
        return True

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def str(self):
        tp = 'u'
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"

class State:

    def __init__(self):
        self.__pair_square = ['[', ']']
        self.__pair_curly = ['{', '}']
        self.__pair_ext = ['<', '>']
        self.__pair_group = ['(', ')']
        self.__pair_comment = ['(*', '*)']
        self.__pair_special = ['?', '?']
        self.reset()

    def reset(self):
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []
        self.things = []

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok, line):
        if not self.in_comment:
            if tok == '[':
                self.square += 1
                self.things.append(self.__pair_square)
            elif tok == ']':
                self.square -= 1
                assert(self.things.pop() == self.__pair_square)
            elif tok == '{':
                self.curly += 1
                self.things.append(self.__pair_curly)
            elif tok == '}':
                self.curly -= 1
                assert(self.things.pop() == self.__pair_curly)
            elif tok == '(':
                self.group += 1
                self.things.append(self.__pair_group)
            elif tok == ')':
                self.group -= 1
                assert(self.things.pop() == self.__pair_group)
            elif tok == '<':
                self.ext += 1
                self.things.append(self.__pair_ext)
            elif tok == '>':
                self.ext -= 1
                assert(self.things.pop() == self.__pair_ext)
            elif tok == '?':
                if not self.in_special:
                    self.in_special = True
                    self.things.append(self.__pair_special)
                else:
                    self.in_special = False
                    assert(self.things.pop() == self.__pair_special)
            elif tok == '(*':
                self.in_comment = True
                self.things.append(self.__pair_comment)
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                assert(self.things.pop() == self.__pair_comment)
                self.in_comment = False

        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):
        return self.curly > 0

    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        if len(self.things) == 0:
            return None
        return self.things[-1]

class Symbol:

    def __init__(self, token, tp = None, rules = None):
        self.reset(token, tp, rules)

    def reset(self, token, tp = None, rules = None):
        if tp == None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)

    def set_type(self, tp):
        if tp == p_ruleset:
            self.term = None
            self.regex = None
            self.is_lexical_element = False
            self.datatype = self.token + '_t'
        elif tp == p_literal:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = None
            self.is_lexical_element = True
            self.datatype = 'std::string'
        elif tp == p_terminal:
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):
        r = self.name + ' = ' + format_rules(self.rules)
        return r

    def equals(self, rhs):
        for k, v in self.__dict__.iteritems():
            if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]:
                slog(WARNING, k, self.__dict__[k], rhs.__dict__[k])
                return False
        return True

    def dump(self, prio = NOTICE, msg="", caller=None):
        if caller is None:
            caller = get_caller_pos(1)
        slog(prio, ",----------------", msg, caller=caller)
        slog(prio, "| type          =", self.tp, caller=caller)
        slog(prio, "| name          =", self.name, caller=caller)
        slog(prio, "| token         =", self.token, caller=caller)
        slog(prio, "| sym           =", self.sym, caller=caller)
        slog(prio, "| term          =", self.term, caller=caller)
        slog(prio, "| regex         =", self.regex, caller=caller)
        slog(prio, "| datatype      =", self.datatype, caller=caller)
        slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
        slog(prio, "| rules         =", format_rules(self.rules), caller=caller)
        slog(prio, "`----------------", msg, caller=caller)

def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]

def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]

def grammar_tokenize_ebnf(content):
    r = []
    c = ''
    l = 0
    in_comment = False
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote == None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c

    tok = tok.strip()
    if len(tok):
        r.append((tok, l))
    return r

def grammar_add_symbol(grammar, tok, rules):
    assert(tok is not None)
    if tok in grammar.keys():
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
        grammar[tok] = s

def grammar_parse_ebnf_tokens(tokens):
    grammar = OrderedDict()
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and state.in_something() is None:
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
            exit(1)
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)

    return grammar

def grammar_parse_ebnf(content_):
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar

def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        ruleno = 1
        rules = []
        for rule in p.rules:
            members = []
            for c in rule:
                if c.tp != t_target_lang:
                    continue
                if not c.token in grammar.keys():
                    p.dump(ERR)
                    raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
                pp = grammar[c.token]
                if pp.tp is p_terminal:
                    continue
                members.append(tok2sym(c.token))
            if True or len(members):
                rules.append(members)
        if t in types.keys():
            raise Exception("Tried to add type", t, "twice")
        types[t] = rules
    return types

def grammar_fix_extensions(grammar, mode):
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            prefix = ""
            paren = 0
            for c in rule:
                if c.tp == t_grammar and c.token in ['<', '>']:
                    if c.token == '<':
                        paren += 1
                    elif c.token == '>':
                        paren -= 1
                    if paren <= 1: # don't add first level of control chars
                        continue
                    newrule.append(c)
                    continue
                if paren > 0:
                    assert(len(c.token) != 0)
                    prefix += '_' + c.token
                    continue
                if len(prefix) > 0:
                    prefix = prefix[1:]
                    slog(INFO, "Found prefix", prefix)
                    if mode == mode_keep:
                        newrule.append(RuleComp('<'))
                        newrule.append(RuleComp(prefix, t_target_lang))
                        newrule.append(RuleComp('>'))
                        newrule.append(c)
                    elif mode == mode_discard:
                        prefix = ''
                        continue
                    elif mode in [ mode_unroll, mode_concat ]:
                        combined = RuleComp(c.token, c.tp)
                        combined.token = prefix + c.token
                        prefix = ''
                        newrule.append(combined)
                        slog(INFO, "Appended new rule return value", combined.token)
                        if mode == mode_unroll:
                            if combined.token in grammar.keys():
                                continue
                            grammar[combined.token] = Symbol(combined.token, rules=[[c]])
                    else:
                        raise Exception("Invalid prefix mode", mode)
                    prefix = ''
                    continue
                newrule.append(c)
            if len(prefix): # undigested prefix, since it was the last
                newrule.append(RuleComp(prefix[1:], t_target_lang))
            newrules.append(newrule)
        grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only
    return grammar # TODO: not sure if this is necessary

def grammar_unroll_lists(grammar):
    delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            listrule = []
            prefix = None
            s = State()
            slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
            for c in rule:
                s.update(c.token, c.line)
                if c.token == '{':
                    continue
                if c.token == '}':
                    if len(listrule) == 0:
                        raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
                    name = ""
                    delpos = []
                    for i, rule in enumerate(listrule):
                        if rule.token in delimiters:
                            delpos.append(i)
                            continue
                        if rule.tp != t_target_lang:
                            continue
                        name += tok2name(rule.token) + "_"

                    # not really: there are lists without delimiters, too
                    #if len(delpos) != 1:
                    #    p.dump(ERR)
                    #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))

                    name = name + "my_list"
                    newrule.append(RuleComp(name, t_target_lang))
                    p = Symbol(name, rules=[[], listrule])
                    #p = Symbol(name)
                    #p.rules = [ [], listrule ]
                    listrule = []
                    if name not in grammar.keys():
                        grammar[name] = p
                        continue
                    if not p.equals(grammar[name]):
                        p.dump(ERR, "old list production")
                        p.dump(ERR, "new list production")
                        raise Exception("List production expands to already taken name", name)
                    continue
                if s.in_list():
                    listrule.append(c)
                    continue
                newrule.append(c)
            newrules.append(newrule)
        grammar[tok].rules = newrules
    return grammar

def rules_unroll_options(rules):
    r = []
    found = False
    slog(DEBUG, "unrolling", format_rules(rules))
    for rule in rules:
        square = 0
        option = []
        newrule = []
        for i, c in enumerate(rule):
            if c.tp == t_grammar:
                if c.token == '[':
                    square += 1
                elif c.token == ']':
                    square -= 1
                if square == 1:
                    continue
            if square >= 1:
                option.append(c)
                continue
            slog(DEBUG, "square =", square)
            assert(square == 0)
            n = len(option)
            if n == 0:
                newrule.append(c)
                continue
            # first without option
            replaced = newrule[:]
            tail = rule[i+1:len(rule)]
            slog(DEBUG, "i                      = ", i)
            slog(DEBUG, "n                      = ", n)
            slog(DEBUG, "rule                   = ", format_rule(rule))
            slog(DEBUG, "tail                   = ", format_rule(tail))
            slog(DEBUG, ",-------------------------")
            slog(DEBUG, "head                   = ", format_rule(replaced))
            replaced.extend(tail)
            slog(DEBUG, "head + tail            = ", format_rule(replaced))
            r.append(replaced)
            # then with option inserted
            for unrolled in rules_unroll_options([ option ]):
                replaced = newrule[:]
                slog(DEBUG, ",-------------------------")
                slog(DEBUG, "head                   = ", format_rule(replaced))
                slog(DEBUG, "unrolled               = ", format_rule(unrolled))
                replaced.extend(unrolled)
                slog(DEBUG, "head + unrolled        =", format_rule(replaced))
                replaced.extend(tail)
                slog(DEBUG, "head + unrolled + tail =", format_rule(replaced))
                r.append(replaced)
            found = True
            break
        if not found:
            r.append(newrule)
    if found:
        return rules_unroll_options(r)
    return r

def grammar_unroll_options(grammar):
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_options(p.rules)
    return grammar

def rules_unroll_alternatives(rules):
    r = []
    found = False
    slog(INFO, "unrolling alternatives in", format_rules(rules))
    sep = RuleComp('|')
    for rule in rules:
        if not sep in rule:
            r.append(rule)
            continue
        found = True
        state = State()
        end = len(rule) - 1
        first = last = -1
        for i, c in enumerate(rule):
            state.update(c.token, line=c.line)
            if c.token != '|' or c.tp != t_grammar:
                slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i)
                continue
            slog(INFO, "found token at position", i)
            container = state.in_something()
            slog(INFO, "thing delimiters are", container)
            if container is None:
                raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule))
            first = last = -1
            k = i - 1
            while k >= 0:
                prev = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0])
                if prev.token == container[0]:
                    first = k
                    break
                k -= 1
            if first == -1:
                raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule))
            k = i
            while k <= end:
                nxt = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against closer", container[1])
                if nxt.token == container[1]:
                    last = k
                    break
                k += 1
            if last == i:
                raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule))
            break # found what I wanted
        assert(first > 0)
        assert(last > 0)
        assert(last <= end)
        head = rule[0:first]
        mid = rule[first+1:last]
        tail = rule[last+1:end]
        slog(INFO, "first =", first, "last =", last, "end =", end)
        slog(INFO, "head = ", format_rule(head))
        slog(INFO, "mid  = ", format_rule(mid))
        slog(INFO, "tail = ", format_rule(tail))
        for m in split_list_by(mid, sep):
            unrolled_rule = head + m + tail
            r.append(unrolled_rule)
    #if found:
    #    return rules_unroll_alternatives(r)
    return r

def grammar_unroll_alternatives(grammar):
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_alternatives(p.rules)
    return grammar

def grammar_replace_whitespace(grammar):
    r = OrderedDict()
    for tok, s in grammar.iteritems():
        newrules = []
        for rule in s.rules:
            newrule = []
            for c in rule:
                newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line)
                newrule.append(newc)
            newrules.append(newrule)
        newtok = tok.replace(' ', '_')
        s.reset(newtok, tp=s.tp, rules=newrules)
        r[newtok] = s
        slog(INFO, "added symbol", newtok)
    return r

def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    if checked is None:
        checked = set()
    if found is None:
        found = dict()
    indent = ' ' * depth * 2
    if tok in found.keys():
        slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
        return found[tok]
    slog(INFO, indent + " + " + tok)
    indent = indent + "  "
    if tok in terminals:
        found[tok] = 1
        slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
        return 1
    if tok in orphans:
        found[tok] = 1
        slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
        return 1
    #if tok in lexicals:
    #    found[tok] = 1
    #    slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
    #    return 1
    if tok in checked:
        slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
        return sys.maxint

    slog(INFO, indent, "checked =", ' '.join(checked))
    checked.add(tok)
    if tok not in grammar.keys():
        slog(ERR, "tried to validate unknown token \"" + tok + "\"")
        return sys.maxint
    p = grammar[tok]
    r = sys.maxint
    slog(INFO, indent, p.token, "has", len(p.rules), "rules")
    only_optional = True
    for rule in p.rules:
        slog(INFO, indent, "testing rule", format_rule(rule))
        if tok in [ c.token for c in rule ]:
            continue
        mn = sys.maxint
        mx = 0
        s = State()
        for c in rule:
            slog(INFO, indent, "testing token", c.token)
            if c.tp == t_grammar and s.update(c.token, 0):
                continue
            if c.tp != t_target_lang:
                slog(INFO, indent, "  token", c.token, "is not a VHDL token")
                continue
            only_optional = False
            # same "found" argument in next call?
            rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
            slog(INFO, indent, "  token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
            if rr == sys.maxint or rr is None:
                slog(INFO, indent, "  got error for token", c.token)
                mn = sys.maxint
                mx = 0
                break
            if rr > mx:
                slog(INFO, indent, "  adjusting mx to", rr)
                mx = rr
            if rr < mn:
                slog(INFO, indent, "  adjusting mn to", rr)
                mn = rr
        if mn == sys.maxint or mx == 0: # unusable as escape route
            slog(INFO, indent, "  unusable as escape route for " + tok + ":", format_rule(rule))
            continue
        slog(INFO, indent, "after checking all rules, mx is", mx)
        if mx < r:
            slog(INFO, indent, "setting return value to max", mx)
            r = mx
    if only_optional:
        slog(INFO, indent, tok, "has only optional rules, accepting")
        r = 0
    if r != sys.maxint:
        r += 1
        slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
        found[tok] = r
    slog(INFO, indent, "returning", r, "for token", tok)
    return r

def grammar_check(grammar, check_symbols = None):
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans   = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals  = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements  = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
                continue
            for rule in p.rules:
                for c in rule:
                    if c.tp == t_grammar:
                        continue
                    elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
            slog(ERR, "No way out for", tok, "in production", p.str())
            exit(1)
        if not tok in grammar.keys():
            slog(ERR, "Token", tok, "has no production")
            exit(1)
        slog(INFO, tok, "->", str(rr))

def grammar_lhss_map(grammar):
    r = dict()
    for t in grammar.keys():
        r[t] = set()
    for t, p in grammar.iteritems():
        for rule in p.rules:
            for c in rule:
                if c.tp == t_target_lang:
                    r[c.token].add(t)
    return r

def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
    lhss = dmap[rhs]
    for lhs in lhss:
        if lhs in buf:
            continue
        buf.add(lhs)
        if lhs in stop:
            slog(INFO, "    symbol", lhs, "is among stop symbols, stopping recursion")
            continue
        if recursive:
            do_grammar_lhss(dmap, stop, lhs, buf, recursive)

def grammar_lhss(dmap, stop, symbols, recursive = False):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_lhss(dmap, stop, s, r, recursive)
    return r

def do_grammar_rhss(grammar, stop, sym, buf):
    p = grammar[sym]
    for rule in p.rules:
        for c in rule:
            if c.tp != t_target_lang:
                continue
            if c.token in stop:
                continue
            if c.token in buf:
                continue
            buf.add(c.token)
            do_grammar_rhss(grammar, stop, c.token, buf)

def grammar_rhss(grammar, stop, symbols):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_rhss(grammar, stop, s, r)
    return r

def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
    if sym in stop:
        return False
    # Does this have to be recursive?
    defined = grammar_lhss(dmap, stop, set([sym]))
    slog(INFO, "    symbol", sym, "defines:", ', '.join(defined))
    if not len(defined):
        return True
    for d in defined:
        if d in stop:
            continue
        if d in checked:
            continue
        checked.add(d)
        if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
            return True
    return False

def do_grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    rhss = grammar_rhss(grammar, set(), doomed)
    for rhs in rhss:
        slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
        if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
            slog(INFO, "  symbol", rhs, "is not in use")
            r.add(rhs)
        slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
    return r

def grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    while True:
        unused = do_grammar_unused(grammar, dmap, r)
        slog(INFO, "unused:", ', '.join(unused))
        slog(INFO, "r:     ", ', '.join(r))
        if unused == r:
            break
        r |= unused
    return r

# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
    slog(INFO, "-------- removing symbols:", ', '.join(symbols))
    dmap = grammar_lhss_map(grammar)
    unused = grammar_unused(grammar, dmap, symbols)
    for s in unused:
        slog(INFO, " + removing symbol", s)
        del grammar[s]
    return grammar

# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
    grammar_cut_symbols(grammar, symbols)
    for s in symbols:
        slog(INFO, " + adding empty production for symbol", s)
        p = Symbol(s)
        p.set_type(p_literal)
        grammar[s] = p

    return grammar

def create_ebnf(grammar):
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
	out += ' ' * indent + ' ;\n'
    return out

def create_yacc(grammar):
    indent = ' ' * 40
    width = 0
    for t, p in grammar.iteritems():
        if p.term is not None:
            continue
        if len(t) > width:
            width = len(t)
    spaces = 0
    while spaces < width:
        spaces += 8
    indent = '\t' * (spaces / 8)

    out = ""

    # preamble
    out += textwrap.dedent("""\
        %{
        #include <stdio.h>
        #include <string.h>
        #include <assert.h>
        #include <stdlib.h>
        #include <stdarg.h>

        #include <vector>
        #include <string>

        #include "include/defs.h"
        #include "include/vhdl2017.h"
        #include "include/lex.vhdl2017.h"
        #include "include/vhdl2017.tab.h"

        using namespace std;
        using namespace v2017;

        namespace {

        typedef vector<const char *> wrap_t;
        const wrap_t curly_braces{ "{", "}" };
        const wrap_t round_braces{ "(", ")" };

        }

        #ifdef __cplusplus
        // extern "C" {
        #endif

        %}

    """)

    # types
    out += textwrap.dedent("""\
        %union {
    """)

    types = grammar_get_types(grammar)
    for t in types.keys():
        out += '\n\tv2017::' +  t + '_t *' + t + ';'
    out += '\n'

    out += textwrap.dedent("""\
        }

    """)

    # yydecl
    out += textwrap.dedent("""\
        %{
        // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
        YY_DECL;
        %}
    """)

    # terminal tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_terminal:
            #out += '%token <String> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'

    # regex tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_literal:
            #out += '%token <String> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' +'\n'

    # types
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_ruleset:
            out += '%type <' + tok2sym(p.token) + '> ' + t + (40 - len(t)) * ' ' + '/* ' + t + ' */' +'\n'

    out += textwrap.dedent("""\

        %define parse.error verbose
        %define api.pure full
        %param { struct context *context } { void *scanner }
    """)

    # productions
    out += '\n%%\n\n'
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        #if p.is_lexical_element is True:
        #    continue
        if len(p.rules) == 0:
            raise Exception("Symbol ", p.str(), "has no rules")
        first = True
        n_rule = 0
        for rule in p.rules:
            n_rule += 1
            n = 0
            s = State()
            if first:
                out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
                first = False
            else:
                out += indent + "| " + format_yacc_rule(rule) + "\n"
            out += indent + "{" + "\n"
            out += indent + "\t" + "$$->type = v2017::" + t + "::t_" + str(n_rule) + ";\n"
            tokens = []
            for c in rule:
                if c.tp == t_target_lang:
                    tokens.append(c.token)
            idx = 0
            for c in rule:
                n += 1
                if c.tp == t_grammar:
                    s.update(c.token, 0)
                    continue
                p = grammar[c.token]
                #if is_terminal(c.token) is not None:
                #    continue
                if p.tp not in [ p_ruleset ]:
                    continue
                tp = tok2name(c.token)
                suffix = ''
                if tokens.count(c.token) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                out += indent + "\t" + \
                    "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
                    " = new " + p.datatype + "(*$" + str(n) + ");\n"
            out += indent + "}" + "\n"
        out += indent + ";\n\n"

    # tail
    out += '\n%%\n\n'

    out += textwrap.dedent("""
        #ifdef __cplusplus
        // } /* extern "C" */
        #endif
    """)

    return out + "\n"

def create_lex(grammar):

    ignore = ""

    out = textwrap.dedent("""\
        %option reentrant
        %option bison-bridge

        %{
        #include <slog.h>

        #include "include/defs.h"
        #include "include/vhdl2017.h"

        // #include "include/lex.vhdl2017.h"
        #include "include/vhdl2017.tab.h"

        using namespace v2017;

        /* This is too late in the Flex generated file to work. Still lots of
         * prototypes are spat into it above it, which end up with C++ linkage, of
         * course, causing the linkages to be inconsistent to the functions below this
         * extern "C". Only way I found was to use C++ is to use it on Bison only, and
         * have Flex use C instead. */
        #ifdef __cplusplus
        // extern "C" {
        #endif

        #ifdef _REMOVE_ME
        static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
        static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
        #endif

        %}

        %%

        \\n { context->line++; }

        """)

    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    out += textwrap.dedent("""\

        %{/* basic_identifier */%}
        %{/* extended_identifier */%}
        %{/* based_integer */%}
        %{/* bit_value */%}
        %{/* numeric_literal */%}
        %{/* enumeration_literal */%}
        %{/* string_literal */%}
        %{/* bit_string_literal */%}
        %{/* character_literal */%}
        %{/* graphic_character */%}
        %{/* basic_character */%}
        %{/* integer */%}

	""")

    ignore += textwrap.dedent("""\

        %{ /* not sure how to handle literals >> */ %}
        \\"[ \\!#-~]*\\" |
        \\'[0-1]\\' {
        	// get_string(yylval_param, yyscanner, 1);
        	/* Gets a string excluding " or ' */
        	int skip = 1;
        	int i;

        	for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
        	yytext[i] = 0;
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt=(char *)malloc(i+1);
        	strcpy(lv->txt, yytext+skip);

        	return STRING;
        }

        #[0-9a-f]*# {
        	// get_based_string(yylval_param, yyscanner, 1); /* skip leading # */
        	/* Gets a string excluding # */
        	int i;
        	int skip = 1;

        	for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
        	yytext[i] = 0;
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt = (char *)malloc(i+1);
        	strcpy(lv->txt, yytext + skip);

        	return BASED;
        }

        [a-zA-Z_$][a-zA-Z0-9_$.]* {
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt=(char *)malloc(strlen(yytext)+1);
        	strcpy(lv->txt, yytext);
        	return NAME;
        }

        [0-9]+ {
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	sscanf(yytext, "%d", &lv->n);
        	return NATURAL;
        }

        """)

    out += textwrap.dedent("""\
        . {
        	return yytext[0];
        }

        %{/* not sure how to handle literals << */%}

        %%

        void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
        {
        	struct yyguts_t *yyg =(struct yyguts_t*)scanner;
        	// vp_log(context->vp, VP_LOG_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->lineno);
        	slog(PRI_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->line);
        }

        int FB_SYM(wrap)(void *scanner)
        {
        	return 1;
        }

        struct vp_scanner {
        	YY_BUFFER_STATE buf;
        	void *scanner;
        	char *str;
        };

        /* utilities which need to be placed here, because I can't find
         * yylex_init() / _destroy() in any generated header file (??) */
        struct vp_scanner *vhdl_default_init_scanner(const char *str)
        {
        	struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));

        	yylex_init(&r->scanner);
        	r->str = strdup(str);
        	r->buf = yy_scan_string(r->str, r->scanner);
        	FB_SYM(set_extra)(r, r->scanner);
        	// yyset_in(stdin, r->scanner);
        	// yyset_out(stdout, r->scanner);
        	return r;
        }

        void *vhdl_default_scanner_get_data(const struct vp_scanner *scanner)
        {
        	return scanner->scanner;
        }

        void vhdl_default_cleanup_scanner(struct vp_scanner *scanner)
        {
        	free(scanner->str);
        	yy_delete_buffer(scanner->buf, scanner->scanner);
        	yylex_destroy(scanner->scanner);
        	free(scanner);
        }

        #ifdef __cplusplus
        // } // extern "C"
        #endif

	""")

    return out

def create_header(grammar, mip, namespace = None):
    out = "#ifndef " + mip + '\n#define ' + mip + '\n\n'
    if namespace is not None:
        out += 'namespace ' + namespace + '{\n\n'

    types = grammar_get_types(grammar)

    # struct forward declarations
    for t, members in types.iteritems():
        if len(members):
            out += '\nstruct ' +  t + ';'
    out += '\n'

    # struct / non-struct typedefs
    for t, members in types.iteritems():
        if not len(members):
            out += '\ntypedef const char ' +  t + '_t;'
            continue
        out += '\ntypedef struct ' +  t + ' ' + t + '_t;'
    out += '\n'

    # struct definitions
    for t, rules in types.iteritems():
        if not len(rules):
            continue
        out += '\n\nstruct ' +  t + ' {\n'

        # rule structs
        n = 0
        for rule in rules:
            n += 1
            idx = 0
            out += '\n\tstruct ' + 'r' + str(n) + '_t {'
            for m in rule:
                suffix = ''
                if rule.count(m) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                p = grammar[m]
                out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
            out += '\n\t};'

        # type enum
        n = 0
        out += '\n\n\tenum {'
        for rule in rules:
            n += 1
            out += '\n\t\tt_' + str(n) + ','
        out += '\n\t} type;'
        out += '\n'

        # data union
        n = 0
        out += '\n\tunion {'
        for rule in rules:
            n += 1
            out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
        out += '\n\t} data;'

        # struct done
        out += '\n};'

    out += '\n'

    if namespace is not None:
        out += '\n} /* namespace ' + namespace + '*/'
    out += '\n#endif /* #ifndef + ' + mip + ' */'

    return out

class GrammarCmd(jwutils.Cmd):

    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        p.add_argument("input", help="input file")
        p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False)
        p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat)
        p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False)
        p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives', action='store_true', default=False)
        p.add_argument('-w', '--replace-whitespace', help='replace white space in tokens by underscore characters', action='store_true', default=False)
        p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='')
        p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='')
        p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='')
        return p

    def processGrammar(self, args, grammar):
        if args.fix_extensions not in fix_extensions_mode:
            raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
        grammar = grammar_fix_extensions(grammar, args.fix_extensions)
        if args.unroll_alternatives:
            grammar = grammar_unroll_alternatives(grammar)
        if args.unroll_lists:
            grammar = grammar_unroll_lists(grammar)
        if args.unroll_options:
            grammar = grammar_unroll_options(grammar)
        if len(args.check_symbols):
            check_symbols = []
            if args.check_symbols == 'all':
                args.check_symbols = ''
            check_symbols = args.check_symbols.split()
            grammar_check(grammar, check_symbols)
        if len(args.trim_symbols):
            grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(','))
        if len(args.cut_symbols):
            grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(','))
        if args.replace_whitespace:
            grammar = grammar_replace_whitespace(grammar)
        return grammar