jw-python/tools/python/jwutils/grammar.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod
import os.path

import jwutils

from jwutils.log import *

t_grammar      = "grammar"
t_target_lang  = "target"

p_ruleset  = "ruleset"
p_terminal = "term"
p_literal  = "literal"
p_lexical  = "lexical"
p_special  = "special"

mode_unroll  = "unroll"
mode_concat  = "concat"
mode_keep    = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

member_prefix = ''

special_terminals = {
    "`"   : "BACKTICK",
    "^"   : "CARET",
    "<"   : "LT",
    "<<"  : "LEFT_SHIFT",
    "<="  : "LTE",
    "<=>" : "SPACE_SHIP",
    "<>"  : "NE",
    "="   : "EQ",
    "=>"  : "EG",
    ">"   : "GT",
    ">="  : "GE",
    ">>"  : "RIGHT_SHIFT",
    "|"   : "PIPE",
    "_"   : "UNDERSCORE",
    ","   : "COMMA",
    ";"   : "SEMICOLON",
    ":"   : "COLON",
    ":="  : "DEFINE",
    "?"   : "QM",
    "?<"  : "QM_LT",
    "?<=" : "QM_LE",
    "?="  : "QM_EQ",
    "?>"  : "QM_GT",
    "?>=" : "QM_GE",
    "??"  : "QM_QM",
    "?/=" : "QM_DIV_EQ",
    "/"   : "DIV",
    "/="  : "DIV_EQ",
    "."   : "DOT",
    "\""  : "DQUOTE",
    "'"   : "QUOTE",
    "("   : "LPAREN",
    ")"   : "RPAREN",
    "["   : "LBRACKET",
    "]"   : "RBRACKET",
    "@"   : "AT",
    "*"   : "ASTERISK",
    "**"  : "DASTERISK",
    "\\"  : "BACKSLASH",
    "&"   : "AMPERSAND",
    "#"   : "NUMBER_SIGN",
    "+"   : "PLUS",
    "-"   : "MINUS"
}

token_regexes = {
    "PSL_Property_Declaration"   : "property[ \t]+[^;]+;",
    "PSL_Sequence_Declaration"   : "sequence[ \t]+[^;]+;",
    "PSL_Clock_Declaration"      : "default[ \t]+clock[ \t]+[^;]+;",
    "PSL_Directive"              : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
    "PSL_Verification_Unit"      : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]

def dump(obj):
  for c, v in obj.iteritems():
    slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))

def dump_grammar(prio, grammar):
    caller = get_caller_pos()
    for t, p in grammar.iteritems():
        p.dump(prio, caller=caller)

def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok

def tok2ctype(tok):
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]:
        return t_grammar
    return t_target_lang

def is_terminal(tok):
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]

def tok2name(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
           return special_terminals[term]
        return term
    return tok

def tok2sym(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
           return "T_" + special_terminals[term].upper()
        return "T_" + re.sub('[^a-zA-Z0-9]', '_', term).upper()
    return tok

def tok2regex(tok):
    if tok in token_regexes.keys():
        return token_regexes[tok]
    return re.escape(tok)

def format_rule(rule):
    return ' '.join(c.str() for c in rule)

def format_rules(rules):
    return ', '.join(format_rule(rule) for rule in rules)

def format_ebnf_rule(grammar, rule):
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    if comp.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                        r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()

def format_yacc_rule(rule):
    r = ''
    for c in rule:
        if c.tp != t_target_lang:
            slog(DEBUG, "ignoring non-target-language token", c.token, "in rule")
            continue
        r += tok2sym(c.token) + ' '
    return r[:-1]

class SourceElement:

    def __init__(self, token, line):
        self.token = token
        self.line = line

class RuleComp:

    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        if self.token != rhs.token:
            return False
        if self.tp != rhs.tp:
            return False
        return True

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def str(self):
        tp = 'u'
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"

class State:

    def __init__(self):
        self.__pair_square = ['[', ']']
        self.__pair_curly = ['{', '}']
        self.__pair_ext = ['<', '>']
        self.__pair_group = ['(', ')']
        self.__pair_comment = ['(*', '*)']
        self.__pair_special = ['?', '?']
        self.reset()

    def reset(self):
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []
        self.things = []

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok, line):
        if not self.in_comment:
            if tok == '[':
                self.square += 1
                self.things.append(self.__pair_square)
            elif tok == ']':
                self.square -= 1
                assert(self.things.pop() == self.__pair_square)
            elif tok == '{':
                self.curly += 1
                self.things.append(self.__pair_curly)
            elif tok == '}':
                self.curly -= 1
                assert(self.things.pop() == self.__pair_curly)
            elif tok == '(':
                self.group += 1
                self.things.append(self.__pair_group)
            elif tok == ')':
                self.group -= 1
                assert(self.things.pop() == self.__pair_group)
            elif tok == '<':
                self.ext += 1
                self.things.append(self.__pair_ext)
            elif tok == '>':
                self.ext -= 1
                assert(self.things.pop() == self.__pair_ext)
            elif tok == '?':
                if not self.in_special:
                    self.in_special = True
                    self.things.append(self.__pair_special)
                else:
                    self.in_special = False
                    assert(self.things.pop() == self.__pair_special)
            elif tok == '(*':
                self.in_comment = True
                self.things.append(self.__pair_comment)
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                assert(self.things.pop() == self.__pair_comment)
                self.in_comment = False

        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):
        return self.curly > 0

    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        if len(self.things) == 0:
            return None
        return self.things[-1]

class Symbol:

    def __init__(self, token, tp = None, rules = None):
        self.reset(token, tp, rules)
        self.set_is_payload(True)

    def reset(self, token, tp = None, rules = None):
        if tp == None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)

    def set_is_payload(self, onoff):
        self.is_payload = onoff

    def set_type(self, tp):
        if tp == p_ruleset:
            self.term = None
            self.regex = None
            self.is_lexical_element = False
            self.datatype = self.token + '_t'
        elif tp == p_literal:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = None
            self.is_lexical_element = True
            self.datatype = 'std::string'
        elif tp == p_terminal:
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):
        r = self.name + ' = ' + format_rules(self.rules)
        return r

    def equals(self, rhs):
        for k, v in self.__dict__.iteritems():
            if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]:
                slog(WARNING, k, self.__dict__[k], rhs.__dict__[k])
                return False
        return True

    def dump(self, prio = NOTICE, msg="", caller=None):
        if caller is None:
            caller = get_caller_pos(1)
        slog(prio, ",----------------", msg, caller=caller)
        slog(prio, "| type          =", self.tp, caller=caller)
        slog(prio, "| name          =", self.name, caller=caller)
        slog(prio, "| token         =", self.token, caller=caller)
        slog(prio, "| sym           =", self.sym, caller=caller)
        slog(prio, "| term          =", self.term, caller=caller)
        slog(prio, "| regex         =", self.regex, caller=caller)
        slog(prio, "| datatype      =", self.datatype, caller=caller)
        slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
        slog(prio, "| rules         =", format_rules(self.rules), caller=caller)
        slog(prio, "`----------------", msg, caller=caller)

def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]

def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]

def grammar_tokenize_ebnf(content):
    r = []
    c = ''
    l = 0
    in_comment = False
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote == None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c

    tok = tok.strip()
    if len(tok):
        r.append((tok, l))
    return r

def grammar_add_symbol(grammar, tok, rules):
    assert(tok is not None)
    if tok in grammar.keys():
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
        grammar[tok] = s

def grammar_parse_ebnf_tokens(tokens):
    grammar = OrderedDict()
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and state.in_something() is None:
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
            exit(1)
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        slog(INFO, "found special sequence symbol", s)
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)

    return grammar

def grammar_parse_ebnf(content_):
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar

def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        ruleno = 1
        rules = []
        for rule in p.rules:
            members = []
            for c in rule:
                if c.tp != t_target_lang:
                    continue
                if not c.token in grammar.keys():
                    p.dump(ERR)
                    raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
                pp = grammar[c.token]
                if pp.tp is p_terminal:
                    continue
                if not pp.is_payload:
                    continue
                members.append(tok2sym(c.token))
            if True or len(members):
                rules.append(members)
        if t in types.keys():
            raise Exception("Tried to add type", t, "twice")
        types[t] = rules
    return types

def grammar_fix_extensions(grammar, mode):
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            prefix = ""
            paren = 0
            for c in rule:
                if c.tp == t_grammar and c.token in ['<', '>']:
                    if c.token == '<':
                        paren += 1
                    elif c.token == '>':
                        paren -= 1
                    if paren <= 1: # don't add first level of control chars
                        continue
                    newrule.append(c)
                    continue
                if paren > 0:
                    assert(len(c.token) != 0)
                    prefix += '_' + c.token
                    continue
                if len(prefix) > 0:
                    prefix = prefix[1:]
                    slog(INFO, "Found prefix", prefix)
                    if mode == mode_keep:
                        newrule.append(RuleComp('<'))
                        newrule.append(RuleComp(prefix, t_target_lang))
                        newrule.append(RuleComp('>'))
                        newrule.append(c)
                    elif mode == mode_discard:
                        prefix = ''
                        continue
                    elif mode in [ mode_unroll, mode_concat ]:
                        combined = RuleComp(c.token, c.tp)
                        combined.token = prefix + c.token
                        prefix = ''
                        newrule.append(combined)
                        slog(INFO, "Appended new rule return value", combined.token)
                        if mode == mode_unroll:
                            if combined.token in grammar.keys():
                                continue
                            grammar[combined.token] = Symbol(combined.token, rules=[[c]])
                    else:
                        raise Exception("Invalid prefix mode", mode)
                    prefix = ''
                    continue
                newrule.append(c)
            if len(prefix): # undigested prefix, since it was the last
                newrule.append(RuleComp(prefix[1:], t_target_lang))
            newrules.append(newrule)
        grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only
    return grammar # TODO: not sure if this is necessary

def grammar_unroll_lists(grammar):
    delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            listrule = []
            prefix = None
            s = State()
            slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
            for c in rule:
                s.update(c.token, c.line)
                if c.token == '{':
                    continue
                if c.token == '}':
                    if len(listrule) == 0:
                        raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
                    name = ""
                    delpos = []
                    for i, rule in enumerate(listrule):
                        if rule.token in delimiters:
                            delpos.append(i)
                            continue
                        if rule.tp != t_target_lang:
                            continue
                        name += tok2name(rule.token) + "_"

                    # not really: there are lists without delimiters, too
                    #if len(delpos) != 1:
                    #    p.dump(ERR)
                    #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))

                    name = name + "my_list"
                    newrule.append(RuleComp(name, t_target_lang))
                    p = Symbol(name, rules=[[], listrule])
                    #p = Symbol(name)
                    #p.rules = [ [], listrule ]
                    listrule = []
                    if name not in grammar.keys():
                        grammar[name] = p
                        continue
                    if not p.equals(grammar[name]):
                        p.dump(ERR, "old list production")
                        p.dump(ERR, "new list production")
                        raise Exception("List production expands to already taken name", name)
                    continue
                if s.in_list():
                    listrule.append(c)
                    continue
                newrule.append(c)
            newrules.append(newrule)
        grammar[tok].rules = newrules
    return grammar

def rules_unroll_options(rules):
    r = []
    found = False
    slog(DEBUG, "unrolling", format_rules(rules))
    for rule in rules:
        square = 0
        option = []
        newrule = []
        for i, c in enumerate(rule):
            if c.tp == t_grammar:
                if c.token == '[':
                    square += 1
                elif c.token == ']':
                    square -= 1
                if square == 1:
                    continue
            if square >= 1:
                option.append(c)
                continue
            slog(DEBUG, "square =", square)
            assert(square == 0)
            n = len(option)
            if n == 0:
                newrule.append(c)
                continue
            # first without option
            replaced = newrule[:]
            tail = rule[i+1:len(rule)]
            slog(DEBUG, "i                      = ", i)
            slog(DEBUG, "n                      = ", n)
            slog(DEBUG, "rule                   = ", format_rule(rule))
            slog(DEBUG, "tail                   = ", format_rule(tail))
            slog(DEBUG, ",-------------------------")
            slog(DEBUG, "head                   = ", format_rule(replaced))
            replaced.extend(tail)
            slog(DEBUG, "head + tail            = ", format_rule(replaced))
            r.append(replaced)
            # then with option inserted
            for unrolled in rules_unroll_options([ option ]):
                replaced = newrule[:]
                slog(DEBUG, ",-------------------------")
                slog(DEBUG, "head                   = ", format_rule(replaced))
                slog(DEBUG, "unrolled               = ", format_rule(unrolled))
                replaced.extend(unrolled)
                slog(DEBUG, "head + unrolled        =", format_rule(replaced))
                replaced.extend(tail)
                slog(DEBUG, "head + unrolled + tail =", format_rule(replaced))
                r.append(replaced)
            found = True
            break
        if not found:
            r.append(newrule)
    if found:
        return rules_unroll_options(r)
    return r

def grammar_unroll_options(grammar):
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_options(p.rules)
    return grammar

def rules_unroll_alternatives(rules):
    r = []
    found = False
    slog(INFO, "unrolling alternatives in", format_rules(rules))
    sep = RuleComp('|')
    for rule in rules:
        if not sep in rule:
            r.append(rule)
            continue
        found = True
        state = State()
        end = len(rule) - 1
        first = last = -1
        for i, c in enumerate(rule):
            state.update(c.token, line=c.line)
            if c.token != '|' or c.tp != t_grammar:
                slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i)
                continue
            slog(INFO, "found token at position", i)
            container = state.in_something()
            slog(INFO, "thing delimiters are", container)
            if container is None:
                raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule))
            first = last = -1
            k = i - 1
            while k >= 0:
                prev = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0])
                if prev.token == container[0]:
                    first = k
                    break
                k -= 1
            if first == -1:
                raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule))
            k = i
            while k <= end:
                nxt = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against closer", container[1])
                if nxt.token == container[1]:
                    last = k
                    break
                k += 1
            if last == i:
                raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule))
            break # found what I wanted
        assert(first > 0)
        assert(last > 0)
        assert(last <= end)
        head = rule[0:first]
        mid = rule[first+1:last]
        tail = rule[last+1:end]
        slog(INFO, "first =", first, "last =", last, "end =", end)
        slog(INFO, "head = ", format_rule(head))
        slog(INFO, "mid  = ", format_rule(mid))
        slog(INFO, "tail = ", format_rule(tail))
        for m in split_list_by(mid, sep):
            unrolled_rule = head + m + tail
            r.append(unrolled_rule)
    #if found:
    #    return rules_unroll_alternatives(r)
    return r

def grammar_unroll_alternatives(grammar):
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_alternatives(p.rules)
    return grammar

def grammar_replace_whitespace(grammar):
    r = OrderedDict()
    for tok, s in grammar.iteritems():
        newrules = []
        for rule in s.rules:
            newrule = []
            for c in rule:
                newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line)
                newrule.append(newc)
            newrules.append(newrule)
        newtok = tok.replace(' ', '_')
        s.reset(newtok, tp=s.tp, rules=newrules)
        r[newtok] = s
        slog(INFO, "added symbol", newtok)
    return r

def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    if checked is None:
        checked = set()
    if found is None:
        found = dict()
    indent = ' ' * depth * 2
    if tok in found.keys():
        slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
        return found[tok]
    slog(INFO, indent + " + " + tok)
    indent = indent + "  "
    if tok in terminals:
        found[tok] = 1
        slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
        return 1
    if tok in orphans:
        found[tok] = 1
        slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
        return 1
    #if tok in lexicals:
    #    found[tok] = 1
    #    slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
    #    return 1
    if tok in checked:
        slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
        return sys.maxint

    slog(INFO, indent, "checked =", ' '.join(checked))
    checked.add(tok)
    if tok not in grammar.keys():
        slog(ERR, "tried to validate unknown token \"" + tok + "\"")
        return sys.maxint
    p = grammar[tok]
    r = sys.maxint
    slog(INFO, indent, p.token, "has", len(p.rules), "rules")
    only_optional = True
    for rule in p.rules:
        slog(INFO, indent, "testing rule", format_rule(rule))
        if tok in [ c.token for c in rule ]:
            continue
        mn = sys.maxint
        mx = 0
        s = State()
        for c in rule:
            slog(INFO, indent, "testing token", c.token)
            if c.tp == t_grammar and s.update(c.token, 0):
                continue
            if c.tp != t_target_lang:
                slog(INFO, indent, "  token", c.token, "is not a VHDL token")
                continue
            only_optional = False
            # same "found" argument in next call?
            rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
            slog(INFO, indent, "  token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
            if rr == sys.maxint or rr is None:
                slog(INFO, indent, "  got error for token", c.token)
                mn = sys.maxint
                mx = 0
                break
            if rr > mx:
                slog(INFO, indent, "  adjusting mx to", rr)
                mx = rr
            if rr < mn:
                slog(INFO, indent, "  adjusting mn to", rr)
                mn = rr
        if mn == sys.maxint or mx == 0: # unusable as escape route
            slog(INFO, indent, "  unusable as escape route for " + tok + ":", format_rule(rule))
            continue
        slog(INFO, indent, "after checking all rules, mx is", mx)
        if mx < r:
            slog(INFO, indent, "setting return value to max", mx)
            r = mx
    if only_optional:
        slog(INFO, indent, tok, "has only optional rules, accepting")
        r = 0
    if r != sys.maxint:
        r += 1
        slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
        found[tok] = r
    slog(INFO, indent, "returning", r, "for token", tok)
    return r

def grammar_check(grammar, check_symbols = None):
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans   = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals  = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements  = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
                continue
            for rule in p.rules:
                for c in rule:
                    if c.tp == t_grammar:
                        continue
                    elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
            slog(ERR, "No way out for", tok)
            exit(1)
        if not tok in grammar.keys():
            slog(ERR, "Token", tok, "has no production")
            exit(1)
        slog(INFO, tok, "->", str(rr))

def grammar_lhss_map(grammar):
    r = dict()
    for t in grammar.keys():
        r[t] = set()
    for t, p in grammar.iteritems():
        for rule in p.rules:
            for c in rule:
                if c.tp == t_target_lang:
                    r[c.token].add(t)
    return r

def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
    lhss = dmap[rhs]
    for lhs in lhss:
        if lhs in buf:
            continue
        buf.add(lhs)
        if lhs in stop:
            slog(INFO, "    symbol", lhs, "is among stop symbols, stopping recursion")
            continue
        if recursive:
            do_grammar_lhss(dmap, stop, lhs, buf, recursive)

def grammar_lhss(dmap, stop, symbols, recursive = False):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_lhss(dmap, stop, s, r, recursive)
    return r

def do_grammar_rhss(grammar, stop, sym, buf):
    p = grammar[sym]
    for rule in p.rules:
        for c in rule:
            if c.tp != t_target_lang:
                continue
            if c.token in stop:
                continue
            if c.token in buf:
                continue
            buf.add(c.token)
            do_grammar_rhss(grammar, stop, c.token, buf)

def grammar_rhss(grammar, stop, symbols):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_rhss(grammar, stop, s, r)
    return r

def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
    if sym in stop:
        return False
    # Does this have to be recursive?
    defined = grammar_lhss(dmap, stop, set([sym]))
    slog(INFO, "    symbol", sym, "defines:", ', '.join(defined))
    if not len(defined):
        return True
    for d in defined:
        if d in stop:
            continue
        if d in checked:
            continue
        checked.add(d)
        if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
            return True
    return False

def do_grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    rhss = grammar_rhss(grammar, set(), doomed)
    for rhs in rhss:
        slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
        if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
            slog(INFO, "  symbol", rhs, "is not in use")
            r.add(rhs)
        slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
    return r

def grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    while True:
        unused = do_grammar_unused(grammar, dmap, r)
        slog(INFO, "unused:", ', '.join(unused))
        slog(INFO, "r:     ", ', '.join(r))
        if unused == r:
            break
        r |= unused
    return r

# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
    slog(INFO, "-------- removing symbols:", ', '.join(symbols))
    dmap = grammar_lhss_map(grammar)
    unused = grammar_unused(grammar, dmap, symbols)
    for s in unused:
        slog(INFO, " + removing symbol", s)
        del grammar[s]
    return grammar

# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
    grammar_cut_symbols(grammar, symbols)
    for s in symbols:
        slog(INFO, " + adding empty production for symbol", s)
        p = Symbol(s)
        p.set_type(p_literal)
        grammar[s] = p

    return grammar

# flag symbols as non-payload
def grammar_irrelevant_symbols(grammar, symbols):
    for s in symbols:
        grammar[s].set_is_payload(False)

    return grammar
def grammar_create_ebnf(grammar, opts):
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
	out += ' ' * indent + ' ;\n'
    return out

def format_token(sym, tp):
    return misc.pad('%token <' + sym + '>', 27) + misc.pad(sym, 20) + '/* ' + tp + ' */'

def grammar_create_y(grammar, opts):
    indent = ' ' * 40
    width = 0
    for t, p in grammar.iteritems():
        if p.term is not None:
            continue
        if len(t) > width:
            width = len(t)
    spaces = 0
    while spaces < width:
        spaces += 8
    indent = '\t' * (spaces / 8)

    out = ""

    # preamble
    out += textwrap.dedent("""\
        %{
        #include <stdio.h>
        #include <string.h>
        #include <assert.h>
        #include <stdlib.h>
        #include <stdarg.h>

        #include <vector>
        #include <string>

    """)

    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'

    out += "\nusing namespace " + opts['namespace'] + ';\n'

    out += textwrap.dedent("""\
        using namespace std;

        namespace {

        typedef vector<const char *> wrap_t;
        const wrap_t curly_braces{ "{", "}" };
        const wrap_t round_braces{ "(", ")" };

        }

        #ifdef __cplusplus
        // extern "C" {
        #endif

        %}

    """)

    # types
    out += textwrap.dedent("""\
        %union {
    """)

    types = grammar_get_types(grammar)
    for t in types.keys():
        out += '\n\t' + opts['namespace'] + '::' +  t + '_t *' + t + ';'
    out += '\n'

    out += textwrap.dedent("""\
        }

    """)

    # yydecl
    out += textwrap.dedent("""\
        %{
        // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
        YY_DECL;
        %}
    """)

    # terminal tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_terminal:
            out += format_token(p.sym, t) +'\n'

    # special tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_special:
            if p.token == '?': # TODO: why is this among the symbols anyway?
                continue
            out += format_token(p.sym, t) +'\n'

    # regex tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_literal:
            out += format_token(p.sym, t) +'\n'

    # types
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_ruleset:
            out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'

    out += textwrap.dedent("""\

        %define parse.error verbose
        %define api.pure full
        %param { struct context *context } { void *scanner }
    """)

    # productions
    out += '\n%%\n\n'
    for t, p in grammar.iteritems():

        if not len(p.rules):
            continue
        if p.tp == p_terminal:
            continue
        if p.tp == p_special:
            continue
        slog(INFO, "creating production for symbol", p.str())

        #if p.is_lexical_element is True:
        #    continue
        if len(p.rules) == 0:
            raise Exception("Symbol ", p.str(), "has no rules")
        first = True
        n_rule = 0
        for rule in p.rules:
            n_rule += 1
            n = 0
            s = State()
            if first:
                out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
                first = False
            else:
                out += indent + "| " + format_yacc_rule(rule) + "\n"
            out += indent + "{" + "\n"
            out += indent + "\t" + "$$->type = " + opts['namespace'] + '::' + t + "::t_" + str(n_rule) + ";\n"
            tokens = []
            for c in rule:
                if c.tp == t_target_lang:
                    tokens.append(c.token)
            idx = 0
            for c in rule:
                n += 1
                if c.tp == t_grammar:
                    s.update(c.token, 0)
                    continue
                p = grammar[c.token]
                #if is_terminal(c.token) is not None:
                #    continue
                if p.tp not in [ p_ruleset ]:
                    continue
                if not p.is_payload:
                    continue
                tp = tok2name(c.token)
                suffix = ''
                if tokens.count(c.token) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                out += indent + "\t" + \
                    "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
                    " = new " + p.datatype + "(*$" + str(n) + ");\n"
            out += indent + "}" + "\n"
        out += indent + ";\n\n"

    # tail
    out += '\n%%\n\n'

    out += textwrap.dedent("""
        #ifdef __cplusplus
        // } /* extern "C" */
        #endif
    """)

    return out + "\n"

def grammar_create_l(grammar, opts):

    ignore = ""

    out = textwrap.dedent("""\
        %option reentrant
        %option bison-bridge

        %{
        #include <slog.h>

    """)

    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'

    out += "\nusing namespace " + opts['namespace'] + ';\n'

    #out += textwrap.dedent("""\

    #    /* This is too late in the Flex generated file to work. Still lots of
    #     * prototypes are spat into it above it, which end up with C++ linkage, of
    #     * course, causing the linkages to be inconsistent to the functions below this
    #     * extern "C". Only way I found was to use C++ is to use it on Bison only, and
    #     * have Flex use C instead. */
    #    #ifdef __cplusplus
    #    // extern "C" {
    #    #endif

    #    #ifdef _REMOVE_ME
    #    static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
    #    static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
    #    #endif

    #    %}

    out += textwrap.dedent("""\
        %}

        %%

        \\n { context->line++; }

        """)

    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    #out += textwrap.dedent("""\
    #
    #    %{/* basic_identifier */%}
    #    %{/* extended_identifier */%}
    #    %{/* based_integer */%}
    #    %{/* bit_value */%}
    #    %{/* numeric_literal */%}
    #    %{/* enumeration_literal */%}
    #    %{/* string_literal */%}
    #    %{/* bit_string_literal */%}
    #    %{/* character_literal */%}
    #    %{/* graphic_character */%}
    #    %{/* basic_character */%}
    #    %{/* integer */%}
    #
    #	""")

    ignore += textwrap.dedent("""\

        %{ /* not sure how to handle literals >> */ %}
        \\"[ \\!#-~]*\\" |
        \\'[0-1]\\' {
        	// get_string(yylval_param, yyscanner, 1);
        	/* Gets a string excluding " or ' */
        	int skip = 1;
        	int i;

        	for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
        	yytext[i] = 0;
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt=(char *)malloc(i+1);
        	strcpy(lv->txt, yytext+skip);

        	return STRING;
        }

        #[0-9a-f]*# {
        	// get_based_string(yylval_param, yyscanner, 1); /* skip leading # */
        	/* Gets a string excluding # */
        	int i;
        	int skip = 1;

        	for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
        	yytext[i] = 0;
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt = (char *)malloc(i+1);
        	strcpy(lv->txt, yytext + skip);

        	return BASED;
        }

        [a-zA-Z_$][a-zA-Z0-9_$.]* {
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	lv->txt=(char *)malloc(strlen(yytext)+1);
        	strcpy(lv->txt, yytext);
        	return NAME;
        }

        [0-9]+ {
        	YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
        	sscanf(yytext, "%d", &lv->n);
        	return NATURAL;
        }

        """)

    out += textwrap.dedent("""\
        . {
        	return yytext[0];
        }

        %{/* not sure how to handle literals << */%}

        %%

        void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
        {
        	struct yyguts_t *yyg =(struct yyguts_t*)scanner;
        	set_error(PRI_ERR, EINVAL, "%s at \\"%s\\" in line %d", msg, yytext, context->line);
        }

        int FB_SYM(wrap)(void *scanner)
        {
        	return 1;
        }

        struct vp_scanner {
        	YY_BUFFER_STATE buf;
        	void *scanner;
        	char *str;
        };

        /* utilities which need to be placed here, because I can't find
         * yylex_init() / _destroy() in any generated header file (??) */
        struct vp_scanner *FB_SYM(init_scanner)(const char *str)
        {
        	struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));

        	yylex_init(&r->scanner);
        	r->str = strdup(str);
        	r->buf = yy_scan_string(r->str, r->scanner);
        	FB_SYM(set_extra)(r, r->scanner);
        	// yyset_in(stdin, r->scanner);
        	// yyset_out(stdout, r->scanner);
        	return r;
        }

        void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner)
        {
        	return scanner->scanner;
        }

        void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner)
        {
        	free(scanner->str);
        	yy_delete_buffer(scanner->buf, scanner->scanner);
        	yylex_destroy(scanner->scanner);
        	free(scanner);
        }

	""")

    #    #ifdef __cplusplus
    #    // } // extern "C"
    #    #endif
    #
    #	""")

    return out

def grammar_create_h(grammar, opts):
    out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
    ns = opts['namespace']

    if ns is not None:
        out += 'namespace ' + ns + '{\n\n'

    types = grammar_get_types(grammar)

    # struct forward declarations
    for t, members in types.iteritems():
        if len(members):
            out += '\nstruct ' +  t + ';'
    out += '\n'

    # struct / non-struct typedefs
    for t, members in types.iteritems():
        if not len(members):
            out += '\ntypedef const char ' +  t + '_t;'
            continue
        out += '\ntypedef struct ' +  t + ' ' + t + '_t;'
    out += '\n'

    # struct definitions
    for t, rules in types.iteritems():
        if not len(rules):
            continue
        out += '\n\nstruct ' +  t + ' {\n'

        # rule structs
        n = 0
        for rule in rules:
            n += 1
            idx = 0
            out += '\n\tstruct ' + 'r' + str(n) + '_t {'
            for m in rule:
                suffix = ''
                if rule.count(m) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                p = grammar[m]
                out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
            out += '\n\t};'

        # type enum
        n = 0
        out += '\n\n\tenum {'
        for rule in rules:
            n += 1
            out += '\n\t\tt_' + str(n) + ','
        out += '\n\t} type;'
        out += '\n'

        # data union
        n = 0
        out += '\n\tunion {'
        for rule in rules:
            n += 1
            out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
        out += '\n\t} data;'

        # struct done
        out += '\n};'

    out += '\n'

    if ns is not None:
        out += '\n} /* namespace ' + ns + '*/'

    out += textwrap.dedent("""\

        struct vp_scanner;

        struct vp_scanner *FB_SYM(init_scanner)(const char *str);
        void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner);
        void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner);

	""")

    out += '\n\n#endif /* #ifndef + ' + opts['mip'] + ' */'

    return out

class GrammarCmd(jwutils.Cmd):

    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        p.add_argument("input", help="input file")
        p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False)
        p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat)
        p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False)
        p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives', action='store_true', default=False)
        p.add_argument('-w', '--replace-whitespace', help='replace white space in tokens by underscore characters', action='store_true', default=False)
        p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='')
        p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='')
        p.add_argument('-r', '--irrelevant-symbols', help='exclude symbol from output payload', nargs='?', default='')
        p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='')
        return p

    def processGrammar(self, args, grammar):
        if args.fix_extensions not in fix_extensions_mode:
            raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
        grammar = grammar_fix_extensions(grammar, args.fix_extensions)
        if args.unroll_alternatives:
            grammar = grammar_unroll_alternatives(grammar)
        if args.unroll_lists:
            grammar = grammar_unroll_lists(grammar)
        if args.unroll_options:
            grammar = grammar_unroll_options(grammar)
        if len(args.check_symbols):
            check_symbols = []
            if args.check_symbols == 'all':
                args.check_symbols = ''
            check_symbols = args.check_symbols.split()
            grammar_check(grammar, check_symbols)
        if args.replace_whitespace:
            grammar = grammar_replace_whitespace(grammar)
        if len(args.trim_symbols):
            grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(','))
        if len(args.cut_symbols):
            grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(','))
        if len(args.irrelevant_symbols):
            grammar = grammar_irrelevant_symbols(grammar, args.irrelevant_symbols.split(','))
        return grammar

# ------------------------------------------------- TODO: clean this up >

class DerivedGrammarCmd(GrammarCmd):

    def __init__(self, name, help):
        super(DerivedGrammarCmd, self).__init__(name, help=help)

    @abstractmethod
    def _run(self, grammar):
        pass

    def _parse(self, contents):
        return grammar_parse_ebnf(contents)

    def add_parser(self, parsers):
        p = super(DerivedGrammarCmd, self).add_parser(parsers)
        return p

    def run(self, args):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = self._parse(contents)
        grammar = super(DerivedGrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)

class CmdCreate(DerivedGrammarCmd):

    def __init__(self):
        super(CmdCreate, self).__init__("create", help="Create a file")

    def add_parser(self, parsers):
        p = super(CmdCreate, self).add_parser(parsers)
        p.add_argument("output", help="output file")
        p.add_argument('--namespace', help='namespace of generated AST', default='parser')
        p.add_argument('--includes', help='list of header files to be #included in C/C++ implementation files', default='')
        return p

    def _run(self, args, grammar):
        name, ext = os.path.splitext(args.output)[1]
        #cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
        mip = None
        if ext == 'h':
            mip = args.namespace + re.sub(r'[-./]', '_', args.output).upper()

        includes = args.includes.split(',')

        # generated code breaks without this, not sure why
        if ext == 'l':
            tmp = []
            for f in includes:
                if not re.match('.*lex\..*\.h', f):
                    tmp.append(f)
            includes = tmp

        cmd = getattr(sys.modules[__name__], 'grammar_create_' + ext)

        opts = {
            "namespace" : args.namespace,
            "includes"  : includes,
            "mip"       : mip
        }
        out = cmd(grammar, opts)
        print(out)

class CmdCheck(DerivedGrammarCmd):

    def __init__(self):
        super(CmdCheck, self).__init__("check", help="Check grammar")

    def add_parser(self, parsers):
        p = super(CmdCheck, self).add_parser(parsers)
        return p

    def _run(self, args, grammar):
        pass