#!/usr/bin/python
# -*- coding: utf-8 -*-

"""EBNF grammar helpers.

Tokenizes and parses EBNF text into an ordered mapping of ``Symbol``
objects, offers a set of grammar transformations (unrolling of lists,
options and alternatives, symbol pruning, ...) and renders the result as
EBNF, yacc, lex or C/C++ header text.

The module targets Python 2 (``dict.iteritems()``, ``sys.maxint``).
"""

import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod
import os.path

import jwutils
#from jwutils.stree import StringTree, serdes
import jwutils.stree.serdes as serdes
import jwutils.stree.StringTree as StringTree
from jwutils.log import *

# rule component types
t_grammar = "grammar"
t_target_lang = "target"

# symbol (production) types
p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"
p_regex = "regex"

# modes understood by grammar_fix_extensions()
mode_unroll = "unroll"
mode_concat = "concat"
mode_keep = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

c_token = "token"
c_non_terminal = "non-terminal"

member_prefix = ''

# maps the content of a quoted terminal to the name used for its symbol
special_terminals = {
    "`" : "BACKTICK",
    "^" : "CARET",
    "<" : "LT",
    "<<" : "LEFT_SHIFT",
    "<=" : "LTE",
    "<=>" : "SPACE_SHIP",
    "<>" : "NE",
    "=" : "EQ",
    "=>" : "EG",
    ">" : "GT",
    ">=" : "GE",
    ">>" : "RIGHT_SHIFT",
    "|" : "PIPE",
    "_" : "UNDERSCORE",
    "," : "COMMA",
    ";" : "SEMICOLON",
    ":" : "COLON",
    ":=" : "DEFINE",
    "?" : "QM",
    "?<" : "QM_LT",
    "?<=" : "QM_LE",
    "?=" : "QM_EQ",
    "?>" : "QM_GT",
    "?>=" : "QM_GE",
    "??" : "QM_QM",
    "?/=" : "QM_DIV_EQ",
    "/" : "DIV",
    "/=" : "DIV_EQ",
    "." : "DOT",
    "\"" : "DQUOTE",
    "'" : "QUOTE",
    "(" : "LPAREN",
    ")" : "RPAREN",
    "[" : "LBRACKET",
    "]" : "RBRACKET",
    "@" : "AT",
    "*" : "ASTERISK",
    "**" : "DASTERISK",
    "\\" : "BACKSLASH",
    "&" : "AMPERSAND",
    "#" : "NUMBER_SIGN",
    "+" : "PLUS",
    "-" : "MINUS"
}

# tokens which get a hand-written regex instead of re.escape(token)
token_regexes = {
    "PSL_Property_Declaration" : "property[ \t]+[^;]+;",
    "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;",
    "PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;",
    "PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
    "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]


def dump(obj):
    """Log every key / value pair of the mapping obj at INFO priority."""
    for c, v in obj.iteritems():
        slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))


def dump_grammar(prio, grammar):
    """Dump every symbol of grammar with log priority prio."""
    caller = get_caller_pos()
    for t, p in grammar.iteritems():
        p.dump(prio, caller=caller)


def cleanup_token(tok):
    """Strip tok and normalize single-quoted tokens to double quotes.

    Returns None if the stripped token is empty.
    """
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok


def tok2ctype(tok):
    """Return t_grammar for EBNF control tokens, t_target_lang otherwise."""
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]:
        return t_grammar
    return t_target_lang


def is_terminal(tok):
    """Return the content of a quoted token, or None if tok is not quoted.

    Raises an Exception if only one end is quoted or the quote characters
    at both ends differ.
    """
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]


def tok2name(tok):
    """Return a name for tok; quoted terminals lose their quotes or are
    replaced by their special_terminals entry."""
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals:
            return special_terminals[term]
        return term
    return tok


def tok2sym(tok):
    """Return the symbol for tok: "T_<NAME>" for quoted terminals, the
    cleaned-up token itself otherwise."""
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals:
            return "T_" + special_terminals[term].upper()
        return "T_" + re.sub('[^a-zA-Z0-9]', '_', term).upper()
    return tok


def tok2regex(tok):
    """Return the configured regex for tok, or tok regex-escaped."""
    if tok in token_regexes:
        return token_regexes[tok]
    return re.escape(tok)


def format_rule(rule):
    """Return a debug representation of one rule (list of RuleComp)."""
    return ' '.join(c.str() for c in rule)


def format_rules(rules):
    """Return a debug representation of a list of rules."""
    return ', '.join(format_rule(rule) for rule in rules)


def format_ebnf_rule(grammar, rule):
    """Render rule as EBNF text, inserting commas between components."""
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    # NOTE(review): comp is not a grammar token in this
                    # branch, so this test looks like it was meant to be
                    # on last.token -- confirm before changing.
                    if comp.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                    r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()


def format_yacc_rule(rule):
    """Render the target-language components of rule as yacc symbols."""
    r = ''
    for c in rule:
        if c.tp != t_target_lang:
            slog(DEBUG, "ignoring non-target-language token", c.token, "in rule")
            continue
        r += tok2sym(c.token) + ' '
    return r[:-1]


class SourceElement:
    """A token together with the line it was found on."""

    def __init__(self, token, line):
        self.token = token
        self.line = line


class RuleComp:
    """One component of a rule: a token, its type and its source line."""

    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        # the line is deliberately not part of the comparison
        if self.token != rhs.token:
            return False
        if self.tp != rhs.tp:
            return False
        return True

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def str(self):
        """Return "{<type>: <token>}" with type abbreviated to g / l."""
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"


class State:
    """Tracks bracket / comment / special-sequence nesting of EBNF tokens."""

    def __init__(self):
        self.__pair_square = ['[', ']']
        self.__pair_curly = ['{', '}']
        self.__pair_ext = ['<', '>']
        self.__pair_group = ['(', ')']
        self.__pair_comment = ['(*', '*)']
        self.__pair_special = ['?', '?']
        self.reset()

    def reset(self):
        """Forget all nesting and rule state."""
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []
        self.things = []

    def optional(self):
        """Return True while inside square or curly brackets."""
        return self.square != 0 or self.curly != 0

    def _close(self, pair):
        """Pop the innermost open container and check that it is pair.

        The pop is done outside of the assert statement, so that the
        container stack stays correct when assertions are disabled.
        """
        closed = self.things.pop()
        assert closed == pair

    def update(self, tok, line):
        """Account for tok (found in line) and return self.optional().

        Raises an Exception on unbalanced brackets or misplaced comment
        marks, AssertionError if a closer doesn't match the innermost
        open container.
        """
        if not self.in_comment:
            if tok == '[':
                self.square += 1
                self.things.append(self.__pair_square)
            elif tok == ']':
                self.square -= 1
                self._close(self.__pair_square)
            elif tok == '{':
                self.curly += 1
                self.things.append(self.__pair_curly)
            elif tok == '}':
                self.curly -= 1
                self._close(self.__pair_curly)
            elif tok == '(':
                self.group += 1
                self.things.append(self.__pair_group)
            elif tok == ')':
                self.group -= 1
                self._close(self.__pair_group)
            elif tok == '<':
                self.ext += 1
                self.things.append(self.__pair_ext)
            elif tok == '>':
                self.ext -= 1
                self._close(self.__pair_ext)
            elif tok == '?':
                # opener and closer are the same token, so toggle
                if not self.in_special:
                    self.in_special = True
                    self.things.append(self.__pair_special)
                else:
                    self.in_special = False
                    self._close(self.__pair_special)
            elif tok == '(*':
                self.in_comment = True
                self.things.append(self.__pair_comment)
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                self._close(self.__pair_comment)
                self.in_comment = False
        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):
        return self.curly > 0

    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        """Return the [opener, closer] pair of the innermost open
        container, or None if nothing is open."""
        if len(self.things) == 0:
            return None
        return self.things[-1]


class Symbol:
    """A grammar symbol: its token, derived names, type and rules."""

    def __init__(self, token, tp = None, rules = None):
        self.reset(token, tp, rules)
        self.set_is_payload(True)

    def reset(self, token, tp = None, rules = None):
        """(Re-)initialize the symbol from token.

        If tp is None, the type is p_terminal for quoted tokens and
        p_ruleset otherwise.
        """
        if tp is None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)

    def set_is_payload(self, onoff):
        self.is_payload = onoff

    def set_type(self, tp):
        """Set the symbol type and the attributes depending on it.

        Raises an Exception for unknown types, or if a symbol which has
        rules is turned into a special / lexical / regex / terminal
        symbol.
        """
        if tp == p_ruleset:
            self.term = None
            self.regex = None
            self.is_lexical_element = False
            self.datatype = self.token + '_t'
        elif tp == p_literal:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical or tp == p_regex:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = None
            self.is_lexical_element = True
            self.datatype = 'std::string'
        elif tp == p_terminal:
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):
        r = self.name + ' = ' + format_rules(self.rules)
        return r

    def equals(self, rhs):
        """Return True if every attribute of self is present in rhs with
        an equal value; the first difference is logged as a warning."""
        for k, v in self.__dict__.iteritems():
            if (not k in rhs.__dict__) or v != rhs.__dict__[k]:
                # .get(): the attribute may be missing on rhs altogether
                slog(WARNING, k, v, rhs.__dict__.get(k))
                return False
        return True

    def dump(self, prio = NOTICE, msg="", caller=None):
        """Log all attributes of the symbol with priority prio."""
        if caller is None:
            caller = get_caller_pos(1)
        slog(prio, ",----------------", msg, caller=caller)
        slog(prio, "| type =", self.tp, caller=caller)
        slog(prio, "| name =", self.name, caller=caller)
        slog(prio, "| token =", self.token, caller=caller)
        slog(prio, "| sym =", self.sym, caller=caller)
        slog(prio, "| term =", self.term, caller=caller)
        slog(prio, "| regex =", self.regex, caller=caller)
        slog(prio, "| datatype =", self.datatype, caller=caller)
        slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
        slog(prio, "| rules =", format_rules(self.rules), caller=caller)
        slog(prio, "`----------------", msg, caller=caller)


def split_list_by(l_, tok):
    """Split a deep copy of l_ into sublists at elements equal to tok.

    The separators themselves are not part of the result.
    """
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]


def split_list_by_regex(l_, regex):
    """Split a deep copy of l_ into sublists at elements matching regex."""
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]


def grammar_tokenize_ebnf(content):
    """Split EBNF text into a list of (token, line number) tuples.

    Comments enclosed in (* ... *) are skipped, quoted strings are kept
    in one piece, and the single characters ( ) [ ] { } , ; = ? | are
    tokens of their own. Raises an Exception on misplaced comment marks.
    """
    r = []
    l = 0
    in_comment = False
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote is None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                    tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c
        # flush what's left, e.g. on a last line without newline
        tok = tok.strip()
        if len(tok):
            r.append((tok, l))
    return r


def grammar_add_symbol(grammar, tok, rules):
    """Make sure grammar has a symbol for tok and add rules to it.

    Rules already present on an existing symbol are not added twice.
    """
    assert(tok is not None)
    if tok in grammar:
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
    grammar[tok] = s


def grammar_parse_ebnf_tokens(tokens):
    """Build a grammar (OrderedDict token -> Symbol) from the output of
    grammar_tokenize_ebnf().

    Any exception raised while processing a token is logged together
    with the complete token list and re-raised.
    """
    grammar = OrderedDict()
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                # the token before "=" is the left hand side
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and state.in_something() is None:
                # top-level alternative: start a new rule
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        slog(INFO, "found special sequence symbol", s)
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)
    return grammar


def grammar_parse_ebnf(content_):
    """Tokenize and parse EBNF text, return the resulting grammar."""
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar


def grammar_get_types(grammar):
    """Return a dict mapping each symbol with rules (terminals excluded)
    to a list of member lists, one per rule.

    A member list holds the symbols of the rule's target-language
    components which are neither terminals nor flagged as non-payload.
    Raises an Exception if a rule references a token unknown to grammar.
    """
    types = dict()
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        rules = []
        for rule in p.rules:
            members = []
            for c in rule:
                if c.tp != t_target_lang:
                    continue
                if not c.token in grammar:
                    p.dump(ERR)
                    raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
                pp = grammar[c.token]
                if pp.tp == p_terminal:
                    continue
                if not pp.is_payload:
                    continue
                members.append(tok2sym(c.token))
            # rules without members are kept on purpose, the position in
            # the list is the rule number
            rules.append(members)
        if t in types:
            raise Exception("Tried to add type", t, "twice")
        types[t] = rules
    return types


def grammar_fix_extensions(grammar, mode):
    """Process "< prefix >" extensions in all rules according to mode.

    mode is one of fix_extensions_mode; an Exception is raised for any
    other value once a prefix is found.
    """
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            prefix = ""
            paren = 0
            for c in rule:
                if c.tp == t_grammar and c.token in ['<', '>']:
                    if c.token == '<':
                        paren += 1
                    elif c.token == '>':
                        paren -= 1
                    if paren <= 1: # don't add first level of control chars
                        continue
                    newrule.append(c)
                    continue
                if paren > 0:
                    assert(len(c.token) != 0)
                    prefix += '_' + c.token
                    continue
                if len(prefix) > 0:
                    prefix = prefix[1:]
                    slog(INFO, "Found prefix", prefix)
                    if mode == mode_keep:
                        newrule.append(RuleComp('<'))
                        newrule.append(RuleComp(prefix, t_target_lang))
                        newrule.append(RuleComp('>'))
                        newrule.append(c)
                    elif mode == mode_discard:
                        # NOTE(review): this also drops the token following
                        # the prefix, not only the prefix -- confirm intent.
                        prefix = ''
                        continue
                    elif mode in [ mode_unroll, mode_concat ]:
                        combined = RuleComp(c.token, c.tp)
                        combined.token = prefix + c.token
                        prefix = ''
                        newrule.append(combined)
                        slog(INFO, "Appended new rule return value", combined.token)
                        if mode == mode_unroll:
                            if combined.token in grammar:
                                continue
                            grammar[combined.token] = Symbol(combined.token, rules=[[c]])
                    else:
                        raise Exception("Invalid prefix mode", mode)
                    prefix = ''
                    continue
                newrule.append(c)
            if len(prefix): # undigested prefix, since it was the last
                newrule.append(RuleComp(prefix[1:], t_target_lang))
            newrules.append(newrule)
        grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only
    return grammar # TODO: not sure if this is necessary


def grammar_unroll_lists(grammar):
    """Replace every "{ ... }" list in a rule by a reference to a
    generated "list_<names>" symbol and add that symbol to grammar.

    Raises an Exception on empty lists, or if the generated name is
    already taken by a different production.
    """
    delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            listrule = []
            s = State()
            slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
            for c in rule:
                s.update(c.token, c.line)
                if c.token == '{':
                    continue
                if c.token == '}':
                    if len(listrule) == 0:
                        raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule))
                    delpos = []
                    name = "list"
                    for i, item in enumerate(listrule):
                        if item.token in delimiters:
                            delpos.append(i)
                            continue
                        if item.tp != t_target_lang:
                            continue
                        name += "_" + tok2name(item.token)
                    # not really: there are lists without delimiters, too
                    #if len(delpos) != 1:
                    #    p.dump(ERR)
                    #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
                    newrule.append(RuleComp(name, t_target_lang))
                    listrule.insert(0, RuleComp('(', t_grammar))
                    listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration
                    listrule.append(RuleComp(')', t_grammar))
                    list_sym = Symbol(name, rules=[[], listrule])
                    listrule = []
                    if name not in grammar:
                        grammar[name] = list_sym
                        continue
                    if not list_sym.equals(grammar[name]):
                        grammar[name].dump(ERR, "old list production")
                        list_sym.dump(ERR, "new list production")
                        raise Exception("List production expands to already taken name", name)
                    continue
                if s.in_list():
                    listrule.append(c)
                    continue
                newrule.append(c)
            newrules.append(newrule)
        grammar[tok].rules = newrules
    return grammar


def rules_unroll_options(rules):
    """Return rules with every "[ ... ]" option expanded into one rule
    without and one rule with the optional part.

    Per pass, the first option of each rule is expanded; the function
    recurses on the result until no rule was expanded anymore.
    """
    r = []
    found = False
    slog(DEBUG, "unrolling", format_rules(rules))
    for rule in rules:
        square = 0
        option = []
        newrule = []
        expanded = False # per rule, as opposed to found
        for i, c in enumerate(rule):
            if c.tp == t_grammar:
                if c.token == '[':
                    square += 1
                    if square == 1:
                        # only the outermost opener is dropped, nested
                        # brackets stay part of the option
                        continue
                elif c.token == ']':
                    square -= 1
            if square >= 1:
                option.append(c)
                continue
            slog(DEBUG, "square =", square)
            assert(square == 0)
            n = len(option)
            if n == 0:
                newrule.append(c)
                continue
            # c is the closer of the outermost option
            # first without option
            replaced = newrule[:]
            tail = rule[i+1:len(rule)]
            slog(DEBUG, "i = ", i)
            slog(DEBUG, "n = ", n)
            slog(DEBUG, "rule = ", format_rule(rule))
            slog(DEBUG, "tail = ", format_rule(tail))
            slog(DEBUG, ",-------------------------")
            slog(DEBUG, "head = ", format_rule(replaced))
            replaced.extend(tail)
            slog(DEBUG, "head + tail = ", format_rule(replaced))
            r.append(replaced)
            # then with option inserted
            for unrolled in rules_unroll_options([ option ]):
                replaced = newrule[:]
                slog(DEBUG, ",-------------------------")
                slog(DEBUG, "head = ", format_rule(replaced))
                slog(DEBUG, "unrolled = ", format_rule(unrolled))
                replaced.extend(unrolled)
                slog(DEBUG, "head + unrolled =", format_rule(replaced))
                replaced.extend(tail)
                slog(DEBUG, "head + unrolled + tail =", format_rule(replaced))
                r.append(replaced)
            found = True
            expanded = True
            break
        if not expanded:
            # rules without option are passed on as they are, also if an
            # earlier rule of this pass was expanded
            r.append(newrule)
    if found:
        return rules_unroll_options(r)
    return r


def grammar_unroll_options(grammar):
    """Apply rules_unroll_options() to the rules of every symbol."""
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_options(p.rules)
    return grammar


def rules_unroll_alternatives(rules):
    """Return rules with the container around the first "|" of each rule
    replaced by one rule per alternative.

    Rules without "|" are passed on unchanged. Raises an Exception if a
    "|" is found outside of any container or its opener can't be found.
    """
    r = []
    slog(INFO, "unrolling alternatives in", format_rules(rules))
    sep = RuleComp('|')
    for rule in rules:
        if not sep in rule:
            r.append(rule)
            continue
        state = State()
        end = len(rule) - 1
        first = last = -1
        for i, c in enumerate(rule):
            state.update(c.token, line=c.line)
            if c.token != '|' or c.tp != t_grammar:
                slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i)
                continue
            slog(INFO, "found token at position", i)
            container = state.in_something()
            slog(INFO, "thing delimiters are", container)
            if container is None:
                raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule))
            first = last = -1
            # search backwards for the opener ...
            k = i - 1
            while k >= 0:
                prev = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0])
                if prev.token == container[0]:
                    first = k
                    break
                k -= 1
            if first == -1:
                raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule))
            # ... and forwards for the closer
            k = i
            while k <= end:
                nxt = rule[k]
                slog(INFO, "comparing token", rule[k].token, "at position", k, "against closer", container[1])
                if nxt.token == container[1]:
                    last = k
                    break
                k += 1
            if last == i:
                raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule))
            break # found what I wanted
        assert(first >= 0) # a container may well open the rule
        assert(last > 0)
        assert(last <= end)
        head = rule[0:first]
        mid = rule[first+1:last]
        tail = rule[last+1:] # up to and including the last component
        slog(INFO, "first =", first, "last =", last, "end =", end)
        slog(INFO, "head = ", format_rule(head))
        slog(INFO, "mid = ", format_rule(mid))
        slog(INFO, "tail = ", format_rule(tail))
        for m in split_list_by(mid, sep):
            unrolled_rule = head + m + tail
            r.append(unrolled_rule)
    # Only the first container with alternatives is unrolled per rule,
    # recursing on the result is disabled:
    #if found:
    #    return rules_unroll_alternatives(r)
    return r


def grammar_unroll_alternatives(grammar):
    """Apply rules_unroll_alternatives() to the rules of every symbol."""
    for tok, p in grammar.iteritems():
        grammar[tok].rules = rules_unroll_alternatives(p.rules)
    return grammar


def grammar_replace_whitespace(grammar):
    """Return a new grammar with blanks in all tokens replaced by "_".

    The Symbol objects of the passed grammar are reset and reused.
    """
    r = OrderedDict()
    for tok, s in grammar.iteritems():
        newrules = []
        for rule in s.rules:
            newrule = []
            for c in rule:
                newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line)
                newrule.append(newc)
            newrules.append(newrule)
        newtok = tok.replace(' ', '_')
        s.reset(newtok, tp=s.tp, rules=newrules)
        r[newtok] = s
        slog(INFO, "added symbol", newtok)
    return r


def grammar_add_configured_types(grammar, conf):
    """Add a p_regex symbol for every entry below "symbols" in conf.

    conf may be None, in which case grammar is returned unchanged.
    """
    if conf is None:
        return grammar
    symbols = conf.get('symbols')
    if symbols is None:
        return grammar
    for t, c in symbols.iteritems():
        s = Symbol(t)
        s.set_type(p_regex)
        s.regex = c["regex"].value()
        grammar[t] = s
    return grammar


def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    """Return the number of steps tok needs to resolve to terminals or
    orphans, or sys.maxint if there is no way out.

    checked is the set of tokens visited on the current path, found
    caches the results of tokens which are already resolved.
    """
    if checked is None:
        checked = set()
    if found is None:
        found = dict()
    indent = ' ' * depth * 2
    if tok in found:
        slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
        return found[tok]
    slog(INFO, indent + " + " + tok)
    indent = indent + " "
    if tok in terminals:
        found[tok] = 1
        slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
        return 1
    if tok in orphans:
        found[tok] = 1
        slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
        return 1
    #if tok in lexicals:
    #    found[tok] = 1
    #    slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
    #    return 1
    if tok in checked:
        slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
        return sys.maxint
    slog(INFO, indent, "checked =", ' '.join(checked))
    checked.add(tok)
    if tok not in grammar:
        slog(ERR, "tried to validate unknown token \"" + tok + "\"")
        return sys.maxint
    p = grammar[tok]
    r = sys.maxint
    slog(INFO, indent, p.token, "has", len(p.rules), "rules")
    only_optional = True
    for rule in p.rules:
        slog(INFO, indent, "testing rule", format_rule(rule))
        if tok in [ c.token for c in rule ]:
            # directly recursive rules are no escape route
            continue
        mn = sys.maxint
        mx = 0
        s = State()
        for c in rule:
            slog(INFO, indent, "testing token", c.token)
            if c.tp == t_grammar and s.update(c.token, 0):
                continue
            if c.tp != t_target_lang:
                slog(INFO, indent, " token", c.token, "is not a VHDL token")
                continue
            only_optional = False
            # same "found" argument in next call?
            rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
            slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
            if rr == sys.maxint or rr is None:
                slog(INFO, indent, " got error for token", c.token)
                mn = sys.maxint
                mx = 0
                break
            if rr > mx:
                slog(INFO, indent, " adjusting mx to", rr)
                mx = rr
            if rr < mn:
                slog(INFO, indent, " adjusting mn to", rr)
                mn = rr
        if mn == sys.maxint or mx == 0: # unusable as escape route
            slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule))
            continue
        slog(INFO, indent, "after checking all rules, mx is", mx)
        if mx < r:
            slog(INFO, indent, "setting return value to max", mx)
            r = mx
    if only_optional:
        slog(INFO, indent, tok, "has only optional rules, accepting")
        r = 0
    if r != sys.maxint:
        r += 1
        slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
        found[tok] = r
    slog(INFO, indent, "returning", r, "for token", tok)
    return r


def grammar_check(grammar, check_symbols = None):
    """Check that the symbols in check_symbols can be resolved.

    Without check_symbols, all lexical elements and all non-grammar
    tokens used in rules are checked. The process is terminated with
    exit(1) on the first symbol without way out or without production.
    """
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
                continue
            for rule in p.rules:
                for c in rule:
                    if c.tp == t_grammar:
                        continue
                    elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
            slog(ERR, "No way out for", tok)
            exit(1)
        if not tok in grammar:
            slog(ERR, "Token", tok, "has no production")
            exit(1)
        slog(INFO, tok, "->", str(rr))


def grammar_lhss_map(grammar):
    """Return a dict mapping every symbol to the set of symbols which
    use it as target-language component in one of their rules."""
    r = dict()
    for t in grammar.keys():
        r[t] = set()
    for t, p in grammar.iteritems():
        for rule in p.rules:
            for c in rule:
                if c.tp == t_target_lang:
                    r[c.token].add(t)
    return r


def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
    """Add the symbols using rhs (according to dmap) to the set buf.

    With recursive, the users of the users are added as well, except
    for those of symbols in stop.
    """
    lhss = dmap[rhs]
    for lhs in lhss:
        if lhs in buf:
            continue
        buf.add(lhs)
        if lhs in stop:
            slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion")
            continue
        if recursive:
            do_grammar_lhss(dmap, stop, lhs, buf, recursive)


def grammar_lhss(dmap, stop, symbols, recursive = False):
    """Return the set of symbols which use any of symbols."""
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_lhss(dmap, stop, s, r, recursive)
    return r


def do_grammar_rhss(grammar, stop, sym, buf):
    """Recursively add all target-language tokens used by the rules of
    sym to the set buf, not descending into tokens in stop."""
    p = grammar[sym]
    for rule in p.rules:
        for c in rule:
            if c.tp != t_target_lang:
                continue
            if c.token in stop:
                continue
            if c.token in buf:
                continue
            buf.add(c.token)
            do_grammar_rhss(grammar, stop, c.token, buf)


def grammar_rhss(grammar, stop, symbols):
    """Return the set of tokens used, directly or indirectly, by the
    rules of symbols."""
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_rhss(grammar, stop, s, r)
    return r


def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
    """Return True if sym is used by nothing at all, or is (indirectly)
    used by a symbol of which this is true, ignoring symbols in stop."""
    if sym in stop:
        return False
    # Does this have to be recursive?
    defined = grammar_lhss(dmap, stop, set([sym]))
    slog(INFO, " symbol", sym, "defines:", ', '.join(defined))
    if not len(defined):
        return True
    for d in defined:
        if d in stop:
            continue
        if d in checked:
            continue
        checked.add(d)
        if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
            return True
    return False


def do_grammar_unused(grammar, dmap, doomed):
    """Return doomed plus the symbols used by doomed which are not in
    use otherwise."""
    r = set(doomed)
    rhss = grammar_rhss(grammar, set(), doomed)
    for rhs in rhss:
        slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
        if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
            slog(INFO, " symbol", rhs, "is not in use")
            r.add(rhs)
        slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
    return r


def grammar_unused(grammar, dmap, doomed):
    """Repeat do_grammar_unused() until the set of unused symbols
    doesn't change anymore, and return it."""
    r = set(doomed)
    while True:
        unused = do_grammar_unused(grammar, dmap, r)
        slog(INFO, "unused:", ', '.join(unused))
        slog(INFO, "r: ", ', '.join(r))
        if unused == r:
            break
        r |= unused
    return r


# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
    """Delete symbols and everything only they use from grammar."""
    slog(INFO, "-------- removing symbols:", ', '.join(symbols))
    dmap = grammar_lhss_map(grammar)
    unused = grammar_unused(grammar, dmap, symbols)
    for s in unused:
        slog(INFO, " + removing symbol", s)
        del grammar[s]
    return grammar


# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
    """Cut symbols from grammar, then re-add each as a p_literal symbol
    without rules."""
    grammar_cut_symbols(grammar, symbols)
    for s in symbols:
        slog(INFO, " + adding empty production for symbol", s)
        p = Symbol(s)
        p.set_type(p_literal)
        grammar[s] = p
    return grammar


# flag symbols as non-payload
def grammar_irrelevant_symbols(grammar, symbols):
    """Flag every symbol in symbols as non-payload."""
    for s in symbols:
        grammar[s].set_is_payload(False)
    return grammar


def grammar_create_ebnf(grammar, opts):
    """Return grammar rendered as EBNF text; symbols without rules are
    left out. opts is not used."""
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
        out += ' ' * indent + ' ;\n'
    return out


def format_token(sym, tp):
    """Return a yacc %token declaration for sym with tp as comment."""
    # NOTE(review): "misc" is not imported anywhere in the visible part of
    # this file -- presumably jwutils.misc, possibly provided by the star
    # import from jwutils.log; confirm.
    return misc.pad('%token <' + sym + '>', 27) + misc.pad(sym, 20) + '/* ' + tp + ' */'


def grammar_create_y(grammar, opts):
    """Return the text of a yacc grammar file generated from grammar.

    opts is a mapping of which the keys 'config', 'includes',
    'namespace' and 'start' are read.
    """
    width = 0
    for t, p in grammar.iteritems():
        if p.term is not None:
            continue
        if len(t) > width:
            width = len(t)
    # round up to the next tab stop
    spaces = 0
    while spaces < width:
        spaces += 8
    indent = '\t' * (spaces // 8)

    conf = opts['config']
    out = ""

    # preamble
    # NOTE(review): the header names of the #include lines are missing in
    # the source this was restored from and need to be filled in again.
    out += textwrap.dedent("""\
        %{
        #include
        #include
        #include
        #include
        #include
        #include
        #include
        #include
        """)
    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'
    out += "\nusing namespace " + opts['namespace'] + ';\n'
    #out += textwrap.dedent("""\
    #    using namespace std;
    #    namespace {
    #        typedef vector wrap_t;
    #        const wrap_t curly_braces{ "{", "}" };
    #        const wrap_t round_braces{ "(", ")" };
    #    }
    #    #ifdef __cplusplus
    #    // extern "C" {
    #    #endif
    out += textwrap.dedent("""\
        %}
        """)

    # types
    out += textwrap.dedent("""\
        %union {
        """)
    types = grammar_get_types(grammar)
    for t in types.keys():
        s = grammar[t]
        if s.tp == p_regex:
            continue
        out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
    out += '\n'
    out += textwrap.dedent("""\
        }
        """)

    # yydecl
    out += textwrap.dedent("""\
        %{
        // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
        YY_DECL;
        %}
        """)

    # terminal tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_terminal:
            out += format_token(p.sym, t) +'\n'

    # special tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_special:
            if p.token == '?': # TODO: why is this among the symbols anyway?
                continue
            out += format_token(p.sym, t) +'\n'

    # tokens from grammar
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_literal:
            out += format_token(p.sym, t) +'\n'

    # tokens from config
    for t, p in grammar.iteritems():
        if p.tp == p_regex:
            out += format_token(t, "blah") + '\n'

    # types
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_regex:
            continue
        if p.tp == p_ruleset:
            out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n'

    # options
    out += textwrap.dedent("""\
        %define parse.error verbose
        %define api.pure full
        %param { struct context *context } { void *scanner }
        """)
    if opts['start'] is not None:
        out += "%start " + opts['start']

    # productions
    out += '\n%%\n\n'
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.tp == p_terminal:
            continue
        if p.tp == p_special:
            continue
        if p.tp == p_regex:
            continue
        slog(INFO, "creating production for symbol", p.str())
        #if p.is_lexical_element is True:
        #    continue
        if len(p.rules) == 0:
            raise Exception("Symbol ", p.str(), "has no rules")
        first = True
        n_rule = 0
        for rule in p.rules:
            n_rule += 1
            n = 0
            s = State()
            if first:
                out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
                first = False
            else:
                out += indent + "| " + format_yacc_rule(rule) + "\n"
            out += indent + "{" + "\n"
            out += indent + "\t" + 'slog(PRI_NOTICE, "stack size = %d, %d / %d, %d", yyssp - &yyssa[0], yyss - &yyssa[0], yyvsp - &yyvsa[0], yyvs - &yyvsa[0]);\n'
            out += indent + "\t" + "$$ = new " + opts['namespace'] + '::' + t + ";\n"
            out += indent + "\t" + "$$->type = " + opts['namespace'] + '::' + t + "::t_" + str(n_rule) + ";\n"
            tokens = []
            for c in rule:
                if c.tp == t_target_lang:
                    tokens.append(c.token)
            idx = 0
            for c in rule:
                if c.tp == t_grammar:
                    s.update(c.token, 0)
                    continue
                # NOTE(review): tokens holds every target-language token of
                # this rule, so this skips all of them -- confirm intent.
                if c.token in tokens:
                    continue
                n += 1
                member = grammar[c.token]
                #if is_terminal(c.token) is not None:
                #    continue
                if member.tp not in [ p_ruleset ]:
                    continue
                if not member.is_payload:
                    continue
                tp = tok2name(c.token)
                suffix = ''
                if tokens.count(c.token) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                out += indent + "\t" + \
                    "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
                    " = new " + member.datatype + "(*$" + str(n) + ");\n"
            out += indent + "}" + "\n"
        out += indent + ";\n\n"

    # tail
    out += '\n%%\n\n'
    out += textwrap.dedent("""
        #ifdef __cplusplus
        // } /* extern "C" */
        #endif
        """)
    return out + "\n"


def grammar_create_l(grammar, opts):
    """Return the text of a lex scanner file generated from grammar.

    opts is a mapping of which the keys 'config', 'includes' and
    'namespace' are read.
    """
    ignore = ""
    conf = opts['config']
    # NOTE(review): the header name of the #include line is missing in the
    # source this was restored from and needs to be filled in again.
    out = textwrap.dedent("""\
        %option reentrant
        %option bison-bridge

        %{
        #include

        #define YY_USER_ACTION \\
            context->first_line = context->last_line; \\
            context->first_column = context->last_column; \\
            for(int i = 0; yytext[i] != '\\0'; i++) { \\
                if(yytext[i] == '\\n') { \\
                    context->last_line++; \\
                    context->last_column = 0; \\
                } else { \\
                    context->last_column++; \\
                } \\
            }

        """)
    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'
    out += "\nusing namespace " + opts['namespace'] + ';\n'
    #out += textwrap.dedent("""\
    #    /* This is too late in the Flex generated file to work. Still lots of
    #     * prototypes are spat into it above it, which end up with C++ linkage, of
    #     * course, causing the linkages to be inconsistent to the functions below this
    #     * extern "C". Only way I found was to use C++ is to use it on Bison only, and
    #     * have Flex use C instead. */
    #    #ifdef __cplusplus
    #    // extern "C" {
    #    #endif
    #    #ifdef _REMOVE_ME
    #    static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
    #    static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
    #    #endif
    #    %}
    out += textwrap.dedent("""\
        %}

        %%

        """)

    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n'

    for t, p in grammar.iteritems():
        if p.tp == p_regex:
            c = conf['symbols'][t]
            lex_as = c.get('lex_as')
            if lex_as is not None:
                retval = lex_as.value()
            else:
                retval = t
            regex = c['regex'].value()
            out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + t + '"); return ' + retval + '; }\n'

    #out += textwrap.dedent("""\
    #
    #    %{/* basic_identifier */%}
    #    %{/* extended_identifier */%}
    #    %{/* based_integer */%}
    #    %{/* bit_value */%}
    #    %{/* numeric_literal */%}
    #    %{/* enumeration_literal */%}
    #    %{/* string_literal */%}
    #    %{/* bit_string_literal */%}
    #    %{/* character_literal */%}
    #    %{/* graphic_character */%}
    #    %{/* basic_character */%}
    #    %{/* integer */%}
    #
    #    """)

    # collected, but not part of the returned text
    ignore += textwrap.dedent("""\

        %{ /* not sure how to handle literals >> */ %}
        \\"[ \\!#-~]*\\" |
        \\'[0-1]\\' {
            // get_string(yylval_param, yyscanner, 1);
            /* Gets a string excluding " or ' */
            int skip = 1;
            int i;
            for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
            yytext[i] = 0;
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt=(char *)malloc(i+1);
            strcpy(lv->txt, yytext+skip);
            return STRING;
        }

        #[0-9a-f]*# {
            // get_based_string(yylval_param, yyscanner, 1); /* skip leading # */
            /* Gets a string excluding # */
            int i;
            int skip = 1;
            for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
            yytext[i] = 0;
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt = (char *)malloc(i+1);
            strcpy(lv->txt, yytext + skip);
            return BASED;
        }

        [a-zA-Z_$][a-zA-Z0-9_$.]* {
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt=(char *)malloc(strlen(yytext)+1);
            strcpy(lv->txt, yytext);
            return NAME;
        }

        [0-9]+ {
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            sscanf(yytext, "%d", &lv->n);
            return NATURAL;
        }

        """)

    out += textwrap.dedent("""\

        . { slog(PRI_NOTICE, "returning character '%c'", yytext[0]); return yytext[0]; }

        %{/* not sure how to handle literals << */%}

        %%

        void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
        {
            struct yyguts_t *yyg =(struct yyguts_t*)scanner;
            set_error(PRI_ERR, EINVAL, "%s at \\"%s\\" in line %d:%d", msg, yytext, context->last_line, context->last_column);
        }

        int FB_SYM(wrap)(void *scanner)
        {
            return 1;
        }

        struct vp_scanner {
            YY_BUFFER_STATE buf;
            void *scanner;
            char *str;
        };

        /* utilities which need to be placed here, because I can't find
         * yylex_init() / _destroy() in any generated header file (??) */

        struct vp_scanner *FB_SYM(init_scanner)(const char *str)
        {
            struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));

            yylex_init(&r->scanner);
            r->str = strdup(str);
            r->buf = yy_scan_string(r->str, r->scanner);
            FB_SYM(set_extra)(r, r->scanner);
            // yyset_in(stdin, r->scanner);
            // yyset_out(stdout, r->scanner);
            return r;
        }

        void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner)
        {
            return scanner->scanner;
        }

        void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner)
        {
            free(scanner->str);
            yy_delete_buffer(scanner->buf, scanner->scanner);
            yylex_destroy(scanner->scanner);
            free(scanner);
        }

        int FB_SYM(create_ast)(const char *str)
        {
            // TODO: Initialize this in a generated function
            struct context context = {
                first_line: 1,
                last_line: 1,
                first_column: 0,
                last_column: 0
            };

            FB_SYM(debug) = 1;

            struct vp_scanner *scanner = FB_SYM(init_scanner)(str);
            int status = FB_SYM(parse)(&context, FB_SYM(scanner_get_data)(scanner));
            FB_SYM(cleanup_scanner)(scanner);
            if (status) {
                slog(PRI_ERR, "failed to parse (%s)", err());
                return -1;
            }
            return 0;
        }

        """)
    #    #ifdef __cplusplus
    #    // } // extern "C"
    #    #endif
    #
    #    """)
    return out


def grammar_create_h(grammar, opts):
    # Builds the text of a C/C++ header with one struct per type returned by
    # grammar_get_types(). Reads the keys 'mip' and 'namespace' of opts.
    out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
    ns = opts['namespace']
    out += textwrap.dedent("""\

        #define YY_NO_INPUT
        #define YY_NO_UNPUT
        // #define YY_NO_UNISTD_H

        struct context {
            int first_line;
            int last_line;
            int first_column;
            int last_column;
        };

        union YYSTYPE;

        #ifdef __cplusplus
        extern "C" {
        #endif

        struct vp_scanner;

        struct vp_scanner *FB_SYM(init_scanner)(const char *str);
        void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner);
        void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner);
        int FB_SYM(create_ast)(const char *str);
        void FB_SYM(error)(struct context *context, void *scanner, const char *s);

        #ifdef __cplusplus
        } // extern "C"
        #endif

        #define YY_DECL int FB_SYM(lex)(YYSTYPE *yylval_param, struct context *context, void *yyscanner)

        """)

    if ns is not None:
        out += 'namespace ' + ns + '{\n\n'

    types = grammar_get_types(grammar)

    # struct forward declarations
    for t, members in types.iteritems():
        s = grammar[t]
        if s.tp == p_regex:
            continue
        if len(members):
            out += '\nstruct ' + t + ';'
    out += '\n'

    # struct / non-struct typedefs
    for t, members in types.iteritems():
        s = grammar[t]
        if s.tp == p_regex:
            continue
        if not len(members):
            out += '\ntypedef const char ' + t + '_t;'
            continue
        out += '\ntypedef struct ' + t + ' ' + t + '_t;'
    out += '\n'

    # struct definitions
    for t, rules in types.iteritems():
        s = grammar[t]
        if s.tp == p_regex:
            continue
        if not len(rules):
            continue
        out += '\n\nstruct ' + t + ' {\n'

        # rule structs
        n = 0
        for rule in rules:
            n += 1
            idx = 0
            out += '\n\tstruct ' + 'r' + str(n) + '_t {'
            for m in rule:
                suffix = ''
                if rule.count(m) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                ms = grammar[m]
                if ms.tp == p_regex:
                    continue
                p = grammar[m]
                out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
            out += '\n\t};'

        # type enum
        n = 0
        out += '\n\n\tenum {'
        for rule in rules:
            n += 1
            out += '\n\t\tt_' + str(n) + ','
        out += '\n\t} type;'
        out += '\n'

        # data union
        n = 0
        out += '\n\tunion {'
        for rule in rules:
            n += 1
            out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
        out += '\n\t} data;'

        # struct done
        out += '\n};'

    out += '\n'

    if ns is not None:
        out += '\n} /* namespace ' + ns + '*/'

    out += '\n\n#endif /* #ifndef + ' + opts['mip'] + ' */'
    # NOTE(review): the visible part of the source ends here, presumably the
    # function goes on (e.g. by returning out) -- nothing was added.
return out class GrammarCmd(jwutils.Cmd): def __init__(self, name, help): super(GrammarCmd, self).__init__(name, help=help) def add_parser(self, parsers): p = super(GrammarCmd, self).add_parser(parsers) p.add_argument("input", help="input file") p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False) p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat) p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False) p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives', action='store_true', default=False) p.add_argument('-w', '--replace-whitespace', help='replace white space in tokens by underscore characters', action='store_true', default=False) p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='') p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='') p.add_argument('-r', '--irrelevant-symbols', help='exclude symbol from output payload', nargs='?', default='') p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='') p.add_argument('-s', '--start-symbols', help='use start-symbols', nargs='?', default=None) p.add_argument('-f', '--config-file', help='config file', nargs='?', default=None) return p def processGrammar(self, args, grammar): if args.config_file is not None: config = serdes.read(args.config_file) #config.dump(ERR) grammar = grammar_add_configured_types(grammar, config) if args.fix_extensions not in fix_extensions_mode: raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions") grammar = grammar_fix_extensions(grammar, args.fix_extensions) if args.unroll_lists: grammar = grammar_unroll_lists(grammar) if args.unroll_alternatives: grammar = grammar_unroll_alternatives(grammar) if args.unroll_options: grammar = 
grammar_unroll_options(grammar) if len(args.check_symbols): check_symbols = [] if args.check_symbols == 'all': args.check_symbols = '' check_symbols = args.check_symbols.split() grammar_check(grammar, check_symbols) if args.replace_whitespace: grammar = grammar_replace_whitespace(grammar) if len(args.trim_symbols): grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(',')) if len(args.cut_symbols): grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(',')) if len(args.irrelevant_symbols): grammar = grammar_irrelevant_symbols(grammar, args.irrelevant_symbols.split(',')) return grammar # ------------------------------------------------- TODO: clean this up > class DerivedGrammarCmd(GrammarCmd): def __init__(self, name, help): super(DerivedGrammarCmd, self).__init__(name, help=help) @abstractmethod def _run(self, grammar): pass def _parse(self, contents): return grammar_parse_ebnf(contents) def add_parser(self, parsers): p = super(DerivedGrammarCmd, self).add_parser(parsers) return p def run(self, args): with open(args.input, 'r') as infile: contents = infile.read() grammar = self._parse(contents) grammar = super(DerivedGrammarCmd, self).processGrammar(args, grammar) self._run(args, grammar) class CmdCreate(DerivedGrammarCmd): def __init__(self): super(CmdCreate, self).__init__("create", help="Create a file") def add_parser(self, parsers): p = super(CmdCreate, self).add_parser(parsers) p.add_argument("output", help="output file") p.add_argument('--namespace', help='namespace of generated AST', default='parser') p.add_argument('--includes', help='list of header files to be #included in C/C++ implementation files', default='') return p def _run(self, args, grammar): name, ext = os.path.splitext(args.output) ext = ext[1:] #cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output)) mip = None if ext == 'h': mip = args.namespace + re.sub(r'[-./]', '_', args.output).upper() includes = args.includes.split(',') config = None if 
args.config_file is not None: config = serdes.read(args.config_file) # generated code breaks without this, not sure why if ext == 'l': tmp = [] for f in includes: if not re.match('.*lex\..*\.h', f): tmp.append(f) includes = tmp opts = { "namespace" : args.namespace, "includes" : includes, "mip" : mip, "config" : config, "start" : args.start_symbols } cmd = getattr(sys.modules[__name__], 'grammar_create_' + ext) out = cmd(grammar, opts) print(out) class CmdCheck(DerivedGrammarCmd): def __init__(self): super(CmdCheck, self).__init__("check", help="Check grammar") def add_parser(self, parsers): p = super(CmdCheck, self).add_parser(parsers) return p def _run(self, args, grammar): pass