#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod
import os.path

import jwutils
from jwutils.log import *

t_grammar = "grammar"
t_target_lang = "target"

p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"

mode_unroll = "unroll"
mode_concat = "concat"
mode_keep = "keep"
mode_discard = "discard"
fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ]

member_prefix = ''

special_terminals = {
    "`"   : "BACKTICK",
    "^"   : "CARET",
    "<"   : "LT",
    "<<"  : "LEFT_SHIFT",
    "<="  : "LTE",
    "<=>" : "SPACE_SHIP",
    "<>"  : "NE",
    "="   : "EQ",
    "=>"  : "EG",
    ">"   : "GT",
    ">="  : "GE",
    ">>"  : "RIGHT_SHIFT",
    "|"   : "PIPE",
    "_"   : "UNDERSCORE",
    ","   : "COMMA",
    ";"   : "SEMICOLON",
    ":"   : "COLON",
    ":="  : "DEFINE",
    "?"   : "QM",
    "?<"  : "QM_LT",
    "?<=" : "QM_LE",
    "?="  : "QM_EQ",
    "?>"  : "QM_GT",
    "?>=" : "QM_GE",
    "??"  : "QM_QM",
    "?/=" : "QM_DIV_EQ",
    "/"   : "DIV",
    "/="  : "DIV_EQ",
    "."   : "DOT",
    "\""  : "DQUOTE",
    "'"   : "QUOTE",
    "("   : "LPAREN",
    ")"   : "RPAREN",
    "["   : "LBRACKET",
    "]"   : "RBRACKET",
    "@"   : "AT",
    "*"   : "ASTERISK",
    "**"  : "DASTERISK",
    "\\"  : "BACKSLASH",
    "&"   : "AMPERSAND",
    "#"   : "NUMBER_SIGN",
    "+"   : "PLUS",
    "-"   : "MINUS"
}

token_regexes = {
    "PSL_Property_Declaration" : "property[ \t]+[^;]+;",
    "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;",
    "PSL_Clock_Declaration"    : "default[ \t]+clock[ \t]+[^;]+;",
    "PSL_Directive"            : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;",
    "PSL_Verification_Unit"    : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]


def dump(obj):
    for c, v in obj.iteritems():
        slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))


def dump_grammar(prio, grammar):
    caller = get_caller_pos()
    for t, p in grammar.iteritems():
        p.dump(prio, caller=caller)


def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok


def tok2ctype(tok):
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]:
        return t_grammar
    return t_target_lang


def is_terminal(tok):
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]


def tok2name(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
            return special_terminals[term]
        return term
    return tok


def tok2sym(tok):
    tok = cleanup_token(tok)
    term = is_terminal(tok)
    if term is not None:
        if term in special_terminals.keys():
            return "T_" + special_terminals[term].upper()
        return "T_" + term.upper()
    return tok


def tok2regex(tok):
    if tok in token_regexes.keys():
        return token_regexes[tok]
    return re.escape(tok)
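
# Illustrative sketch, not part of the original script: how quoted EBNF
# terminals are expected to travel through the helpers above, assuming the
# special_terminals table.  Roughly:
#
#   tok2name('"<="')       -> "LTE"          tok2sym('"<="')       -> "T_LTE"
#   tok2name("'entity'")   -> "entity"       tok2sym("'entity'")   -> "T_ENTITY"
#   tok2name("identifier") -> "identifier"   (non-terminals pass through)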

def format_rule(rule):
    return ' '.join(c.str() for c in rule)


def format_rules(rules):
    return ', '.join(format_rule(rule) for rule in rules)


def format_ebnf_rule(grammar, rule):
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    if comp.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                    r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()


def format_yacc_rule(rule):
    r = ''
    for c in rule:
        if c.tp != t_target_lang:
            slog(DEBUG, "ignoring non-target-language token", c.token, "in rule")
            continue
        r += tok2sym(c.token) + ' '
    return r[:-1]


class SourceElement:
    def __init__(self, token, line):
        self.token = token
        self.line = line


class RuleComp:
    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        if self.token != rhs.token:
            return False
        if self.tp != rhs.tp:
            return False
        return True

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def str(self):
        tp = 'u'
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"


class State:
    def __init__(self):
        self.__pair_square = ['[', ']']
        self.__pair_curly = ['{', '}']
        self.__pair_ext = ['<', '>']
        self.__pair_group = ['(', ')']
        self.__pair_comment = ['(*', '*)']
        self.__pair_special = ['?', '?']
        self.reset()

    def reset(self):
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []
        self.things = []

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok, line):
        if not self.in_comment:
            if tok == '[':
                self.square += 1
                self.things.append(self.__pair_square)
            elif tok == ']':
                self.square -= 1
                assert(self.things.pop() == self.__pair_square)
            elif tok == '{':
                self.curly += 1
                self.things.append(self.__pair_curly)
            elif tok == '}':
                self.curly -= 1
                assert(self.things.pop() == self.__pair_curly)
            elif tok == '(':
                self.group += 1
                self.things.append(self.__pair_group)
            elif tok == ')':
                self.group -= 1
                assert(self.things.pop() == self.__pair_group)
            elif tok == '<':
                self.ext += 1
                self.things.append(self.__pair_ext)
            elif tok == '>':
                self.ext -= 1
                assert(self.things.pop() == self.__pair_ext)
            elif tok == '?':
                if not self.in_special:
                    self.in_special = True
                    self.things.append(self.__pair_special)
                else:
                    self.in_special = False
                    assert(self.things.pop() == self.__pair_special)
            elif tok == '(*':
                self.in_comment = True
                self.things.append(self.__pair_comment)
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                assert(self.things.pop() == self.__pair_comment)
                self.in_comment = False
        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):
        return self.curly > 0

    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        if len(self.things) == 0:
            return None
        return self.things[-1]
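
# Illustrative sketch, not part of the original script: State.update() is fed
# one token at a time and tracks EBNF nesting.  For the bracketed part of a
# rule such as
#   a = b , [ c , { d } ] ;
# the successive update() return values (i.e. optional()) would be roughly
#   '[' -> True, 'c' -> True, '{' -> True, 'd' -> True, '}' -> True, ']' -> False
# since optional() stays True while any square or curly bracket is open.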

class Symbol:
    def __init__(self, token, tp = None, rules = None):
        self.reset(token, tp, rules)

    def reset(self, token, tp = None, rules = None):
        if tp == None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)

    def set_type(self, tp):
        if tp == p_ruleset:
            self.term = None
            self.regex = None
            self.is_lexical_element = False
            self.datatype = self.token + '_t'
        elif tp == p_literal:
            assert(len(self.rules) == 0)
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = None
            self.is_lexical_element = True
            self.datatype = 'std::string'
        elif tp == p_terminal:
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):
        r = self.name + ' = ' + format_rules(self.rules)
        return r

    def equals(self, rhs):
        for k, v in self.__dict__.iteritems():
            if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]:
                slog(WARNING, k, self.__dict__[k], rhs.__dict__[k])
                return False
        return True

    def dump(self, prio = NOTICE, msg="", caller=None):
        if caller is None:
            caller = get_caller_pos(1)
        slog(prio, ",----------------", msg, caller=caller)
        slog(prio, "| type =", self.tp, caller=caller)
        slog(prio, "| name =", self.name, caller=caller)
        slog(prio, "| token =", self.token, caller=caller)
        slog(prio, "| sym =", self.sym, caller=caller)
        slog(prio, "| term =", self.term, caller=caller)
        slog(prio, "| regex =", self.regex, caller=caller)
        slog(prio, "| datatype =", self.datatype, caller=caller)
        slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller)
        slog(prio, "| rules =", format_rules(self.rules), caller=caller)
        slog(prio, "`----------------", msg, caller=caller)


def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]


def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]


def grammar_tokenize_ebnf(content):
    r = []
    c = ''
    l = 0
    in_comment = False
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote == None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c
        tok = tok.strip()
        if len(tok):
            r.append((tok, l))
    return r


def grammar_add_symbol(grammar, tok, rules):
    assert(tok is not None)
    if tok in grammar.keys():
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
    grammar[tok] = s
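
# Illustrative sketch, not part of the original script: for an input line like
#   entity_declaration = "entity" , identifier , ";" ;
# grammar_tokenize_ebnf() is expected to yield (token, line) pairs roughly like
#   [('entity_declaration', 1), ('=', 1), ('"entity"', 1), (',', 1),
#    ('identifier', 1), (',', 1), ('";"', 1), (';', 1)]
# which grammar_parse_ebnf_tokens() below folds into Symbol entries.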

def grammar_parse_ebnf_tokens(tokens):
    grammar = OrderedDict()
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and state.in_something() is None:
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
            exit(1)
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        slog(INFO, "found special sequence symbol", s)
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)
    return grammar


def grammar_parse_ebnf(content_):
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar


def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.term is not None:
            continue
        ruleno = 1
        rules = []
        for rule in p.rules:
            members = []
            for c in rule:
                if c.tp != t_target_lang:
                    continue
                if not c.token in grammar.keys():
                    p.dump(ERR)
                    raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule))
                pp = grammar[c.token]
                if pp.tp is p_terminal:
                    continue
                members.append(tok2sym(c.token))
            if True or len(members):
                rules.append(members)
        if t in types.keys():
            raise Exception("Tried to add type", t, "twice")
        types[t] = rules
    return types


def grammar_fix_extensions(grammar, mode):
    for tok, p in grammar.iteritems():
        newrules = []
        for rule in p.rules:
            newrule = []
            prefix = ""
            paren = 0
            for c in rule:
                if c.tp == t_grammar and c.token in ['<', '>']:
                    if c.token == '<':
                        paren += 1
                    elif c.token == '>':
                        paren -= 1
                    if paren <= 1:
                        # don't add first level of control chars
                        continue
                    newrule.append(c)
                    continue
                if paren > 0:
                    assert(len(c.token) != 0)
                    prefix += '_' + c.token
                    continue
                if len(prefix) > 0:
                    prefix = prefix[1:]
                    slog(INFO, "Found prefix", prefix)
                    if mode == mode_keep:
                        newrule.append(RuleComp('<'))
                        newrule.append(RuleComp(prefix, t_target_lang))
                        newrule.append(RuleComp('>'))
                        newrule.append(c)
                    elif mode == mode_discard:
                        prefix = ''
                        continue
                    elif mode in [ mode_unroll, mode_concat ]:
                        combined = RuleComp(c.token, c.tp)
                        combined.token = prefix + c.token
                        prefix = ''
                        newrule.append(combined)
                        slog(INFO, "Appended new rule return value", combined.token)
                        if mode == mode_unroll:
                            if combined.token in grammar.keys():
                                continue
                            grammar[combined.token] = Symbol(combined.token, rules=[[c]])
                    else:
                        raise Exception("Invalid prefix mode", mode)
                    prefix = ''
                    continue
                newrule.append(c)
            if len(prefix):
                # undigested prefix, since it was the last
                newrule.append(RuleComp(prefix[1:], t_target_lang))
            newrules.append(newrule)
        grammar[tok].rules = newrules  # TODO: not sure if this could be done on iterator only
    return grammar
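
# Illustrative sketch, not part of the original script: given a rule whose
# components are '<', 'entity', '>', '_simple_name' (however the input EBNF
# happens to produce them), the default "concat" mode of
# grammar_fix_extensions() is expected to collapse them into a single
# component with the token 'entity_simple_name', while "keep" re-emits the
# '<' prefix '>' wrapper in front of the following component.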
"contains empty list:", format_rule(rule)) name = "" delpos = [] for i, rule in enumerate(listrule): if rule.token in delimiters: delpos.append(i) continue if rule.tp != t_target_lang: continue name += tok2name(rule.token) + "_" # not really: there are lists without delimiters, too #if len(delpos) != 1: # p.dump(ERR) # raise Exception("need exactly one delimiter in list rule:", format_rule(listrule)) name = name + "my_list" newrule.append(RuleComp(name, t_target_lang)) p = Symbol(name, rules=[[], listrule]) #p = Symbol(name) #p.rules = [ [], listrule ] listrule = [] if name not in grammar.keys(): grammar[name] = p continue if not p.equals(grammar[name]): p.dump(ERR, "old list production") p.dump(ERR, "new list production") raise Exception("List production expands to already taken name", name) continue if s.in_list(): listrule.append(c) continue newrule.append(c) newrules.append(newrule) grammar[tok].rules = newrules return grammar def rules_unroll_options(rules): r = [] found = False slog(DEBUG, "unrolling", format_rules(rules)) for rule in rules: square = 0 option = [] newrule = [] for i, c in enumerate(rule): if c.tp == t_grammar: if c.token == '[': square += 1 elif c.token == ']': square -= 1 if square == 1: continue if square >= 1: option.append(c) continue slog(DEBUG, "square =", square) assert(square == 0) n = len(option) if n == 0: newrule.append(c) continue # first without option replaced = newrule[:] tail = rule[i+1:len(rule)] slog(DEBUG, "i = ", i) slog(DEBUG, "n = ", n) slog(DEBUG, "rule = ", format_rule(rule)) slog(DEBUG, "tail = ", format_rule(tail)) slog(DEBUG, ",-------------------------") slog(DEBUG, "head = ", format_rule(replaced)) replaced.extend(tail) slog(DEBUG, "head + tail = ", format_rule(replaced)) r.append(replaced) # then with option inserted for unrolled in rules_unroll_options([ option ]): replaced = newrule[:] slog(DEBUG, ",-------------------------") slog(DEBUG, "head = ", format_rule(replaced)) slog(DEBUG, "unrolled = ", format_rule(unrolled)) replaced.extend(unrolled) slog(DEBUG, "head + unrolled =", format_rule(replaced)) replaced.extend(tail) slog(DEBUG, "head + unrolled + tail =", format_rule(replaced)) r.append(replaced) found = True break if not found: r.append(newrule) if found: return rules_unroll_options(r) return r def grammar_unroll_options(grammar): for tok, p in grammar.iteritems(): grammar[tok].rules = rules_unroll_options(p.rules) return grammar def rules_unroll_alternatives(rules): r = [] found = False slog(INFO, "unrolling alternatives in", format_rules(rules)) sep = RuleComp('|') for rule in rules: if not sep in rule: r.append(rule) continue found = True state = State() end = len(rule) - 1 first = last = -1 for i, c in enumerate(rule): state.update(c.token, line=c.line) if c.token != '|' or c.tp != t_grammar: slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i) continue slog(INFO, "found token at position", i) container = state.in_something() slog(INFO, "thing delimiters are", container) if container is None: raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule)) first = last = -1 k = i - 1 while k >= 0: prev = rule[k] slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0]) if prev.token == container[0]: first = k break k -= 1 if first == -1: raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule)) k = i while k <= end: nxt = rule[k] slog(INFO, "comparing token", rule[k].token, "at 
position", k, "against closer", container[1]) if nxt.token == container[1]: last = k break k += 1 if last == i: raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule)) break # found what I wanted assert(first > 0) assert(last > 0) assert(last <= end) head = rule[0:first] mid = rule[first+1:last] tail = rule[last+1:end] slog(INFO, "first =", first, "last =", last, "end =", end) slog(INFO, "head = ", format_rule(head)) slog(INFO, "mid = ", format_rule(mid)) slog(INFO, "tail = ", format_rule(tail)) for m in split_list_by(mid, sep): unrolled_rule = head + m + tail r.append(unrolled_rule) #if found: # return rules_unroll_alternatives(r) return r def grammar_unroll_alternatives(grammar): for tok, p in grammar.iteritems(): grammar[tok].rules = rules_unroll_alternatives(p.rules) return grammar def grammar_replace_whitespace(grammar): r = OrderedDict() for tok, s in grammar.iteritems(): newrules = [] for rule in s.rules: newrule = [] for c in rule: newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line) newrule.append(newc) newrules.append(newrule) newtok = tok.replace(' ', '_') s.reset(newtok, tp=s.tp, rules=newrules) r[newtok] = s slog(INFO, "added symbol", newtok) return r def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None): if checked is None: checked = set() if found is None: found = dict() indent = ' ' * depth * 2 if tok in found.keys(): slog(INFO, indent + " + found cached", tok, "with depth", found[tok]) return found[tok] slog(INFO, indent + " + " + tok) indent = indent + " " if tok in terminals: found[tok] = 1 slog(INFO, indent + " + found terminal", tok, "with depth", found[tok]) return 1 if tok in orphans: found[tok] = 1 slog(INFO, indent + " + found orphan", tok, "with depth", found[tok]) return 1 #if tok in lexicals: # found[tok] = 1 # slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok]) # return 1 if tok in checked: slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked)) return sys.maxint slog(INFO, indent, "checked =", ' '.join(checked)) checked.add(tok) if tok not in grammar.keys(): slog(ERR, "tried to validate unknown token \"" + tok + "\"") return sys.maxint p = grammar[tok] r = sys.maxint slog(INFO, indent, p.token, "has", len(p.rules), "rules") only_optional = True for rule in p.rules: slog(INFO, indent, "testing rule", format_rule(rule)) if tok in [ c.token for c in rule ]: continue mn = sys.maxint mx = 0 s = State() for c in rule: slog(INFO, indent, "testing token", c.token) if c.tp == t_grammar and s.update(c.token, 0): continue if c.tp != t_target_lang: slog(INFO, indent, " token", c.token, "is not a VHDL token") continue only_optional = False # same "found" argument in next call? 

def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    if checked is None:
        checked = set()
    if found is None:
        found = dict()
    indent = ' ' * depth * 2
    if tok in found.keys():
        slog(INFO, indent + " + found cached", tok, "with depth", found[tok])
        return found[tok]
    slog(INFO, indent + " + " + tok)
    indent = indent + " "
    if tok in terminals:
        found[tok] = 1
        slog(INFO, indent + " + found terminal", tok, "with depth", found[tok])
        return 1
    if tok in orphans:
        found[tok] = 1
        slog(INFO, indent + " + found orphan", tok, "with depth", found[tok])
        return 1
    #if tok in lexicals:
    #    found[tok] = 1
    #    slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok])
    #    return 1
    if tok in checked:
        slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked))
        return sys.maxint
    slog(INFO, indent, "checked =", ' '.join(checked))
    checked.add(tok)
    if tok not in grammar.keys():
        slog(ERR, "tried to validate unknown token \"" + tok + "\"")
        return sys.maxint
    p = grammar[tok]
    r = sys.maxint
    slog(INFO, indent, p.token, "has", len(p.rules), "rules")
    only_optional = True
    for rule in p.rules:
        slog(INFO, indent, "testing rule", format_rule(rule))
        if tok in [ c.token for c in rule ]:
            continue
        mn = sys.maxint
        mx = 0
        s = State()
        for c in rule:
            slog(INFO, indent, "testing token", c.token)
            if c.tp == t_grammar and s.update(c.token, 0):
                continue
            if c.tp != t_target_lang:
                slog(INFO, indent, " token", c.token, "is not a VHDL token")
                continue
            only_optional = False
            # same "found" argument in next call?
            rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found)
            slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx)
            if rr == sys.maxint or rr is None:
                slog(INFO, indent, " got error for token", c.token)
                mn = sys.maxint
                mx = 0
                break
            if rr > mx:
                slog(INFO, indent, " adjusting mx to", rr)
                mx = rr
            if rr < mn:
                slog(INFO, indent, " adjusting mn to", rr)
                mn = rr
        if mn == sys.maxint or mx == 0:
            # unusable as escape route
            slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule))
            continue
        slog(INFO, indent, "after checking all rules, mx is", mx)
        if mx < r:
            slog(INFO, indent, "setting return value to max", mx)
            r = mx
    if only_optional:
        slog(INFO, indent, tok, "has only optional rules, accepting")
        r = 0
    if r != sys.maxint:
        r += 1
        slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps")
        found[tok] = r
    slog(INFO, indent, "returning", r, "for token", tok)
    return r


def grammar_check(grammar, check_symbols = None):
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
                continue
            for rule in p.rules:
                for c in rule:
                    if c.tp == t_grammar:
                        continue
                    elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
            slog(ERR, "No way out for", tok)
            exit(1)
        if not tok in grammar.keys():
            slog(ERR, "Token", tok, "has no production")
            exit(1)
        slog(INFO, tok, "->", str(rr))
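
# Illustrative sketch, not part of the original script: step_out() reports how
# many expansion steps a symbol needs before it can reach a terminal (or an
# orphan), with sys.maxint meaning "no way out".  For a toy grammar
#   a = b ;
#   b = "x" ;
# it would return roughly 1 for the terminal '"x"', 2 for b and 3 for a,
# which is what grammar_check() uses to flag symbols that can never escape.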

def grammar_lhss_map(grammar):
    r = dict()
    for t in grammar.keys():
        r[t] = set()
    for t, p in grammar.iteritems():
        for rule in p.rules:
            for c in rule:
                if c.tp == t_target_lang:
                    r[c.token].add(t)
    return r


def do_grammar_lhss(dmap, stop, rhs, buf, recursive):
    lhss = dmap[rhs]
    for lhs in lhss:
        if lhs in buf:
            continue
        buf.add(lhs)
        if lhs in stop:
            slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion")
            continue
        if recursive:
            do_grammar_lhss(dmap, stop, lhs, buf, recursive)


def grammar_lhss(dmap, stop, symbols, recursive = False):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_lhss(dmap, stop, s, r, recursive)
    return r


def do_grammar_rhss(grammar, stop, sym, buf):
    p = grammar[sym]
    for rule in p.rules:
        for c in rule:
            if c.tp != t_target_lang:
                continue
            if c.token in stop:
                continue
            if c.token in buf:
                continue
            buf.add(c.token)
            do_grammar_rhss(grammar, stop, c.token, buf)


def grammar_rhss(grammar, stop, symbols):
    r = set()
    for s in symbols:
        if s in r:
            continue
        do_grammar_rhss(grammar, stop, s, r)
    return r


def grammar_symbol_in_use(grammar, dmap, stop, checked, sym):
    if sym in stop:
        return False
    # Does this have to be recursive?
    defined = grammar_lhss(dmap, stop, set([sym]))
    slog(INFO, " symbol", sym, "defines:", ', '.join(defined))
    if not len(defined):
        return True
    for d in defined:
        if d in stop:
            continue
        if d in checked:
            continue
        checked.add(d)
        if grammar_symbol_in_use(grammar, dmap, stop, checked, d):
            return True
    return False


def do_grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    rhss = grammar_rhss(grammar, set(), doomed)
    for rhs in rhss:
        slog(INFO, "+++ checking if symbol", rhs, "is in use >>")
        if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs):
            slog(INFO, " symbol", rhs, "is not in use")
            r.add(rhs)
        slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<")
    return r


def grammar_unused(grammar, dmap, doomed):
    r = set(doomed)
    while True:
        unused = do_grammar_unused(grammar, dmap, r)
        slog(INFO, "unused:", ', '.join(unused))
        slog(INFO, "r: ", ', '.join(r))
        if unused == r:
            break
        r |= unused
    return r


# eradicate symbols from tree
def grammar_cut_symbols(grammar, symbols):
    slog(INFO, "-------- removing symbols:", ', '.join(symbols))
    dmap = grammar_lhss_map(grammar)
    unused = grammar_unused(grammar, dmap, symbols)
    for s in unused:
        slog(INFO, " + removing symbol", s)
        del grammar[s]
    return grammar


# make symbol an empty literal production
def grammar_trim_symbols(grammar, symbols):
    grammar_cut_symbols(grammar, symbols)
    for s in symbols:
        slog(INFO, " + adding empty production for symbol", s)
        p = Symbol(s)
        p.set_type(p_literal)
        grammar[s] = p
    return grammar


def grammar_create_ebnf(grammar, opts):
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
        out += ' ' * indent + ' ;\n'
    return out
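
# Illustrative sketch, not part of the original script (symbol names made up):
# grammar_create_ebnf() re-serialises the possibly rewritten grammar, one
# production per symbol with the right-hand sides padded to a fixed column,
# roughly
#   some_symbol                              = first_thing , second_thing
#                                            | other_thing
#                                             ;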

def grammar_create_y(grammar, opts):
    indent = ' ' * 40
    width = 0
    for t, p in grammar.iteritems():
        if p.term is not None:
            continue
        if len(t) > width:
            width = len(t)
    spaces = 0
    while spaces < width:
        spaces += 8
    indent = '\t' * (spaces / 8)
    out = ""

    # preamble
    out += textwrap.dedent("""\
        %{
        #include
        #include
        #include
        #include
        #include
        #include
        #include
        """)
    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'
    #include "include/defs.h"
    #include "include/vhdl2017.h"
    #include "include/lex.vhdl2017.h"
    #include "include/vhdl2017.tab.h"
    out += "\nusing namespace " + opts['namespace'] + ';\n'
    out += textwrap.dedent("""\
        using namespace std;

        namespace {
        typedef vector wrap_t;
        const wrap_t curly_braces{ "{", "}" };
        const wrap_t round_braces{ "(", ")" };
        }

        #ifdef __cplusplus
        // extern "C" {
        #endif

        %}
        """)

    # types
    out += textwrap.dedent("""\
        %union {
        """)
    types = grammar_get_types(grammar)
    for t in types.keys():
        out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';'
    out += '\n'
    out += textwrap.dedent("""\
        }
        """)

    # yydecl
    out += textwrap.dedent("""\
        %{
        // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner);
        YY_DECL;
        %}
        """)

    # terminal tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_terminal:
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' + '\n'

    # special tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_special:
            if p.token == '?':
                # TODO: why is this among the symbols anyway?
                continue
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' + '\n'

    # regex tokens
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_literal:
            out += '%token <' + p.sym + '> ' + p.sym + (20 - len(p.sym)) * ' ' + '/* ' + t + ' */' + '\n'

    # types
    out += '\n'
    for t, p in grammar.iteritems():
        if p.tp == p_ruleset:
            out += '%type <' + tok2sym(p.token) + '> ' + t + (40 - len(t)) * ' ' + '/* ' + t + ' */' + '\n'

    out += textwrap.dedent("""\
        %define parse.error verbose
        %define api.pure full
        %param { struct context *context } { void *scanner }
        """)

    # productions
    out += '\n%%\n\n'
    for t, p in grammar.iteritems():
        if not len(p.rules):
            continue
        if p.tp == p_terminal:
            continue
        if p.tp == p_special:
            continue
        slog(INFO, "creating production for symbol", p.str())
        #if p.is_lexical_element is True:
        #    continue
        if len(p.rules) == 0:
            raise Exception("Symbol ", p.str(), "has no rules")
        first = True
        n_rule = 0
        for rule in p.rules:
            n_rule += 1
            n = 0
            s = State()
            if first:
                out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n"
                first = False
            else:
                out += indent + "| " + format_yacc_rule(rule) + "\n"
            out += indent + "{" + "\n"
            out += indent + "\t" + "$$->type = " + opts['namespace'] + '::' + t + "::t_" + str(n_rule) + ";\n"
            tokens = []
            for c in rule:
                if c.tp == t_target_lang:
                    tokens.append(c.token)
            idx = 0
            for c in rule:
                n += 1
                if c.tp == t_grammar:
                    s.update(c.token, 0)
                    continue
                p = grammar[c.token]
                #if is_terminal(c.token) is not None:
                #    continue
                if p.tp not in [ p_ruleset ]:
                    continue
                tp = tok2name(c.token)
                suffix = ''
                if tokens.count(c.token) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                out += indent + "\t" + \
                       "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \
                       " = new " + p.datatype + "(*$" + str(n) + ");\n"
            out += indent + "}" + "\n"
        out += indent + ";\n\n"

    # tail
    out += '\n%%\n\n'
    out += textwrap.dedent("""
        #ifdef __cplusplus
        // } /* extern "C" */
        #endif
        """)
    return out + "\n"
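
# Illustrative sketch, not part of the original script: for a hypothetical
# production
#   foo = "bar" , baz ;
# grammar_create_y() is expected to emit roughly
#   foo:        T_BAR baz
#               {
#                   $$->type = parser::foo::t_1;
#                   $$->data.r1.baz = new baz_t(*$2);
#               }
#               ;
# (member assignments are only generated for components that are themselves
# rulesets; terminals such as T_BAR carry no payload here).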

def grammar_create_l(grammar, opts):
    ignore = ""
    out = textwrap.dedent("""\
        %option reentrant
        %option bison-bridge

        %{
        #include
        """)
    for f in opts['includes']:
        out += '#include "' + f + '"' + '\n'
    #include "include/defs.h"
    #include "include/vhdl2017.h"
    #// #include "include/lex.vhdl2017.h"
    #include "include/vhdl2017.tab.h"
    out += "\nusing namespace " + opts['namespace'] + ';\n'
    out += textwrap.dedent("""\
        /* This is too late in the Flex generated file to work.  Lots of prototypes
         * are still spat into the file above it, and they end up with C++ linkage,
         * which makes the linkage inconsistent with the functions below this
         * extern "C".  The only way around it I found is to use C++ for Bison only
         * and have Flex use plain C instead. */
        #ifdef __cplusplus
        // extern "C" {
        #endif

        #ifdef _REMOVE_ME
        static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
        static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip);
        #endif
        %}

        %%

        \\n { context->line++; }
        """)
    for t, p in grammar.iteritems():
        if p.term is not None:
            # \.  { return T_DOT; }
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    out += textwrap.dedent("""\
        %{/* basic_identifier */%}
        %{/* extended_identifier */%}
        %{/* based_integer */%}
        %{/* bit_value */%}
        %{/* numeric_literal */%}
        %{/* enumeration_literal */%}
        %{/* string_literal */%}
        %{/* bit_string_literal */%}
        %{/* character_literal */%}
        %{/* graphic_character */%}
        %{/* basic_character */%}
        %{/* integer */%}
        """)

    ignore += textwrap.dedent("""\
        %{ /* not sure how to handle literals >> */ %}

        \\"[ \\!#-~]*\\" |
        \\'[0-1]\\' {
            // get_string(yylval_param, yyscanner, 1);
            /* Gets a string excluding " or ' */
            int skip = 1;
            int i;
            for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++);
            yytext[i] = 0;
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt=(char *)malloc(i+1);
            strcpy(lv->txt, yytext+skip);
            return STRING;
        }

        #[0-9a-f]*# {
            // get_based_string(yylval_param, yyscanner, 1);
            /* skip leading # */
            /* Gets a string excluding # */
            int i;
            int skip = 1;
            for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++);
            yytext[i] = 0;
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt = (char *)malloc(i+1);
            strcpy(lv->txt, yytext + skip);
            return BASED;
        }

        [a-zA-Z_$][a-zA-Z0-9_$.]* {
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            lv->txt=(char *)malloc(strlen(yytext)+1);
            strcpy(lv->txt, yytext);
            return NAME;
        }

        [0-9]+ {
            YYSTYPE *lv = FB_SYM(get_lval(yyscanner));
            sscanf(yytext, "%d", &lv->n);
            return NATURAL;
        }
        """)

    out += textwrap.dedent("""\
        . { return yytext[0]; }

        %{/* not sure how to handle literals << */%}

        %%

        void FB_SYM(error)(struct context *context, void *scanner, const char *msg)
        {
            struct yyguts_t *yyg = (struct yyguts_t *)scanner;
            // vp_log(context->vp, VP_LOG_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->lineno);
            slog(PRI_ERR, "%s at \\"%s\\" in line %d.\\n\\n", msg, yytext, context->line);
        }

        int FB_SYM(wrap)(void *scanner)
        {
            return 1;
        }

        struct vp_scanner {
            YY_BUFFER_STATE buf;
            void *scanner;
            char *str;
        };

        /* utilities which need to be placed here, because I can't find
         * yylex_init() / _destroy() in any generated header file (??)
         */
        struct vp_scanner *vhdl_default_init_scanner(const char *str)
        {
            struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r));
            yylex_init(&r->scanner);
            r->str = strdup(str);
            r->buf = yy_scan_string(r->str, r->scanner);
            FB_SYM(set_extra)(r, r->scanner);
            // yyset_in(stdin, r->scanner);
            // yyset_out(stdout, r->scanner);
            return r;
        }

        void *vhdl_default_scanner_get_data(const struct vp_scanner *scanner)
        {
            return scanner->scanner;
        }

        void vhdl_default_cleanup_scanner(struct vp_scanner *scanner)
        {
            free(scanner->str);
            yy_delete_buffer(scanner->buf, scanner->scanner);
            yylex_destroy(scanner->scanner);
            free(scanner);
        }

        #ifdef __cplusplus
        // } // extern "C"
        #endif
        """)
    return out
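
# Illustrative sketch, not part of the original script: each terminal symbol
# becomes a one-line flex rule in the generated .l file, e.g. roughly
#   entity { return T_ENTITY; }
#   \<\=   { return T_LTE; }
# (the pattern is the re.escape()d terminal text, the action returns the
# matching bison token symbol).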

def grammar_create_h(grammar, opts):
    out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n'
    ns = opts['namespace']
    if ns is not None:
        out += 'namespace ' + ns + '{\n\n'
    types = grammar_get_types(grammar)

    # struct forward declarations
    for t, members in types.iteritems():
        if len(members):
            out += '\nstruct ' + t + ';'
    out += '\n'

    # struct / non-struct typedefs
    for t, members in types.iteritems():
        if not len(members):
            out += '\ntypedef const char ' + t + '_t;'
            continue
        out += '\ntypedef struct ' + t + ' ' + t + '_t;'
    out += '\n'

    # struct definitions
    for t, rules in types.iteritems():
        if not len(rules):
            continue
        out += '\n\nstruct ' + t + ' {\n'
        # rule structs
        n = 0
        for rule in rules:
            n += 1
            idx = 0
            out += '\n\tstruct ' + 'r' + str(n) + '_t {'
            for m in rule:
                suffix = ''
                if rule.count(m) > 1:
                    idx += 1
                    suffix = '_' + str(idx)
                p = grammar[m]
                out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';'
            out += '\n\t};'
        # type enum
        n = 0
        out += '\n\n\tenum {'
        for rule in rules:
            n += 1
            out += '\n\t\tt_' + str(n) + ','
        out += '\n\t} type;'
        out += '\n'
        # data union
        n = 0
        out += '\n\tunion {'
        for rule in rules:
            n += 1
            out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';'
        out += '\n\t} data;'
        # struct done
        out += '\n};'
    out += '\n'
    if ns is not None:
        out += '\n} /* namespace ' + ns + '*/'
    out += '\n#endif /* #ifndef ' + opts['mip'] + ' */'
    return out
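
# Illustrative sketch, not part of the original script: for the hypothetical
# production foo = "bar" , baz ; the header generated by grammar_create_h()
# would contain roughly
#   struct foo;
#   typedef struct foo foo_t;
#   struct foo {
#       struct r1_t {
#           baz_t *baz;
#       };
#       enum { t_1, } type;
#       union { struct r1_t r1; } data;
#   };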

class GrammarCmd(jwutils.Cmd):
    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        p.add_argument("input", help="input file")
        p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists',
                       action='store_true', default=False)
        p.add_argument('-e', '--fix-extensions',
                       help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')',
                       default=mode_concat)
        p.add_argument('-o', '--unroll-options', help='unroll EBNF options',
                       action='store_true', default=False)
        p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives',
                       action='store_true', default=False)
        p.add_argument('-w', '--replace-whitespace',
                       help='replace white space in tokens by underscore characters',
                       action='store_true', default=False)
        p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"',
                       nargs='?', default='')
        p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol',
                       nargs='?', default='')
        p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol',
                       nargs='?', default='')
        return p

    def processGrammar(self, args, grammar):
        if args.fix_extensions not in fix_extensions_mode:
            raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions")
        grammar = grammar_fix_extensions(grammar, args.fix_extensions)
        if args.unroll_alternatives:
            grammar = grammar_unroll_alternatives(grammar)
        if args.unroll_lists:
            grammar = grammar_unroll_lists(grammar)
        if args.unroll_options:
            grammar = grammar_unroll_options(grammar)
        if len(args.check_symbols):
            check_symbols = []
            if args.check_symbols == 'all':
                args.check_symbols = ''
            check_symbols = args.check_symbols.split()
            grammar_check(grammar, check_symbols)
        if len(args.trim_symbols):
            grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(','))
        if len(args.cut_symbols):
            grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(','))
        if args.replace_whitespace:
            grammar = grammar_replace_whitespace(grammar)
        return grammar


# ------------------------------------------------- TODO: clean this up >
class DerivedGrammarCmd(GrammarCmd):
    def __init__(self, name, help):
        super(DerivedGrammarCmd, self).__init__(name, help=help)

    @abstractmethod
    def _run(self, args, grammar):
        pass

    def _parse(self, contents):
        return grammar_parse_ebnf(contents)

    def add_parser(self, parsers):
        p = super(DerivedGrammarCmd, self).add_parser(parsers)
        return p

    def run(self, args):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = self._parse(contents)
        grammar = super(DerivedGrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)


class CmdCreate(DerivedGrammarCmd):
    def __init__(self):
        super(CmdCreate, self).__init__("create", help="Create a file")

    def add_parser(self, parsers):
        p = super(CmdCreate, self).add_parser(parsers)
        p.add_argument("output", help="output file")
        p.add_argument('--namespace', help='namespace of generated AST', default='parser')
        p.add_argument('--includes',
                       help='list of header files to be #included in C/C++ implementation files',
                       default='')
        return p

    def _run(self, args, grammar):
        name, ext = os.path.splitext(args.output)
        ext = ext[1:]
        #cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
        mip = None
        if ext == 'h':
            mip = args.namespace + re.sub(r'[-./]', '_', args.output).upper()
        includes = args.includes.split(',')
        # generated code breaks without this, not sure why
        if ext == 'l':
            tmp = []
            for f in includes:
                if not re.match('.*lex\..*\.h', f):
                    tmp.append(f)
            includes = tmp
        cmd = getattr(sys.modules[__name__], 'grammar_create_' + ext)
        opts = {
            "namespace" : args.namespace,
            "includes"  : includes,
            "mip"       : mip
        }
        out = cmd(grammar, opts)
        print(out)


class CmdCheck(DerivedGrammarCmd):
    def __init__(self):
        super(CmdCheck, self).__init__("check", help="Check grammar")

    def add_parser(self, parsers):
        p = super(CmdCheck, self).add_parser(parsers)
        return p

    def _run(self, args, grammar):
        pass
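
# Illustrative usage sketch, not part of the original script: assuming the
# jwutils.Cmd framework registers these classes as argparse sub-commands, an
# invocation would look roughly like (the script name here is hypothetical)
#
#   ./grammar_tool.py create vhdl.ebnf vhdl.y --namespace parser
#
# where CmdCreate picks grammar_create_y() from the extension of the output
# argument and prints the generated file to stdout.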