diff --git a/test/grammar/Makefile b/test/grammar/Makefile
index 7e32548..3799c43 100644
--- a/test/grammar/Makefile
+++ b/test/grammar/Makefile
@@ -1,5 +1,8 @@
 TOPDIR = ../..
 
+USE_PROJECT_LIB = true
+MEMBERS += local.a($(OBJ))
+
 GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h
 
 # These types are meant to be cut off the tree and turned into hand coded flex
diff --git a/test/grammar/generate.py b/test/grammar/generate.py
index e0c2c19..14c0664 100644
--- a/test/grammar/generate.py
+++ b/test/grammar/generate.py
@@ -47,12 +47,6 @@ class GrammarCmd(jwutils.grammar.GrammarCmd):
         with open(args.input, 'r') as infile:
             contents = infile.read()
         grammar = jwutils.grammar.grammar_parse_ebnf(contents)
-
-        slog(INFO, "grammar size is", len(grammar))
-        for t in grammar.keys():
-            slog(INFO, "key =", t)
-        slog(INFO, "grammar size is", len(grammar))
-        jwutils.grammar.dump_grammar(INFO, grammar)
         grammar = super(GrammarCmd, self).processGrammar(args, grammar)
         self._run(args, grammar)
 
diff --git a/test/grammar/grammartest-input.ebnf b/test/grammar/grammartest-input.ebnf
index 4cbb0e9..acbc89e 100644
--- a/test/grammar/grammartest-input.ebnf
+++ b/test/grammar/grammartest-input.ebnf
@@ -5,7 +5,7 @@
     'END.' ;
 identifier = alphabetic character, { alphabetic character | digit } ;
 number = [ "-" ], digit, { digit } ;
-string = '"' , { all characters - '"' }, '"' ;
+string = '"' , { all characters }, '"' ;
 assignment = identifier , ":=" , ( number | identifier | string ) ;
 alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N"
diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py
index fc6ea11..bf4b73f 100644
--- a/tools/python/jwutils/grammar.py
+++ b/tools/python/jwutils/grammar.py
@@ -22,6 +22,7 @@ p_ruleset = "ruleset"
 p_terminal = "term"
 p_literal = "literal"
 p_lexical = "lexical"
+p_special = "special"
 
 mode_unroll = "unroll"
 mode_concat = "concat"
@@ -104,6 +105,11 @@ def cleanup_token(tok):
         tok = '"' + tok[1:-1] + '"'
     return tok
 
+def tok2ctype(tok):
+    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?' ]:
+        return t_grammar
+    return t_target_lang
+
 def is_terminal(tok):
     size = len(tok)
     if size < 2:
@@ -174,13 +180,24 @@ def format_yacc_rule(rule):
         r += tok2sym(c.token) + ' '
     return r[:-1]
 
+class SourceElement:
+
+    def __init__(self, token, line):
+        self.token = token
+        self.line = line
+
 class RuleComp:
 
-    def __init__(self, token, tp):
+    def __init__(self, token, tp = None, line=-1):
+        assert(token is not None)
+        # assert(token != '|')
         self.token = token
+        if tp is None:
+            tp = tok2ctype(token)
         self.tp = tp
         slog(INFO, "creating rule component >" + self.str() + "<")
         assert(token != "{ assignment")
+        self.line = line
 
     def __eq__(self, rhs):
         if self.token != rhs.token:
@@ -205,27 +222,54 @@ class RuleComp:
 class State:
 
     def __init__(self):
-        self.curly = 0
-        self.square = 0
+        self.reset()
 
     def reset(self):
         self.curly = 0
         self.square = 0
+        self.ext = 0
+        self.group = 0
+        self.in_comment = False
+        self.in_special = False
+        self.production = None
+        self.rule = []
+        self.rules = []
 
     def optional(self):
         return self.square != 0 or self.curly != 0
 
-    def update(self, tok):
-        if tok == '[':
-            self.square += 1
-        elif tok == ']':
-            self.square -= 1
-        elif tok == '{':
-            self.curly += 1
-        elif tok == '}':
-            self.curly -= 1
-        if self.curly < 0 or self.square < 0:
-            raise Exception("Unbalanced BNF bracket", tok)
+    def update(self, tok, line):
+        if not self.in_comment:
+            if tok == '[':
+                self.square += 1
+            elif tok == ']':
+                self.square -= 1
+            elif tok == '{':
+                self.curly += 1
+            elif tok == '}':
+                self.curly -= 1
+            elif tok == '(':
+                self.group += 1
+            elif tok == ')':
+                self.group -= 1
+            elif tok == '<':
+                self.ext += 1
+            elif tok == '>':
+                self.ext -= 1
+            elif tok == '?':
+                self.in_special = not self.in_special
+            elif tok == '(*':
+                self.in_comment = True
+            elif tok == '*)':
+                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
+        else:
+            if tok == '(*':
+                raise Exception("Nested EBNF comment", tok, "in line", line)
+            elif tok == '*)':
+                self.in_comment = False
+
+        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
+            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
         return self.optional()
 
     def in_list(self):
@@ -234,9 +278,25 @@ class State:
     def in_option(self):
         return self.square > 0
 
+    def in_group(self):
+        return self.group > 0
+
+    def in_ext(self):
+        return self.ext > 0
+
+    def in_something(self):
+        if self.square > 0 or self.curly > 0 or self.group > 0 or self.ext > 0 or self.in_comment or self.in_special:
+            return True
+        return False
+
 class Symbol:
 
-    def __init__(self, token, tp = p_ruleset, rules = None):
+    def __init__(self, token, tp = None, rules = None):
+        if tp == None:
+            if is_terminal(token) is not None:
+                tp = p_terminal
+            else:
+                tp = p_ruleset
         self.tp = tp
         self.token = token
         self.name = tok2name(token)
@@ -261,22 +321,27 @@ class Symbol:
             self.term = None
             self.regex = tok2regex(self.token)
             self.is_lexical_element = False
-            self.datatype = "std::string"
-        elif tp == p_lexical:
-            assert(len(self.rules) == 0)
+            self.datatype = 'std::string'
+        elif tp == p_special or tp == p_lexical:
+            if len(self.rules):
+                self.dump(ERR)
+                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
             self.term = None
-            self.regex = tok2regex(self.token)
+            self.regex = None
             self.is_lexical_element = True
-            self.datatype = None
+            self.datatype = 'std::string'
         elif tp == p_terminal:
-            assert(len(self.rules) == 0)
+            if len(self.rules):
+                slog(ERR, "rules = ", self.rules)
+                self.dump(ERR)
+                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
             self.term = self.token
             self.regex = tok2regex(self.token)
            self.is_lexical_element = False
             self.datatype = None
         else:
             self.dump()
-            raise Exception("Tried to set production to unknown type", tp)
+            raise Exception("Tried to set symbol to unknown type", tp)
         self.tp = tp
 
     def str(self):
@@ -309,71 +374,133 @@ def split_list_by(l_, tok):
     l = copy.deepcopy(l_)
     return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
 
-
 def split_list_by_regex(l_, regex):
     l = copy.deepcopy(l_)
     return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]
 
-def grammar_parse_ebnf(content_):
-
-    # remove comments
+def grammar_tokenize_ebnf(content):
+    r = []
+    c = ''
+    l = 0
     in_comment = False
-    quoted = None
-    raw_tokens = re.split("([, ])", content_)
-    tokens = []
-    for t in raw_tokens:
-        t = t.strip()
-        if not len(t):
-            continue
-        if quoted:
-            if t == quoted: # FIXME: check backslash before
-                quoted = None
-        elif in_comment:
-            if t == '*)':
-                in_comment = False
-            continue
-        elif t == '(*':
-            in_comment = True
-            continue
-        elif t in [ '"', "'" ]:
-            quoted = t
-        tokens.append(t)
+    in_quote = None
+    for line in content.splitlines(True):
+        end = len(line) - 1
+        l += 1
+        tok = ''
+        p = -1
+        while p < end:
+            p += 1
+            if p < end and in_quote == None:
+                cand = line[p:p+2]
+                if cand == '(*':
+                    if in_comment:
+                        raise Exception("Syntax error in line", l, ": spurious comment closure")
+                    in_comment = True
+                    p += 1
+                    continue
+                elif cand == '*)':
+                    if not in_comment:
+                        raise Exception("Syntax error in line", l, ": spurious comment opener")
+                    in_comment = False
+                    p += 1
+                    continue
+            if in_comment:
+                continue
+            c = line[p]
+            if c in [ '"', "'" ]:
+                if in_quote is None:
+                    in_quote = c
+                else:
+                    if in_quote == c:
+                        in_quote = None
+            if in_quote is not None:
+                tok += c
+                continue
+            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
+                tok = tok.strip()
+                if len(tok):
+                    r.append((tok, l))
+                tok = ''
+                if not c.isspace():
+                    r.append((c, l))
+                continue
+            tok += c
+        tok = tok.strip()
+        if len(tok):
+            r.append((tok, l))
+    return r
+
+def grammar_add_symbol(grammar, tok, rules):
+    assert(tok is not None)
+    if tok in grammar.keys():
+        s = grammar[tok]
+    else:
+        s = Symbol(tok, rules=rules)
+        grammar[tok] = s
+    if rules is not None:
+        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
+        for rule in rules:
+            if not rule in s.rules:
+                s.rules.append(rule)
+    grammar[tok] = s
+
+def grammar_parse_ebnf_tokens(tokens):
     grammar = OrderedDict()
-    raw_productions = split_list_by(tokens, ';')
-    #slog(INFO, "raw_productions =", raw_productions)
-    for raw_production in raw_productions:
-        #slog(INFO, "raw_production =", '@'.join(raw_production))
-        raw_lhs_rhs = split_list_by(raw_production, '=')
-        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
-        assert(len(raw_lhs_rhs) == 2)
-        lhs = ' '.join(raw_lhs_rhs[0])
-        p = Symbol(lhs)
-        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
-        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
-        for raw_rule in raw_rules:
-            slog(INFO, "raw_rule =", raw_rule)
-            rule_tokens = split_list_by_regex(raw_rule, ',{}\(\)\[\]')
-            #slog(INFO, "rule_tokens =", rule_tokens)
-            rule = []
-            for raw_tok in rule_tokens:
-                tok = cleanup_token(' '.join(raw_tok))
-                tp = t_target_lang
-                if is_terminal(tok) is not None:
-                    if not tok in grammar.keys():
-                        litp = Symbol(tok, p_terminal)
-                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
-                        grammar[tok] = litp
-                    tp = t_target_lang
-                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
-                    tp = t_grammar
-                rule.append(RuleComp(tok, tp))
-            p.rules.append(rule)
-        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
-        grammar[lhs] = p
+    state = State()
+    lhs = None
+    last = None
+    ruleset = []
+    rule = []
+    terminals = []
+    specials = []
+    for tok, line in tokens:
+        try:
+            state.update(tok, line)
+            if tok == '=':
+                lhs = last
+                continue
+            last = tok
+            if tok == ';':
+                ruleset.append(rule)
+                grammar_add_symbol(grammar, lhs, ruleset)
+                ruleset = []
+                rule = []
+                lhs = None
+                continue
+            if tok == ',':
+                continue
+            if tok == '|' and not state.in_something():
+                ruleset.append(rule)
+                rule = []
+                continue
+            if is_terminal(tok) and tok not in terminals:
+                terminals.append(tok)
+            elif state.in_special and tok not in specials:
+                specials.append(tok)
+            if lhs is not None:
+                rule.append(RuleComp(tok, line=line))
+        except Exception as err:
+            for t in tokens:
+                slog(ERR, t)
+            slog(ERR, "Unexpected error in line", line, ":", str(err))
+            raise
+            exit(1)
+    for s in terminals:
+        grammar_add_symbol(grammar, s, None)
+        grammar[s].set_type(p_terminal)
+    for s in specials:
+        grammar_add_symbol(grammar, s, None)
+        grammar[s].set_type(p_special)
     return grammar
 
+def grammar_parse_ebnf(content_):
+    tokens = grammar_tokenize_ebnf(content_)
+    grammar = grammar_parse_ebnf_tokens(tokens)
+    return grammar
+
 def grammar_get_types(grammar):
     types = dict()
     for t, p in grammar.iteritems():
@@ -427,9 +554,9 @@ def grammar_fix_extensions(grammar, mode):
                 prefix = prefix[1:]
                 slog(INFO, "Found prefix", prefix)
                 if mode == mode_keep:
-                    newrule.append(RuleComp('<', t_grammar))
+                    newrule.append(RuleComp('<'))
                     newrule.append(RuleComp(prefix, t_target_lang))
-                    newrule.append(RuleComp('>', t_grammar))
+                    newrule.append(RuleComp('>'))
                     newrule.append(c)
                 elif mode == mode_discard:
                     prefix = ''
@@ -464,8 +591,9 @@ def grammar_unroll_lists(grammar):
         listrule = []
         prefix = None
         s = State()
+        slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
         for c in rule:
-            s.update(c.token)
+            s.update(c.token, c.line)
             if c.token == '{':
                 continue
             if c.token == '}':
@@ -614,7 +742,7 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None,
     s = State()
     for c in rule:
         slog(INFO, indent, "testing token", c.token)
-        if c.tp == t_grammar and s.update(c.token):
+        if c.tp == t_grammar and s.update(c.token, 0):
            continue
         if c.tp != t_target_lang:
             slog(INFO, indent, " token", c.token, "is not a VHDL token")
@@ -942,7 +1070,7 @@ def create_yacc(grammar):
         for c in rule:
             n += 1
             if c.tp == t_grammar:
-                s.update(c.token)
+                s.update(c.token, 0)
                 continue
             p = grammar[c.token]
             #if is_terminal(c.token) is not None:
@@ -1015,8 +1143,8 @@ def create_lex(grammar):
     for t, p in grammar.iteritems():
         if p.term is not None:
             # \. { return T_DOT; }
-            assert(p.term[0] == '"')
-            assert(p.term[-1] == '"')
+            assert p.term[0] in [ '"', "'" ], p.term
+            assert p.term[-1] in [ '"', "'" ], p.term
             out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'
 
     out += textwrap.dedent("""\
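Usage note (illustrative only, not part of the patch): the parse path is now split into a line-aware tokenizer and a token-stream parser, with the old single-call entry point kept as a thin wrapper. A minimal sketch, assuming tools/python is on PYTHONPATH so the jwutils.grammar module shown above is importable:

    import jwutils.grammar as g

    ebnf = 'number = [ "-" ], digit, { digit } ;'

    # two-step form: tokenize into (token, line) pairs, then build the grammar dict
    tokens = g.grammar_tokenize_ebnf(ebnf)
    grammar = g.grammar_parse_ebnf_tokens(tokens)

    # equivalent one-step wrapper, as called from test/grammar/generate.py
    grammar = g.grammar_parse_ebnf(ebnf)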