Mirror of ssh://git.janware.com/srv/git/janware/proj/jw-python, synced 2026-01-15 01:52:56 +01:00
grammar.py: Add grammar_parse_ebnf_tokens()
Add grammar_parse_ebnf_tokens(), to be used by external grammar parsers and by grammar_parse_ebnf().

Signed-off-by: Jan Lindemann <jan@janware.com>
parent 860f7d8cab
commit 432d78cdc5
4 changed files with 216 additions and 91 deletions
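
The change splits the old one-shot grammar_parse_ebnf() into a tokenizer, grammar_tokenize_ebnf(), and a token-level parser, grammar_parse_ebnf_tokens(), keeping grammar_parse_ebnf() as a thin wrapper over the two. A minimal usage sketch of the new entry points follows; it is not part of the commit, the EBNF line is taken from the test grammar in the diff, and the module is assumed to be importable as jwutils.grammar, as the GrammarCmd tool below does.

    # Illustrative sketch only: exercises the new split API added by this commit.
    import jwutils.grammar as jwgrammar  # module path as used by the GrammarCmd tool

    ebnf = 'number = [ "-" ], digit, { digit } ;\n'

    # Existing callers keep the one-step convenience wrapper:
    grammar = jwgrammar.grammar_parse_ebnf(ebnf)

    # External grammar parsers can now work at the token level: the tokenizer
    # yields (token, line) pairs, which may be inspected or rewritten before
    # the grammar (an OrderedDict of Symbol objects) is built from them.
    tokens = jwgrammar.grammar_tokenize_ebnf(ebnf)
    filtered = [(tok, line) for tok, line in tokens]  # hypothetical pre-processing step
    grammar = jwgrammar.grammar_parse_ebnf_tokens(filtered)

    symbol_names = list(grammar.keys())  # productions and terminals keyed by token
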
@@ -1,5 +1,8 @@
TOPDIR = ../..

USE_PROJECT_LIB = true
MEMBERS += local.a($(OBJ))

GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h

# These types are meant to be cut off the tree and turned into hand coded flex

@@ -47,12 +47,6 @@ class GrammarCmd(jwutils.grammar.GrammarCmd):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = jwutils.grammar.grammar_parse_ebnf(contents)

        slog(INFO, "grammar size is", len(grammar))
        for t in grammar.keys():
            slog(INFO, "key =", t)
        slog(INFO, "grammar size is", len(grammar))
        jwutils.grammar.dump_grammar(INFO, grammar)
        grammar = super(GrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)

@@ -5,7 +5,7 @@
'END.' ;
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters - '"' }, '"' ;
string = '"' , { all characters }, '"' ;
assignment = identifier , ":=" , ( number | identifier | string ) ;
alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
                     | "H" | "I" | "J" | "K" | "L" | "M" | "N"

@@ -22,6 +22,7 @@ p_ruleset = "ruleset"
p_terminal = "term"
p_literal = "literal"
p_lexical = "lexical"
p_special = "special"

mode_unroll = "unroll"
mode_concat = "concat"

@@ -104,6 +105,11 @@ def cleanup_token(tok):
        tok = '"' + tok[1:-1] + '"'
    return tok

def tok2ctype(tok):
    if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?' ]:
        return t_grammar
    return t_target_lang

def is_terminal(tok):
    size = len(tok)
    if size < 2:

@@ -174,13 +180,24 @@ def format_yacc_rule(rule):
        r += tok2sym(c.token) + ' '
    return r[:-1]

class SourceElement:

    def __init__(self, token, line):
        self.token = token
        self.line = line

class RuleComp:

    def __init__(self, token, tp):
    def __init__(self, token, tp = None, line=-1):
        assert(token is not None)
        # assert(token != '|')
        self.token = token
        if tp is None:
            tp = tok2ctype(token)
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")
        assert(token != "{ assignment")
        self.line = line

    def __eq__(self, rhs):
        if self.token != rhs.token:

@@ -205,27 +222,54 @@ class RuleComp:
class State:

    def __init__(self):
        self.curly = 0
        self.square = 0
        self.reset()

    def reset(self):
        self.curly = 0
        self.square = 0
        self.ext = 0
        self.group = 0
        self.in_comment = False
        self.in_special = False
        self.production = None
        self.rule = []
        self.rules = []

    def optional(self):
        return self.square != 0 or self.curly != 0

    def update(self, tok):
        if tok == '[':
            self.square += 1
        elif tok == ']':
            self.square -= 1
        elif tok == '{':
            self.curly += 1
        elif tok == '}':
            self.curly -= 1
        if self.curly < 0 or self.square < 0:
            raise Exception("Unbalanced BNF bracket", tok)
    def update(self, tok, line):
        if not self.in_comment:
            if tok == '[':
                self.square += 1
            elif tok == ']':
                self.square -= 1
            elif tok == '{':
                self.curly += 1
            elif tok == '}':
                self.curly -= 1
            elif tok == '(':
                self.group += 1
            elif tok == ')':
                self.group -= 1
            elif tok == '<':
                self.ext += 1
            elif tok == '>':
                self.ext -= 1
            elif tok == '?':
                self.in_special = not self.in_special
            elif tok == '(*':
                self.in_comment = True
            elif tok == '*)':
                raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line)
        else:
            if tok == '(*':
                raise Exception("Nested EBNF comment", tok, "in line", line)
            elif tok == '*)':
                self.in_comment = False

        if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0:
            raise Exception("Unbalanced BNF bracket", tok, "in line", line)
        return self.optional()

    def in_list(self):

@@ -234,9 +278,25 @@ class State:
    def in_option(self):
        return self.square > 0

    def in_group(self):
        return self.group > 0

    def in_ext(self):
        return self.ext > 0

    def in_something(self):
        if self.square > 0 or self.curly > 0 or self.group > 0 or self.ext > 0 or self.in_comment or self.in_special:
            return True
        return False

class Symbol:

    def __init__(self, token, tp = p_ruleset, rules = None):
    def __init__(self, token, tp = None, rules = None):
        if tp == None:
            if is_terminal(token) is not None:
                tp = p_terminal
            else:
                tp = p_ruleset
        self.tp = tp
        self.token = token
        self.name = tok2name(token)

@@ -261,22 +321,27 @@ class Symbol:
            self.term = None
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = "std::string"
        elif tp == p_lexical:
            assert(len(self.rules) == 0)
            self.datatype = 'std::string'
        elif tp == p_special or tp == p_lexical:
            if len(self.rules):
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules")
            self.term = None
            self.regex = tok2regex(self.token)
            self.regex = None
            self.is_lexical_element = True
            self.datatype = None
            self.datatype = 'std::string'
        elif tp == p_terminal:
            assert(len(self.rules) == 0)
            if len(self.rules):
                slog(ERR, "rules = ", self.rules)
                self.dump(ERR)
                raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules")
            self.term = self.token
            self.regex = tok2regex(self.token)
            self.is_lexical_element = False
            self.datatype = None
        else:
            self.dump()
            raise Exception("Tried to set production to unknown type", tp)
            raise Exception("Tried to set symbol to unknown type", tp)
        self.tp = tp

    def str(self):

@@ -309,71 +374,133 @@ def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]


def split_list_by_regex(l_, regex):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]]

def grammar_parse_ebnf(content_):

    # remove comments
def grammar_tokenize_ebnf(content):
    r = []
    c = ''
    l = 0
    in_comment = False
    quoted = None
    raw_tokens = re.split("([, ])", content_)
    tokens = []
    for t in raw_tokens:
        t = t.strip()
        if not len(t):
            continue
        if quoted:
            if t == quoted: # FIXME: check backslash before
                quoted = None
        elif in_comment:
            if t == '*)':
                in_comment = False
            continue
        elif t == '(*':
            in_comment = True
            continue
        elif t in [ '"', "'" ]:
            quoted = t
        tokens.append(t)
    in_quote = None
    for line in content.splitlines(True):
        end = len(line) - 1
        l += 1
        tok = ''
        p = -1
        while p < end:
            p += 1
            if p < end and in_quote == None:
                cand = line[p:p+2]
                if cand == '(*':
                    if in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment closure")
                    in_comment = True
                    p += 1
                    continue
                elif cand == '*)':
                    if not in_comment:
                        raise Exception("Syntax error in line", l, ": spurious comment opener")
                    in_comment = False
                    p += 1
                    continue
            if in_comment:
                continue
            c = line[p]
            if c in [ '"', "'" ]:
                if in_quote is None:
                    in_quote = c
                else:
                    if in_quote == c:
                        in_quote = None
            if in_quote is not None:
                tok += c
                continue
            if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]:
                tok = tok.strip()
                if len(tok):
                    r.append((tok, l))
                tok = ''
                if not c.isspace():
                    r.append((c, l))
                continue
            tok += c

        tok = tok.strip()
        if len(tok):
            r.append((tok, l))
    return r

def grammar_add_symbol(grammar, tok, rules):
    assert(tok is not None)
    if tok in grammar.keys():
        s = grammar[tok]
    else:
        s = Symbol(tok, rules=rules)
        grammar[tok] = s
    if rules is not None:
        slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules))
        for rule in rules:
            if not rule in s.rules:
                s.rules.append(rule)
    grammar[tok] = s

def grammar_parse_ebnf_tokens(tokens):
    grammar = OrderedDict()
    raw_productions = split_list_by(tokens, ';')
    #slog(INFO, "raw_productions =", raw_productions)
    for raw_production in raw_productions:
        #slog(INFO, "raw_production =", '@'.join(raw_production))
        raw_lhs_rhs = split_list_by(raw_production, '=')
        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
        assert(len(raw_lhs_rhs) == 2)
        lhs = ' '.join(raw_lhs_rhs[0])
        p = Symbol(lhs)
        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
        for raw_rule in raw_rules:
            slog(INFO, "raw_rule =", raw_rule)
            rule_tokens = split_list_by_regex(raw_rule, ',{}\(\)\[\]')
            #slog(INFO, "rule_tokens =", rule_tokens)
            rule = []
            for raw_tok in rule_tokens:
                tok = cleanup_token(' '.join(raw_tok))
                tp = t_target_lang
                if is_terminal(tok) is not None:
                    if not tok in grammar.keys():
                        litp = Symbol(tok, p_terminal)
                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
                        grammar[tok] = litp
                    tp = t_target_lang
                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
                    tp = t_grammar
                rule.append(RuleComp(tok, tp))
            p.rules.append(rule)
        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
        grammar[lhs] = p
    state = State()
    lhs = None
    last = None
    ruleset = []
    rule = []
    terminals = []
    specials = []
    for tok, line in tokens:
        try:
            state.update(tok, line)
            if tok == '=':
                lhs = last
                continue
            last = tok
            if tok == ';':
                ruleset.append(rule)
                grammar_add_symbol(grammar, lhs, ruleset)
                ruleset = []
                rule = []
                lhs = None
                continue
            if tok == ',':
                continue
            if tok == '|' and not state.in_something():
                ruleset.append(rule)
                rule = []
                continue
            if is_terminal(tok) and tok not in terminals:
                terminals.append(tok)
            elif state.in_special and tok not in specials:
                specials.append(tok)
            if lhs is not None:
                rule.append(RuleComp(tok, line=line))
        except Exception as err:
            for t in tokens:
                slog(ERR, t)
            slog(ERR, "Unexpected error in line", line, ":", str(err))
            raise
            exit(1)
    for s in terminals:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_terminal)
    for s in specials:
        grammar_add_symbol(grammar, s, None)
        grammar[s].set_type(p_special)

    return grammar

def grammar_parse_ebnf(content_):
    tokens = grammar_tokenize_ebnf(content_)
    grammar = grammar_parse_ebnf_tokens(tokens)
    return grammar

def grammar_get_types(grammar):
    types = dict()
    for t, p in grammar.iteritems():

@@ -427,9 +554,9 @@ def grammar_fix_extensions(grammar, mode):
            prefix = prefix[1:]
            slog(INFO, "Found prefix", prefix)
            if mode == mode_keep:
                newrule.append(RuleComp('<', t_grammar))
                newrule.append(RuleComp('<'))
                newrule.append(RuleComp(prefix, t_target_lang))
                newrule.append(RuleComp('>', t_grammar))
                newrule.append(RuleComp('>'))
                newrule.append(c)
            elif mode == mode_discard:
                prefix = ''

@@ -464,8 +591,9 @@ def grammar_unroll_lists(grammar):
        listrule = []
        prefix = None
        s = State()
        slog(INFO, "----------------- list-unrolling rule", format_rule(rule))
        for c in rule:
            s.update(c.token)
            s.update(c.token, c.line)
            if c.token == '{':
                continue
            if c.token == '}':

@@ -614,7 +742,7 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None,
    s = State()
    for c in rule:
        slog(INFO, indent, "testing token", c.token)
        if c.tp == t_grammar and s.update(c.token):
        if c.tp == t_grammar and s.update(c.token, 0):
            continue
        if c.tp != t_target_lang:
            slog(INFO, indent, " token", c.token, "is not a VHDL token")

@@ -942,7 +1070,7 @@ def create_yacc(grammar):
    for c in rule:
        n += 1
        if c.tp == t_grammar:
            s.update(c.token)
            s.update(c.token, 0)
            continue
        p = grammar[c.token]
        #if is_terminal(c.token) is not None:

@@ -1015,8 +1143,8 @@ def create_lex(grammar):
    for t, p in grammar.iteritems():
        if p.term is not None:
            # \. { return T_DOT; }
            assert(p.term[0] == '"')
            assert(p.term[-1] == '"')
            assert p.term[0] in [ '"', "'" ], p.term
            assert p.term[-1] in [ '"', "'" ], p.term
            out += re.escape(p.term[1:-1]) + ' { return ' + p.sym + '; }\n'

    out += textwrap.dedent("""\
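
For reference, the token stream that grammar_parse_ebnf_tokens() consumes is a list of (token, line) pairs. The sketch below is a hand-derived illustration of what grammar_tokenize_ebnf() appears to emit for one rule from the test grammar, traced from the tokenizer hunk above rather than captured from the tool, so it should be verified against the real implementation.

    # Hand-derived illustration of the (token, line) pairs grammar_tokenize_ebnf()
    # appears to emit for a single EBNF rule.
    src = 'number = [ "-" ], digit, { digit } ;\n'
    # grammar_tokenize_ebnf(src) should yield approximately:
    #   [('number', 1), ('=', 1), ('[', 1), ('"-"', 1), (']', 1), (',', 1),
    #    ('digit', 1), (',', 1), ('{', 1), ('digit', 1), ('}', 1), (';', 1)]
    # Quoted literals keep their quotes, structural characters such as
    # '[', ']', '{', '}', ',' and ';' become single-character tokens, and every
    # token carries the 1-based number of the line it came from.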