diff --git a/test/grammar/Makefile b/test/grammar/Makefile
new file mode 100644
index 0000000..7e32548
--- /dev/null
+++ b/test/grammar/Makefile
@@ -0,0 +1,70 @@
+TOPDIR = ../..
+
+GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h
+
+# These types are meant to be cut off the tree and turned into hand coded flex
+# regexes
+#TRIM_SYMBOLS = blah
+TRIM_SYMBOLS =
+GENERATE_LOG_LEVEL ?= notice
+FIX_EXTENSIONS ?= discard
+CHECK_SYMBOLS ?= --check-symbols=all
+
+
+GRAMMAR_INPUT ?= grammartest-input.ebnf
+GENERATED = grammartest-dense.ebnf $(GENERATED_STD)
+GENERATE_PY = ./generate.py
+GENERATE = python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create \
+	--fix-extensions $(FIX_EXTENSIONS) \
+	--unroll-lists \
+	--unroll-options \
+	$(CHECK_SYMBOLS) \
+	--trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
+	$(CREATE_EXTRA_ARGS)
+CHECK_SYMBOLS ?= special_character
+
+FB_NAME_PREFIX ?= grammartest_
+FB_HDRDIR ?= include
+FB_BISON_OUT_EXT ?= cpp
+FB_FLEX_OUT_EXT ?= cpp
+FB_CASE_INSENSITIVE ?= true
+FB_SRC ?= $(filter %.y %.l,$(GENERATED))
+
+include $(TOPDIR)/make/proj.mk
+include $(MODDIR)/make/flex-bison.mk
+include $(MODDIR)/make/py-defs.mk
+
+all:
+debug-all:
+	GENERATE_LOG_LEVEL=debug make all 2>&1 | tee run.out
+
+generate: $(GENERATED)
+
+grammartest.y: include/grammartest.h
+lex.grammartest.c: grammartest.l
+
+check: $(GRAMMAR_INPUT) $(GENERATE_PY) Makefile
+	python ./$(GENERATE_PY) --log-level info check --fix-extensions unroll --unroll-lists --unroll-options --check-symbols='$(CHECK_SYMBOLS)' $<
+
+grammartest-dense.ebnf: $(GRAMMAR_INPUT) $(GENERATE_PY)
+	python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create --fix-extensions keep $< grammartest.ebnf > $@.tmp
+	mv $@.tmp $@
+
+define generate_rule
+$(1): $$(GRAMMAR_INPUT) $$(GENERATE_PY) Makefile
+	$$(GENERATE) $$< $$(patsubst grammartest.%,grammartest.%,$$@) > $$@.tmp
+	mv $$@.tmp $$@
+endef
+$(foreach target,$(GENERATED_STD),$(eval $(call generate_rule,$(target))))
+
+clean.generated:
+	rm -f $(GENERATED)
+clean: clean.generated
+echo-generated:
+	@echo GENERATED = $(GENERATED)
+
+help:
+	$(GENERATE) --help
+
+expand-macros:
+	make 2>/dev/null | sed '/g++/ !d; s/g++\|gcc//; s/-o .*//' | xargs g++ -E -C | indent
diff --git a/test/grammar/generate.py b/test/grammar/generate.py
new file mode 100644
index 0000000..e0c2c19
--- /dev/null
+++ b/test/grammar/generate.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+import argparse
+import sys
+import re
+import textwrap
+from collections import OrderedDict
+from abc import abstractmethod
+
+import jwutils
+
+from jwutils.log import *
+from jwutils import grammar
+
+base = 'grammartest'
+mip = '_JW_PYTHON_' + base + base.upper()
+namespace = base
+
+def create_grammartest_ebnf(grammar):
+    print(jwutils.grammar.create_ebnf(grammar))
+
+def create_grammartest_y(grammar):
+    print(jwutils.grammar.create_yacc(grammar))
+
+def create_grammartest_l(grammar):
+    print(jwutils.grammar.create_lex(grammar))
+
+def create_include_grammartest_h(grammar):
+    print(jwutils.grammar.create_header(grammar, mip=mip, namespace=namespace))
+
+class GrammarCmd(jwutils.grammar.GrammarCmd):
+
+    def __init__(self, name, help):
+        super(GrammarCmd, self).__init__(name, help=help)
+
+    @abstractmethod
+    def _run(self, grammar):
+        pass
+
+    def add_parser(self, parsers):
+        p = super(GrammarCmd, self).add_parser(parsers)
+        return p
+
+    def run(self, args):
+        with open(args.input, 'r') as infile:
+            contents = infile.read()
+        grammar = jwutils.grammar.grammar_parse_ebnf(contents)
+
+        slog(INFO, "grammar size is", len(grammar))
+        for t in grammar.keys():
+            slog(INFO, "key =", t)
+        slog(INFO, "grammar size is", len(grammar))
+        jwutils.grammar.dump_grammar(INFO, grammar)
+        grammar = super(GrammarCmd, self).processGrammar(args, grammar)
+        self._run(args, grammar)
+
+class CmdCreate(GrammarCmd):
+
+    def __init__(self):
+        super(CmdCreate, self).__init__("create", help="Create a file")
+
+    def add_parser(self, parsers):
+        p = super(CmdCreate, self).add_parser(parsers)
+        p.add_argument("output", help="output file")
+        return p
+
+    def _run(self, args, grammar):
+        cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
+        cmd(grammar)
+
+class CmdCheck(GrammarCmd):
+
+    def __init__(self):
+        super(CmdCheck, self).__init__("check", help="Check grammar")
+
+    def add_parser(self, parsers):
+        p = super(CmdCheck, self).add_parser(parsers)
+        return p
+
+    def _run(self, args, grammar):
+        pass
+
+jwutils.run_sub_commands('generate Test parser files')
diff --git a/test/grammar/grammartest-input.ebnf b/test/grammar/grammartest-input.ebnf
new file mode 100644
index 0000000..4cbb0e9
--- /dev/null
+++ b/test/grammar/grammartest-input.ebnf
@@ -0,0 +1,16 @@
+ (* a simple program syntax in EBNF − Wikipedia *)
+ program = 'PROGRAM', white space, identifier, white space,
+            'BEGIN', white space,
+            { assignment, ";", white space },
+            'END.' ;
+ identifier = alphabetic character, { alphabetic character | digit } ;
+ number = [ "-" ], digit, { digit } ;
+ string = '"' , { all characters - '"' }, '"' ;
+ assignment = identifier , ":=" , ( number | identifier | string ) ;
+ alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
+                      | "H" | "I" | "J" | "K" | "L" | "M" | "N"
+                      | "O" | "P" | "Q" | "R" | "S" | "T" | "U"
+                      | "V" | "W" | "X" | "Y" | "Z" ;
+ digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
+ white space = ? white space characters ? ;
+ all characters = ? all visible characters ? ;
diff --git a/test/grammar/grammartest.code b/test/grammar/grammartest.code
new file mode 100644
index 0000000..386e6ce
--- /dev/null
+++ b/test/grammar/grammartest.code
@@ -0,0 +1,10 @@
+ PROGRAM DEMO1
+ BEGIN
+   A:=3;
+   B:=45;
+   H:=-100023;
+   C:=A;
+   D123:=B34A;
+   BABOON:=GIRAFFE;
+   TEXT:="Hello world!";
+ END.
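
Note (not part of the patch): a minimal standalone sketch of the output-name dispatch that CmdCreate._run above relies on. The generator function is chosen by mangling the requested output path, which is why the Makefile's per-target generate_rule can simply pass the target file name through. The helper bodies below are hypothetical stand-ins.

    from __future__ import print_function
    import re

    def create_grammartest_y(grammar):            # stand-in for the real generator
        return "yacc output"

    def create_include_grammartest_h(grammar):    # stand-in for the real generator
        return "header output"

    def dispatch(output, grammar):
        # '-', '.' and '/' are the only separators expected in generated file names
        name = 'create_' + re.sub(r'[-./]', '_', output)
        return globals()[name](grammar)

    print(dispatch('grammartest.y', {}))           # resolves to create_grammartest_y
    print(dispatch('include/grammartest.h', {}))   # resolves to create_include_grammartest_h
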
diff --git a/test/grammar/include/defs.h b/test/grammar/include/defs.h new file mode 100644 index 0000000..b677234 --- /dev/null +++ b/test/grammar/include/defs.h @@ -0,0 +1,32 @@ +#ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H +#define _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H + +#define YY_NO_INPUT +#define YY_NO_UNPUT +// #define YY_NO_UNISTD_H + +struct context { + int line; +}; + +union YYSTYPE; + +#ifdef __cplusplus +extern "C" { +#endif + +/* defined in grammartest-parser.l */ +struct vp_scanner; +struct vp_scanner *grammartest_default_init_scanner(const char *str); +void *grammartest_default_scanner_get_data(const struct vp_scanner *scanner); +void grammartest_default_cleanup_scanner(struct vp_scanner *scanner); + +void FB_SYM(error)(struct context *context, void *scanner, const char *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#define YY_DECL int FB_SYM(lex)(YYSTYPE *yylval_param, struct context *context, void *yyscanner) + +#endif /* #ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H */ diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py index e6420b9..7e1387e 100644 --- a/tools/python/jwutils/grammar.py +++ b/tools/python/jwutils/grammar.py @@ -6,6 +6,8 @@ import sys import re import lxml.etree as ET import textwrap +import itertools +import copy from collections import OrderedDict from abc import abstractmethod @@ -83,21 +85,34 @@ token_regexes = { "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}", } +quotechars = [ '"', "'" ] + def dump(obj): for c, v in obj.iteritems(): slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v))) +def dump_grammar(prio, grammar): + for t, p in grammar.iteritems(): + p.dump(prio) + def cleanup_token(tok): tok = tok.strip() if len(tok) == 0: return None + if tok[0] == "'" and tok[-1] == "'": + tok = '"' + tok[1:-1] + '"' return tok def is_terminal(tok): - if not tok.startswith('"'): + size = len(tok) + if size < 2: return None - if not tok.endswith('"'): - raise Exception('Token "' + tok + '" isn\'t entirely enclosed in quotes, ends with "' + tok[-1:] + '"') + first = tok[0] + last = tok[-1] + if (not first in quotechars) and (not last in quotechars): + return None + if first != last: + raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes') return tok[1:-1] def tok2name(tok): @@ -129,6 +144,29 @@ def format_rule(rule): def format_rules(rules): return ', '.join(format_rule(rule) for rule in rules) +def format_ebnf_rule(grammar, rule): + r = "" + last = None + for comp in rule: + if last is not None: + if comp.tp == t_grammar: + if last.tp == t_grammar: + pass + else: + if comp.token in [ '[', '(', '{', '<' ]: + r += ',' + else: + if last.tp == t_grammar: + if comp.token in [ ']', ')', '}', '>' ]: + r += ',' + else: + r += ',' + r += ' ' + comp.token + last = comp + if len(r) == 0: + return r + return r.strip() + def format_yacc_rule(rule): r = '' for c in rule: @@ -140,7 +178,7 @@ class RuleComp: def __init__(self, token, tp): self.token = token self.tp = tp - slog(INFO, "creating rule >" + self.str() + "<") + slog(INFO, "creating rule component >" + self.str() + "<") def __eq__(self, rhs): if self.token != rhs.token: @@ -153,7 +191,14 @@ class RuleComp: return not self.__eq__(rhs) def str(self): - return "{" + self.tp + ": " + self.token + "}" + tp = 'u' + if self.tp == t_grammar: + tp = 'g' + elif self.tp == t_target_lang: + tp = 'l' + else: + tp = self.tp + return "{" + tp + ": " + self.token + "}" class State: @@ -192,13 +237,13 @@ class Symbol: def __init__(self, token, 
tp = p_ruleset, rules = None): self.tp = tp self.token = token - self.name = tok2name(token) - self.sym = tok2sym(token) - self.term = None + self.name = tok2name(token) + self.sym = tok2sym(token) + self.term = None self.regex = None - self.is_lexical_element = False - self.rules = [] - self.datatype = None + self.is_lexical_element = False + self.rules = [] + self.datatype = None if rules is not None: self.rules = rules self.set_type(tp) @@ -244,17 +289,82 @@ class Symbol: return True def dump(self, prio = NOTICE, msg=""): - slog(prio, ",----------------", msg) - slog(prio, "| type =", self.tp) - slog(prio, "| name =", self.name) + slog(prio, ",----------------", msg) + slog(prio, "| type =", self.tp) + slog(prio, "| name =", self.name) slog(prio, "| token =", self.token) - slog(prio, "| sym =", self.sym) - slog(prio, "| term =", self.term) + slog(prio, "| sym =", self.sym) + slog(prio, "| term =", self.term) slog(prio, "| regex =", self.regex) slog(prio, "| datatype =", self.datatype) - slog(prio, "| is_lexical_element =", self.is_lexical_element) - slog(prio, "| rules =", format_rules(self.rules)) - slog(prio, "`----------------", msg) + slog(prio, "| is_lexical_element =", self.is_lexical_element) + slog(prio, "| rules =", format_rules(self.rules)) + slog(prio, "`----------------", msg) + +def split_list_by(l_, tok): + l = copy.deepcopy(l_) + return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]] + +def grammar_parse_ebnf(content_): + + # remove comments + in_comment = False + quoted = None + raw_tokens = re.split("([, ])", content_) + tokens = [] + for t in raw_tokens: + t = t.strip() + if not len(t): + continue + if quoted: + if t == quoted: # FIXME: check backslash before + quoted = None + elif in_comment: + if t == '*)': + in_comment = False + continue + elif t == '(*': + in_comment = True + continue + elif t in [ '"', "'" ]: + quoted = t + tokens.append(t) + + grammar = OrderedDict() + raw_productions = split_list_by(tokens, ';') + #slog(INFO, "raw_productions =", raw_productions) + for raw_production in raw_productions: + #slog(INFO, "raw_production =", '@'.join(raw_production)) + raw_lhs_rhs = split_list_by(raw_production, '=') + #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs) + assert(len(raw_lhs_rhs) == 2) + lhs = ' '.join(raw_lhs_rhs[0]) + p = Symbol(lhs) + raw_rules = split_list_by(raw_lhs_rhs[1], '|') + #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1]) + for raw_rule in raw_rules: + #slog(INFO, "raw_rule =", raw_rule) + rule_tokens = split_list_by(raw_rule, ',') + #slog(INFO, "rule_tokens =", rule_tokens) + rule = [] + for raw_tok in rule_tokens: + tok = cleanup_token(' '.join(raw_tok)) + tp = t_target_lang + if is_terminal(tok) is not None: + if not tok in grammar.keys(): + litp = Symbol(tok, p_terminal) + slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str()) + grammar[tok] = litp + tp = t_target_lang + elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]: + tp = t_grammar + rule.append(RuleComp(tok, tp)) + p.rules.append(rule) + slog(INFO, "Appending production>" + lhs + "< -> ", p.str()) + grammar[lhs] = p + + dump_grammar(INFO, grammar) + return grammar def grammar_get_types(grammar): types = dict() @@ -292,10 +402,10 @@ def grammar_fix_extensions(grammar, mode): prefix = "" paren = 0 for c in rule: - if c.tp != t_target_lang: - if c.token == '(': + if c.tp == t_grammar and c.token in ['<', '>']: + if c.token == '<': paren += 1 - elif c.token == ')': + elif c.token == '>': paren -= 1 if paren <= 1: # don't add first level of 
control chars continue @@ -309,9 +419,9 @@ def grammar_fix_extensions(grammar, mode): prefix = prefix[1:] slog(INFO, "Found prefix", prefix) if mode == mode_keep: - newrule.append(RuleComp('(', t_grammar)) + newrule.append(RuleComp('<', t_grammar)) newrule.append(RuleComp(prefix, t_target_lang)) - newrule.append(RuleComp(')', t_grammar)) + newrule.append(RuleComp('>', t_grammar)) newrule.append(c) elif mode == mode_discard: prefix = '' @@ -362,8 +472,12 @@ def grammar_unroll_lists(grammar): if rule.tp != t_target_lang: continue name += tok2name(rule.token) + "_" - if len(delpos) != 1: - raise Exception("need exactly one delimiter in list rule:", ' '.join(listrule)) + + # not really: there are lists without delimiters, too + #if len(delpos) != 1: + # p.dump(ERR) + # raise Exception("need exactly one delimiter in list rule:", format_rule(listrule)) + name = name + "my_list" newrule.append(RuleComp(name, t_target_lang)) p = Symbol(name, rules=[[], listrule]) @@ -447,15 +561,6 @@ def grammar_unroll_options(grammar): grammar[tok].rules = rules_unroll_options(p.rules) return grammar -def format_ebnf_rule(grammar, rule): - r = "" - for comp in rule: - if comp.tp == t_grammar: - r = r + " " + comp.token - continue - r = r + " " + comp.token - return r.strip() - def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None): if checked is None: checked = set() @@ -538,14 +643,14 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, slog(INFO, indent, "returning", r, "for token", tok) return r -def grammar_check(grammar, selements = None): - if selements is None: - selements = [] +def grammar_check(grammar, check_symbols = None): terminals = {tok for tok, p in grammar.iteritems() if p.term is not None} orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar} lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True} elements = set() - if len(selements) == 0: + if check_symbols is None: + check_symbols = [] + if len(check_symbols) == 0: for tok, p in grammar.iteritems(): if p.is_lexical_element: elements.add(tok) @@ -555,9 +660,9 @@ def grammar_check(grammar, selements = None): if c.tp == t_grammar: continue elements.add(c.token) - selements = sorted(list(elements)) + check_symbols = sorted(list(elements)) found = dict() - for tok in selements: + for tok in check_symbols: slog(INFO, "======= checking", tok) rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found) if rr == sys.maxint: @@ -683,14 +788,18 @@ def grammar_trim_symbols(grammar, symbols): def create_ebnf(grammar): indent = 40 + slog(INFO, "creating ebnf from grammar of size", len(grammar)) + out = '' for t, p in grammar.iteritems(): + slog(INFO, "formatting rule", t) if not len(p.rules): slog(INFO, "ignoring " + t + " (has no rules)\n") continue - out = t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n' for rule in p.rules[1:]: - out += "\n" + ' ' * indent + " | " + format_ebnf_rule(grammar, rule) - return out + "\n" + out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n' + out += ' ' * indent + ' ;\n' + return out def create_yacc(grammar): indent = ' ' * 40
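
Note (not part of the patch): a self-contained sketch of the groupby idiom behind split_list_by and how grammar_parse_ebnf layers it — the token stream is split on ';' into productions, each production on '=' into left- and right-hand side, the right-hand side on '|' into rules, and each rule on ',' into components. The token list here is hand-built from grammartest-input.ebnf; real input goes through the tokenizer first.

    from __future__ import print_function
    import itertools

    def split_list_by(lst, tok):
        # drop every occurrence of `tok` and keep the runs between them
        return [list(g) for k, g in itertools.groupby(lst, lambda x: x == tok) if not k]

    tokens = ['digit', '=', '"0"', '|', '"1"', ';',
              'number', '=', '[', '"-"', ']', ',', 'digit', ',', '{', 'digit', '}', ';']

    for production in split_list_by(tokens, ';'):
        lhs, rhs = split_list_by(production, '=')
        rules = split_list_by(rhs, '|')
        print(' '.join(lhs), '->', [split_list_by(rule, ',') for rule in rules])
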