Continue implementation of grammar.py

Signed-off-by: Jan Lindemann <jan@janware.com>
Author: Jan Lindemann
Date: 2017-10-25 13:45:44 +02:00
Commit: 40e6add5ad

6 changed files with 366 additions and 44 deletions

test/grammar/Makefile (new file)

@@ -0,0 +1,70 @@
TOPDIR = ../..

GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h

# These symbols are meant to be cut off the tree and turned into hand-coded flex
# regexes
#TRIM_SYMBOLS = blah
TRIM_SYMBOLS =

GENERATE_LOG_LEVEL ?= notice
FIX_EXTENSIONS ?= discard
CHECK_SYMBOLS ?= --check-symbols=all
# Alternative, e.g. to restrict checking to selected symbols:
#CHECK_SYMBOLS ?= --check-symbols=special_character
GRAMMAR_INPUT ?= grammartest-input.ebnf

GENERATED = grammartest-dense.ebnf $(GENERATED_STD)
GENERATE_PY = ./generate.py
GENERATE = python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create \
	--fix-extensions $(FIX_EXTENSIONS) \
	--unroll-lists \
	--unroll-options \
	$(CHECK_SYMBOLS) \
	--trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \
	$(CREATE_EXTRA_ARGS)

FB_NAME_PREFIX ?= grammartest_
FB_HDRDIR ?= include
FB_BISON_OUT_EXT ?= cpp
FB_FLEX_OUT_EXT ?= cpp
FB_CASE_INSENSITIVE ?= true
FB_SRC ?= $(filter %.y %.l,$(GENERATED))

include $(TOPDIR)/make/proj.mk
include $(MODDIR)/make/flex-bison.mk
include $(MODDIR)/make/py-defs.mk

all:

debug-all:
	GENERATE_LOG_LEVEL=debug make all 2>&1 | tee run.out

generate: $(GENERATED)

grammartest.y: include/grammartest.h
lex.grammartest.c: grammartest.l

check: $(GRAMMAR_INPUT) $(GENERATE_PY) Makefile
	python ./$(GENERATE_PY) --log-level info check --fix-extensions unroll --unroll-lists --unroll-options $(CHECK_SYMBOLS) $<

grammartest-dense.ebnf: $(GRAMMAR_INPUT) $(GENERATE_PY)
	python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create --fix-extensions keep $< grammartest.ebnf > $@.tmp
	mv $@.tmp $@

define generate_rule
$(1): $$(GRAMMAR_INPUT) $$(GENERATE_PY) Makefile
	$$(GENERATE) $$< $$(patsubst grammartest.%,grammartest.%,$$@) > $$@.tmp
	mv $$@.tmp $$@
endef

$(foreach target,$(GENERATED_STD),$(eval $(call generate_rule,$(target))))

clean.generated:
	rm -f $(GENERATED)

clean: clean.generated

echo-generated:
	@echo GENERATED = $(GENERATED)

help:
	$(GENERATE) --help

expand-macros:
	make 2>/dev/null | sed '/g++/ !d; s/g++\|gcc//; s/-o .*//' | xargs g++ -E -C | indent

test/grammar/generate.py (new file)

@@ -0,0 +1,85 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import print_function

import argparse
import sys
import re
import textwrap
from collections import OrderedDict
from abc import abstractmethod

import jwutils
from jwutils.log import *
from jwutils import grammar

base = 'grammartest'
mip = '_JW_PYTHON_' + base + base.upper()
namespace = base

def create_grammartest_ebnf(grammar):
    print(jwutils.grammar.create_ebnf(grammar))

def create_grammartest_y(grammar):
    print(jwutils.grammar.create_yacc(grammar))

def create_grammartest_l(grammar):
    print(jwutils.grammar.create_lex(grammar))

def create_include_grammartest_h(grammar):
    print(jwutils.grammar.create_header(grammar, mip=mip, namespace=namespace))

class GrammarCmd(jwutils.grammar.GrammarCmd):
    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    @abstractmethod
    def _run(self, args, grammar):
        pass

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        return p

    def run(self, args):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = jwutils.grammar.grammar_parse_ebnf(contents)
        slog(INFO, "grammar size is", len(grammar))
        for t in grammar.keys():
            slog(INFO, "key =", t)
        slog(INFO, "grammar size is", len(grammar))
        jwutils.grammar.dump_grammar(INFO, grammar)
        grammar = super(GrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)

class CmdCreate(GrammarCmd):
    def __init__(self):
        super(CmdCreate, self).__init__("create", help="Create a file")

    def add_parser(self, parsers):
        p = super(CmdCreate, self).add_parser(parsers)
        p.add_argument("output", help="output file")
        return p

    def _run(self, args, grammar):
        # derive the formatter from the output file name, e.g.
        # include/grammartest.h -> create_include_grammartest_h
        cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
        cmd(grammar)

class CmdCheck(GrammarCmd):
    def __init__(self):
        super(CmdCheck, self).__init__("check", help="Check grammar")

    def add_parser(self, parsers):
        p = super(CmdCheck, self).add_parser(parsers)
        return p

    def _run(self, args, grammar):
        pass

jwutils.run_sub_commands('generate Test parser files')
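Note on the dispatch in CmdCreate._run above: the output file name is mapped to a formatter function by rewriting '-', '.' and '/' to '_' and prefixing 'create_'. A minimal standalone sketch of that lookup (my illustration, not part of the commit; the real formatter takes the parsed grammar as argument):

import re
import sys

def create_include_grammartest_h():
    # stand-in for the real formatter, which receives the parsed grammar
    print("would emit include/grammartest.h")

def dispatch(output):
    # 'include/grammartest.h' -> 'create_include_grammartest_h'
    name = 'create_' + re.sub(r'[-./]', '_', output)
    return getattr(sys.modules[__name__], name)

dispatch('include/grammartest.h')()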

@@ -0,0 +1,16 @@
(* a simple program syntax in EBNF Wikipedia *)
program = 'PROGRAM', white space, identifier, white space,
          'BEGIN', white space,
          { assignment, ";", white space },
          'END.' ;
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters - '"' }, '"' ;
assignment = identifier , ":=" , ( number | identifier | string ) ;
alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
                     | "H" | "I" | "J" | "K" | "L" | "M" | "N"
                     | "O" | "P" | "Q" | "R" | "S" | "T" | "U"
                     | "V" | "W" | "X" | "Y" | "Z" ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
white space = ? white space characters ? ;
all characters = ? all visible characters ? ;

@@ -0,0 +1,10 @@
PROGRAM DEMO1
BEGIN
A:=3;
B:=45;
H:=-100023;
C:=A;
D123:=B34A;
BABOON:=GIRAFFE;
TEXT:="Hello world!";
END.

@@ -0,0 +1,32 @@
#ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H
#define _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H
#define YY_NO_INPUT
#define YY_NO_UNPUT
// #define YY_NO_UNISTD_H
struct context {
	int line;
};
union YYSTYPE;
#ifdef __cplusplus
extern "C" {
#endif
/* defined in grammartest-parser.l */
struct vp_scanner;
struct vp_scanner *grammartest_default_init_scanner(const char *str);
void *grammartest_default_scanner_get_data(const struct vp_scanner *scanner);
void grammartest_default_cleanup_scanner(struct vp_scanner *scanner);
void FB_SYM(error)(struct context *context, void *scanner, const char *s);
#ifdef __cplusplus
} // extern "C"
#endif
#define YY_DECL int FB_SYM(lex)(YYSTYPE *yylval_param, struct context *context, void *yyscanner)
#endif /* #ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H */

grammar.py (modified)

@@ -6,6 +6,8 @@ import sys
 import re
 import lxml.etree as ET
 import textwrap
+import itertools
+import copy
 from collections import OrderedDict
 from abc import abstractmethod
@@ -83,21 +85,34 @@ token_regexes = {
     "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
 }
 
+quotechars = [ '"', "'" ]
+
 def dump(obj):
     for c, v in obj.iteritems():
         slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))
 
+def dump_grammar(prio, grammar):
+    for t, p in grammar.iteritems():
+        p.dump(prio)
+
 def cleanup_token(tok):
     tok = tok.strip()
     if len(tok) == 0:
         return None
+    if tok[0] == "'" and tok[-1] == "'":
+        tok = '"' + tok[1:-1] + '"'
     return tok
 
 def is_terminal(tok):
-    if not tok.startswith('"'):
+    size = len(tok)
+    if size < 2:
         return None
-    if not tok.endswith('"'):
-        raise Exception('Token "' + tok + '" isn\'t entirely enclosed in quotes, ends with "' + tok[-1:] + '"')
+    first = tok[0]
+    last = tok[-1]
+    if (not first in quotechars) and (not last in quotechars):
+        return None
+    if first != last:
+        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
     return tok[1:-1]
 
 def tok2name(tok):
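For readers skimming the hunk above: cleanup_token now normalizes single-quoted terminals to double quotes, and is_terminal accepts either quote character but insists the quoting be symmetric. A small standalone sketch of the pair (my simplification, not the jwutils.grammar module itself):

quotechars = ['"', "'"]

def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'   # normalize single quotes to double quotes
    return tok

def is_terminal(tok):
    # unquoted terminal text, None for non-terminals, exception on lopsided quoting
    if len(tok) < 2:
        return None
    first, last = tok[0], tok[-1]
    if first not in quotechars and last not in quotechars:
        return None
    if first != last:
        raise Exception("Token >%s< isn't symmetrically enclosed in quotes" % tok)
    return tok[1:-1]

print(is_terminal(cleanup_token(" 'BEGIN' ")))   # BEGIN  (terminal)
print(is_terminal(cleanup_token(" digit ")))     # None   (non-terminal)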
@@ -129,6 +144,29 @@ def format_rule(rule):
 def format_rules(rules):
     return ', '.join(format_rule(rule) for rule in rules)
 
+def format_ebnf_rule(grammar, rule):
+    r = ""
+    last = None
+    for comp in rule:
+        if last is not None:
+            if comp.tp == t_grammar:
+                if last.tp == t_grammar:
+                    pass
+                else:
+                    if comp.token in [ '[', '(', '{', '<' ]:
+                        r += ','
+            else:
+                if last.tp == t_grammar:
+                    if comp.token in [ ']', ')', '}', '>' ]:
+                        r += ','
+                else:
+                    r += ','
+        r += ' ' + comp.token
+        last = comp
+    if len(r) == 0:
+        return r
+    return r.strip()
+
 def format_yacc_rule(rule):
     r = ''
     for c in rule:
@@ -140,7 +178,7 @@ class RuleComp:
     def __init__(self, token, tp):
         self.token = token
         self.tp = tp
-        slog(INFO, "creating rule >" + self.str() + "<")
+        slog(INFO, "creating rule component >" + self.str() + "<")
 
     def __eq__(self, rhs):
         if self.token != rhs.token:
@@ -153,7 +191,14 @@ class RuleComp:
         return not self.__eq__(rhs)
 
     def str(self):
-        return "{" + self.tp + ": " + self.token + "}"
+        tp = 'u'
+        if self.tp == t_grammar:
+            tp = 'g'
+        elif self.tp == t_target_lang:
+            tp = 'l'
+        else:
+            tp = self.tp
+        return "{" + tp + ": " + self.token + "}"
 
 class State:
@@ -192,13 +237,13 @@ class Symbol:
     def __init__(self, token, tp = p_ruleset, rules = None):
         self.tp = tp
         self.token = token
         self.name = tok2name(token)
         self.sym = tok2sym(token)
         self.term = None
         self.regex = None
         self.is_lexical_element = False
         self.rules = []
         self.datatype = None
         if rules is not None:
             self.rules = rules
         self.set_type(tp)
@@ -244,17 +289,82 @@ class Symbol:
         return True
 
     def dump(self, prio = NOTICE, msg=""):
         slog(prio, ",----------------", msg)
         slog(prio, "| type =", self.tp)
         slog(prio, "| name =", self.name)
         slog(prio, "| token =", self.token)
         slog(prio, "| sym =", self.sym)
         slog(prio, "| term =", self.term)
         slog(prio, "| regex =", self.regex)
         slog(prio, "| datatype =", self.datatype)
         slog(prio, "| is_lexical_element =", self.is_lexical_element)
         slog(prio, "| rules =", format_rules(self.rules))
         slog(prio, "`----------------", msg)
+def split_list_by(l_, tok):
+    l = copy.deepcopy(l_)
+    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
+
+def grammar_parse_ebnf(content_):
+    # remove comments
+    in_comment = False
+    quoted = None
+    raw_tokens = re.split("([, ])", content_)
+    tokens = []
+    for t in raw_tokens:
+        t = t.strip()
+        if not len(t):
+            continue
+        if quoted:
+            if t == quoted: # FIXME: check backslash before
+                quoted = None
+        elif in_comment:
+            if t == '*)':
+                in_comment = False
+            continue
+        elif t == '(*':
+            in_comment = True
+            continue
+        elif t in [ '"', "'" ]:
+            quoted = t
+        tokens.append(t)
+
+    grammar = OrderedDict()
+    raw_productions = split_list_by(tokens, ';')
+    #slog(INFO, "raw_productions =", raw_productions)
+    for raw_production in raw_productions:
+        #slog(INFO, "raw_production =", '@'.join(raw_production))
+        raw_lhs_rhs = split_list_by(raw_production, '=')
+        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
+        assert(len(raw_lhs_rhs) == 2)
+        lhs = ' '.join(raw_lhs_rhs[0])
+        p = Symbol(lhs)
+        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
+        #slog(INFO, "raw_lhs_rhs[1] = ", raw_lhs_rhs[1])
+        for raw_rule in raw_rules:
+            #slog(INFO, "raw_rule =", raw_rule)
+            rule_tokens = split_list_by(raw_rule, ',')
+            #slog(INFO, "rule_tokens =", rule_tokens)
+            rule = []
+            for raw_tok in rule_tokens:
+                tok = cleanup_token(' '.join(raw_tok))
+                tp = t_target_lang
+                if is_terminal(tok) is not None:
+                    if not tok in grammar.keys():
+                        litp = Symbol(tok, p_terminal)
+                        slog(INFO, "Appending terminal production>" + tok + "< -> ", litp.str())
+                        grammar[tok] = litp
+                    tp = t_target_lang
+                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
+                    tp = t_grammar
+                rule.append(RuleComp(tok, tp))
+            p.rules.append(rule)
+        slog(INFO, "Appending production>" + lhs + "< -> ", p.str())
+        grammar[lhs] = p
+
+    dump_grammar(INFO, grammar)
+    return grammar
+
 def grammar_get_types(grammar):
     types = dict()
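split_list_by is the workhorse of grammar_parse_ebnf above: productions are split on ';', left and right hand sides on '=', alternatives on '|', and rule components on ','. A quick illustration of its behaviour (standalone copy of the one-liner, nothing beyond the two stdlib imports assumed):

import copy
import itertools

def split_list_by(l_, tok):
    # group a token list on a delimiter token, dropping the delimiters themselves
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x == tok) if not x[0]]

print(split_list_by(['a', ',', 'b', 'c', ',', 'd'], ','))
# [['a'], ['b', 'c'], ['d']]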
@@ -292,10 +402,10 @@ def grammar_fix_extensions(grammar, mode):
             prefix = ""
             paren = 0
             for c in rule:
-                if c.tp != t_target_lang:
-                    if c.token == '(':
+                if c.tp == t_grammar and c.token in ['<', '>']:
+                    if c.token == '<':
                         paren += 1
-                    elif c.token == ')':
+                    elif c.token == '>':
                         paren -= 1
                 if paren <= 1: # don't add first level of control chars
                     continue
@@ -309,9 +419,9 @@ def grammar_fix_extensions(grammar, mode):
                     prefix = prefix[1:]
                 slog(INFO, "Found prefix", prefix)
                 if mode == mode_keep:
-                    newrule.append(RuleComp('(', t_grammar))
+                    newrule.append(RuleComp('<', t_grammar))
                     newrule.append(RuleComp(prefix, t_target_lang))
-                    newrule.append(RuleComp(')', t_grammar))
+                    newrule.append(RuleComp('>', t_grammar))
                     newrule.append(c)
                 elif mode == mode_discard:
                     prefix = ''
@@ -362,8 +472,12 @@ def grammar_unroll_lists(grammar):
                 if rule.tp != t_target_lang:
                     continue
                 name += tok2name(rule.token) + "_"
-            if len(delpos) != 1:
-                raise Exception("need exactly one delimiter in list rule:", ' '.join(listrule))
+            # not really: there are lists without delimiters, too
+            #if len(delpos) != 1:
+            #    p.dump(ERR)
+            #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
             name = name + "my_list"
             newrule.append(RuleComp(name, t_target_lang))
             p = Symbol(name, rules=[[], listrule])
@@ -447,15 +561,6 @@ def grammar_unroll_options(grammar):
         grammar[tok].rules = rules_unroll_options(p.rules)
     return grammar
 
-def format_ebnf_rule(grammar, rule):
-    r = ""
-    for comp in rule:
-        if comp.tp == t_grammar:
-            r = r + " " + comp.token
-            continue
-        r = r + " " + comp.token
-    return r.strip()
-
 def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
     if checked is None:
         checked = set()
@@ -538,14 +643,14 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
         slog(INFO, indent, "returning", r, "for token", tok)
     return r
 
-def grammar_check(grammar, selements = None):
-    if selements is None:
-        selements = []
+def grammar_check(grammar, check_symbols = None):
     terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
     orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
     lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
     elements = set()
-    if len(selements) == 0:
+    if check_symbols is None:
+        check_symbols = []
+    if len(check_symbols) == 0:
         for tok, p in grammar.iteritems():
             if p.is_lexical_element:
                 elements.add(tok)
@@ -555,9 +660,9 @@ def grammar_check(grammar, selements = None):
             if c.tp == t_grammar:
                 continue
             elements.add(c.token)
-        selements = sorted(list(elements))
+        check_symbols = sorted(list(elements))
     found = dict()
-    for tok in selements:
+    for tok in check_symbols:
         slog(INFO, "======= checking", tok)
         rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
         if rr == sys.maxint:
@@ -683,14 +788,18 @@ def grammar_trim_symbols(grammar, symbols):
 def create_ebnf(grammar):
     indent = 40
+    slog(INFO, "creating ebnf from grammar of size", len(grammar))
+    out = ''
     for t, p in grammar.iteritems():
+        slog(INFO, "formatting rule", t)
         if not len(p.rules):
             slog(INFO, "ignoring " + t + " (has no rules)\n")
             continue
-        out = t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0])
+        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
         for rule in p.rules[1:]:
-            out += "\n" + ' ' * indent + " | " + format_ebnf_rule(grammar, rule)
-    return out + "\n"
+            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
+        out += ' ' * indent + ' ;\n'
+    return out
 
 def create_yacc(grammar):
     indent = ' ' * 40
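The create_ebnf change above switches from building a single string per symbol to accumulating all productions, one alternative per line, each production closed with ' ;'. A rough sketch of the resulting layout (my illustration; it assumes format_ebnf_rule renders terminals with their quotes, as in the test grammar):

indent = 40
name = "digit"
alternatives = ['"0"', '"1"', '"2"']

out = name + ' ' * (indent - len(name)) + " = " + alternatives[0] + '\n'
for alt in alternatives[1:]:
    out += ' ' * indent + " | " + alt + '\n'
out += ' ' * indent + ' ;\n'
print(out)
# prints the symbol name padded to a 40-column field, then one alternative per
# line ('= ...' first, '| ...' after), and a closing ';' line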