Continue implementation of grammar.py

Signed-off-by: Jan Lindemann <jan@janware.com>
Jan Lindemann 2017-10-25 13:45:44 +02:00
commit 40e6add5ad
6 changed files with 366 additions and 44 deletions

test/grammar/Makefile (new file, +70)

@@ -0,0 +1,70 @@
TOPDIR = ../..

GENERATED_STD = grammartest.l grammartest.y grammartest.ebnf include/grammartest.h

# These symbols are meant to be cut off the tree and turned into hand-coded
# flex regexes.
#TRIM_SYMBOLS = blah
TRIM_SYMBOLS =

GENERATE_LOG_LEVEL ?= notice
FIX_EXTENSIONS ?= discard
# Symbols to check; e.g. CHECK_SYMBOLS = special_character to restrict the check.
CHECK_SYMBOLS ?= all
GRAMMAR_INPUT ?= grammartest-input.ebnf
GENERATED = grammartest-dense.ebnf $(GENERATED_STD)
GENERATE_PY = ./generate.py
GENERATE = python $(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create \
	--fix-extensions $(FIX_EXTENSIONS) \
	--unroll-lists \
	--unroll-options \
	--check-symbols=$(CHECK_SYMBOLS) \
	--trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/  */,/g') \
	$(CREATE_EXTRA_ARGS)

FB_NAME_PREFIX ?= grammartest_
FB_HDRDIR ?= include
FB_BISON_OUT_EXT ?= cpp
FB_FLEX_OUT_EXT ?= cpp
FB_CASE_INSENSITIVE ?= true
FB_SRC ?= $(filter %.y %.l,$(GENERATED))

include $(TOPDIR)/make/proj.mk
include $(MODDIR)/make/flex-bison.mk
include $(MODDIR)/make/py-defs.mk
all:

debug-all:
	GENERATE_LOG_LEVEL=debug $(MAKE) all 2>&1 | tee run.out

generate: $(GENERATED)

grammartest.y: include/grammartest.h
lex.grammartest.c: grammartest.l

check: $(GRAMMAR_INPUT) $(GENERATE_PY) Makefile
	python $(GENERATE_PY) --log-level info check --fix-extensions unroll --unroll-lists --unroll-options --check-symbols='$(CHECK_SYMBOLS)' $<

grammartest-dense.ebnf: $(GRAMMAR_INPUT) $(GENERATE_PY)
	python $(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create --fix-extensions keep $< grammartest.ebnf > $@.tmp
	mv $@.tmp $@

define generate_rule
$(1): $$(GRAMMAR_INPUT) $$(GENERATE_PY) Makefile
	$$(GENERATE) $$< $$@ > $$@.tmp
	mv $$@.tmp $$@
endef

$(foreach target,$(GENERATED_STD),$(eval $(call generate_rule,$(target))))
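# Editorial illustration (not part of the original Makefile): for
# grammartest.l the $(eval) above instantiates roughly the following rule:
#
#   grammartest.l: $(GRAMMAR_INPUT) $(GENERATE_PY) Makefile
#   	$(GENERATE) $< grammartest.l > grammartest.l.tmp
#   	mv grammartest.l.tmp grammartest.l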
clean.generated:
	rm -f $(GENERATED)

clean: clean.generated

echo-generated:
	@echo GENERATED = $(GENERATED)

help:
	$(GENERATE) --help

# Re-run the compiler lines of a build with macro expansion only, for inspection.
expand-macros:
	$(MAKE) 2>/dev/null | sed '/g++/ !d; s/g++\|gcc//; s/-o .*//' | xargs g++ -E -C | indent

test/grammar/generate.py (new file, +85)

@@ -0,0 +1,85 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import print_function
import argparse
import sys
import re
import textwrap
from collections import OrderedDict
from abc import abstractmethod

import jwutils
from jwutils.log import *
from jwutils import grammar

base = 'grammartest'
# include-guard / macro prefix, e.g. _JW_PYTHON_GRAMMARTEST
mip = '_JW_PYTHON_' + base.upper()
namespace = base


def create_grammartest_ebnf(grammar):
    print(jwutils.grammar.create_ebnf(grammar))


def create_grammartest_y(grammar):
    print(jwutils.grammar.create_yacc(grammar))


def create_grammartest_l(grammar):
    print(jwutils.grammar.create_lex(grammar))


def create_include_grammartest_h(grammar):
    print(jwutils.grammar.create_header(grammar, mip=mip, namespace=namespace))


class GrammarCmd(jwutils.grammar.GrammarCmd):
    def __init__(self, name, help):
        super(GrammarCmd, self).__init__(name, help=help)

    @abstractmethod
    def _run(self, args, grammar):
        pass

    def add_parser(self, parsers):
        p = super(GrammarCmd, self).add_parser(parsers)
        return p

    def run(self, args):
        with open(args.input, 'r') as infile:
            contents = infile.read()
        grammar = jwutils.grammar.grammar_parse_ebnf(contents)
        slog(INFO, "grammar size is", len(grammar))
        for t in grammar.keys():
            slog(INFO, "key =", t)
        jwutils.grammar.dump_grammar(INFO, grammar)
        grammar = super(GrammarCmd, self).processGrammar(args, grammar)
        self._run(args, grammar)


class CmdCreate(GrammarCmd):
    def __init__(self):
        super(CmdCreate, self).__init__("create", help="Create a file")

    def add_parser(self, parsers):
        p = super(CmdCreate, self).add_parser(parsers)
        p.add_argument("output", help="output file")
        return p

    def _run(self, args, grammar):
        # Map the output file name to its generator function, e.g.
        # include/grammartest.h -> create_include_grammartest_h().
        cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output))
        cmd(grammar)


class CmdCheck(GrammarCmd):
    def __init__(self):
        super(CmdCheck, self).__init__("check", help="Check grammar")

    def add_parser(self, parsers):
        p = super(CmdCheck, self).add_parser(parsers)
        return p

    def _run(self, args, grammar):
        pass


jwutils.run_sub_commands('generate Test parser files')
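# Typical invocation, mirroring the Makefile's GENERATE variable (editorial
# illustration; flag values are the Makefile defaults, --trim-symbols omitted):
#
#   python ./generate.py --log-level notice create \
#       --fix-extensions discard --unroll-lists --unroll-options \
#       --check-symbols=all grammartest-input.ebnf grammartest.y > grammartest.y.tmp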

test/grammar/grammartest-input.ebnf (new file, +16)

@@ -0,0 +1,16 @@
(* a simple program syntax in EBNF, from Wikipedia *)
program = 'PROGRAM', white space, identifier, white space,
'BEGIN', white space,
{ assignment, ";", white space },
'END.' ;
identifier = alphabetic character, { alphabetic character | digit } ;
number = [ "-" ], digit, { digit } ;
string = '"' , { all characters - '"' }, '"' ;
assignment = identifier , ":=" , ( number | identifier | string ) ;
alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G"
| "H" | "I" | "J" | "K" | "L" | "M" | "N"
| "O" | "P" | "Q" | "R" | "S" | "T" | "U"
| "V" | "W" | "X" | "Y" | "Z" ;
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
white space = ? white space characters ? ;
all characters = ? all visible characters ? ;
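(* Editorial illustration, not part of the original file: the input "A:=3;"
   is derived as  assignment = identifier , ":=" , number  with
   identifier -> "A" and number -> digit -> "3". *)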

(test input file; name not preserved in this view, +10)

@@ -0,0 +1,10 @@
PROGRAM DEMO1
BEGIN
A:=3;
B:=45;
H:=-100023;
C:=A;
D123:=B34A;
BABOON:=GIRAFFE;
TEXT:="Hello world!";
END.

test/grammar/include/grammartest-parser-defs.h (new file, +32; name inferred from the include guard)

@@ -0,0 +1,32 @@
#ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H
#define _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H

#define YY_NO_INPUT
#define YY_NO_UNPUT
// #define YY_NO_UNISTD_H

struct context {
	int line;
};

union YYSTYPE;

#ifdef __cplusplus
extern "C" {
#endif

/* defined in grammartest-parser.l */
struct vp_scanner;
struct vp_scanner *grammartest_default_init_scanner(const char *str);
void *grammartest_default_scanner_get_data(const struct vp_scanner *scanner);
void grammartest_default_cleanup_scanner(struct vp_scanner *scanner);
void FB_SYM(error)(struct context *context, void *scanner, const char *s);

#ifdef __cplusplus
} // extern "C"
#endif

#define YY_DECL int FB_SYM(lex)(YYSTYPE *yylval_param, struct context *context, void *yyscanner)

#endif /* #ifndef _JW_PYTHON_GRAMMARTEST_PARSER_DEFS_H */
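/* Editorial note (assumption, not in the original header): FB_SYM() is
 * expected to be supplied by flex-bison.mk and to apply the configured
 * FB_NAME_PREFIX, so that FB_SYM(lex) and FB_SYM(error) name the generated
 * scanner and parser entry points declared above. */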

jwutils/grammar.py

@@ -6,6 +6,8 @@ import sys
import re
import lxml.etree as ET
import textwrap
import itertools
import copy
from collections import OrderedDict
from abc import abstractmethod
@@ -83,21 +85,34 @@ token_regexes = {
    "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}",
}

quotechars = [ '"', "'" ]
def dump(obj):
    for c, v in obj.iteritems():
        slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v)))

def dump_grammar(prio, grammar):
    for t, p in grammar.iteritems():
        p.dump(prio)

def cleanup_token(tok):
    tok = tok.strip()
    if len(tok) == 0:
        return None
    # Normalize single-quoted terminals to double quotes.
    if tok[0] == "'" and tok[-1] == "'":
        tok = '"' + tok[1:-1] + '"'
    return tok

def is_terminal(tok):
    size = len(tok)
    if size < 2:
        return None
    first = tok[0]
    last = tok[-1]
    if (not first in quotechars) and (not last in quotechars):
        return None
    if first != last:
        raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes')
    return tok[1:-1]
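# Editorial illustration of the contract above:
#   is_terminal('"BEGIN"')    -> 'BEGIN'  (quoted: a terminal, quotes stripped)
#   is_terminal('identifier') -> None     (unquoted: a nonterminal)
#   is_terminal('"BEGIN\'')   raises      (asymmetric quoting)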
def tok2name(tok):
@@ -129,6 +144,29 @@ def format_rule(rule):
def format_rules(rules):
    return ', '.join(format_rule(rule) for rule in rules)
def format_ebnf_rule(grammar, rule):
    r = ""
    last = None
    for comp in rule:
        if last is not None:
            if comp.tp == t_grammar:
                if last.tp == t_grammar:
                    pass
                else:
                    # separator before an opening metacharacter
                    if comp.token in [ '[', '(', '{', '<' ]:
                        r += ','
            else:
                if last.tp == t_grammar:
                    # separator after a closing metacharacter
                    if last.token in [ ']', ')', '}', '>' ]:
                        r += ','
                else:
                    r += ','
        r += ' ' + comp.token
        last = comp
    if len(r) == 0:
        return r
    return r.strip()
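# Editorial illustration: for the component sequence
#   identifier  {  digit  }  identifier
# the logic above yields "identifier , { digit } , identifier": a comma goes
# between two language tokens and around a bracket group, but never directly
# inside the brackets.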
def format_yacc_rule(rule):
    r = ''
    for c in rule:
@@ -140,7 +178,7 @@ class RuleComp:
    def __init__(self, token, tp):
        self.token = token
        self.tp = tp
        slog(INFO, "creating rule component >" + self.str() + "<")

    def __eq__(self, rhs):
        if self.token != rhs.token:
@@ -153,7 +191,14 @@
        return not self.__eq__(rhs)

    def str(self):
        tp = 'u'
        if self.tp == t_grammar:
            tp = 'g'
        elif self.tp == t_target_lang:
            tp = 'l'
        else:
            tp = self.tp
        return "{" + tp + ": " + self.token + "}"
class State:
@@ -192,13 +237,13 @@
    def __init__(self, token, tp = p_ruleset, rules = None):
        self.tp = tp
        self.token = token
        self.name = tok2name(token)
        self.sym = tok2sym(token)
        self.term = None
        self.regex = None
        self.is_lexical_element = False
        self.rules = []
        self.datatype = None
        if rules is not None:
            self.rules = rules
        self.set_type(tp)
@@ -244,17 +289,82 @@
        return True

    def dump(self, prio = NOTICE, msg=""):
        slog(prio, ",----------------", msg)
        slog(prio, "| type =", self.tp)
        slog(prio, "| name =", self.name)
        slog(prio, "| token =", self.token)
        slog(prio, "| sym =", self.sym)
        slog(prio, "| term =", self.term)
        slog(prio, "| regex =", self.regex)
        slog(prio, "| datatype =", self.datatype)
        slog(prio, "| is_lexical_element =", self.is_lexical_element)
        slog(prio, "| rules =", format_rules(self.rules))
        slog(prio, "`----------------", msg)
def split_list_by(l_, tok):
    l = copy.deepcopy(l_)
    return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]]
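# Editorial illustration: the delimiter tokens themselves are dropped.
#   split_list_by(['a', '=', 'b', '|', 'c'], '|')  ==  [['a', '=', 'b'], ['c']]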
def grammar_parse_ebnf(content_):
    # Tokenize, dropping (* ... *) comments and tracking quoted terminals.
    in_comment = False
    quoted = None
    raw_tokens = re.split("([, ])", content_)
    tokens = []
    for t in raw_tokens:
        t = t.strip()
        if not len(t):
            continue
        if quoted:
            if t == quoted: # FIXME: check backslash before
                quoted = None
        elif in_comment:
            if t == '*)':
                in_comment = False
            continue
        elif t == '(*':
            in_comment = True
            continue
        elif t in [ '"', "'" ]:
            quoted = t
        tokens.append(t)
    grammar = OrderedDict()
    raw_productions = split_list_by(tokens, ';')
    #slog(INFO, "raw_productions =", raw_productions)
    for raw_production in raw_productions:
        #slog(INFO, "raw_production =", '@'.join(raw_production))
        raw_lhs_rhs = split_list_by(raw_production, '=')
        #slog(INFO, "raw_lhs_rhs =", raw_lhs_rhs)
        assert(len(raw_lhs_rhs) == 2)
        lhs = ' '.join(raw_lhs_rhs[0])
        p = Symbol(lhs)
        raw_rules = split_list_by(raw_lhs_rhs[1], '|')
        #slog(INFO, "raw_lhs_rhs[1] =", raw_lhs_rhs[1])
        for raw_rule in raw_rules:
            #slog(INFO, "raw_rule =", raw_rule)
            rule_tokens = split_list_by(raw_rule, ',')
            #slog(INFO, "rule_tokens =", rule_tokens)
            rule = []
            for raw_tok in rule_tokens:
                tok = cleanup_token(' '.join(raw_tok))
                tp = t_target_lang
                if is_terminal(tok) is not None:
                    if not tok in grammar.keys():
                        litp = Symbol(tok, p_terminal)
                        slog(INFO, "Appending terminal production >" + tok + "< ->", litp.str())
                        grammar[tok] = litp
                    tp = t_target_lang
                elif tok in [ '{', '}', '[', ']', '<', '>', '(', ')' ]:
                    tp = t_grammar
                rule.append(RuleComp(tok, tp))
            p.rules.append(rule)
        slog(INFO, "Appending production >" + lhs + "< ->", p.str())
        grammar[lhs] = p
    dump_grammar(INFO, grammar)
    return grammar
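# Minimal usage sketch (editorial; assumes the jwutils.log setup used elsewhere):
#   with open('grammartest-input.ebnf') as f:
#       g = grammar_parse_ebnf(f.read())
#   print(create_ebnf(g))  # re-emit the parsed grammar as EBNF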
def grammar_get_types(grammar):
    types = dict()
@@ -292,10 +402,10 @@ def grammar_fix_extensions(grammar, mode):
        prefix = ""
        paren = 0
        for c in rule:
            if c.tp == t_grammar and c.token in ['<', '>']:
                if c.token == '<':
                    paren += 1
                elif c.token == '>':
                    paren -= 1
                if paren <= 1: # don't add first level of control chars
                    continue
@@ -309,9 +419,9 @@ def grammar_fix_extensions(grammar, mode):
            prefix = prefix[1:]
            slog(INFO, "Found prefix", prefix)
            if mode == mode_keep:
                newrule.append(RuleComp('<', t_grammar))
                newrule.append(RuleComp(prefix, t_target_lang))
                newrule.append(RuleComp('>', t_grammar))
                newrule.append(c)
            elif mode == mode_discard:
                prefix = ''
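# Editorial assumption: '<'/'>' replace the earlier '('/')' extension markers
# so that kept prefixes cannot be confused with EBNF's own grouping parentheses.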
@@ -362,8 +472,12 @@ def grammar_unroll_lists(grammar):
            if rule.tp != t_target_lang:
                continue
            name += tok2name(rule.token) + "_"
        # not really: there are lists without delimiters, too
        #if len(delpos) != 1:
        #    p.dump(ERR)
        #    raise Exception("need exactly one delimiter in list rule:", format_rule(listrule))
        name = name + "my_list"
        newrule.append(RuleComp(name, t_target_lang))
        p = Symbol(name, rules=[[], listrule])
@@ -447,15 +561,6 @@
        grammar[tok].rules = rules_unroll_options(p.rules)
    return grammar

def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None):
    if checked is None:
        checked = set()
@@ -538,14 +643,14 @@ def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None,
    slog(INFO, indent, "returning", r, "for token", tok)
    return r

def grammar_check(grammar, check_symbols = None):
    terminals = {tok for tok, p in grammar.iteritems() if p.term is not None}
    orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar}
    lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True}
    elements = set()
    if check_symbols is None:
        check_symbols = []
    if len(check_symbols) == 0:
        for tok, p in grammar.iteritems():
            if p.is_lexical_element:
                elements.add(tok)
@@ -555,9 +660,9 @@
                if c.tp == t_grammar:
                    continue
                elements.add(c.token)
        check_symbols = sorted(list(elements))
    found = dict()
    for tok in check_symbols:
        slog(INFO, "======= checking", tok)
        rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found)
        if rr == sys.maxint:
@@ -683,14 +788,18 @@ def grammar_trim_symbols(grammar, symbols):
def create_ebnf(grammar):
    indent = 40
    slog(INFO, "creating ebnf from grammar of size", len(grammar))
    out = ''
    for t, p in grammar.iteritems():
        slog(INFO, "formatting rule", t)
        if not len(p.rules):
            slog(INFO, "ignoring " + t + " (has no rules)\n")
            continue
        out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n'
        for rule in p.rules[1:]:
            out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n'
        out += ' ' * indent + ' ;\n'
    return out
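# Editorial illustration of the output shape (rule name padded to column 40,
# one alternative per line, production terminated by ';'):
#   digit                                    = "0"
#                                            | "1"
#                                            ;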
def create_yacc(grammar):
    indent = ' ' * 40