From c2c409ed4b87b2a9fbb6c38c3b843f0f1838a5f0 Mon Sep 17 00:00:00 2001 From: Jan Lindemann Date: Sun, 12 Nov 2017 16:08:26 +0100 Subject: [PATCH] Move grammar-related stuff into package jw-grammar Signed-off-by: Jan Lindemann --- make/generate-flex-bison.mk | 83 -- test/grammar/Makefile | 14 - test/grammar/generate.conf | 20 - test/grammar/generate.py | 6 - test/grammar/grammartest-input.ebnf | 16 - test/grammar/grammartest.code | 9 - test/grammar/main.cpp | 30 - tools/python/jwutils/grammar.py | 1826 --------------------------- 8 files changed, 2004 deletions(-) delete mode 100644 make/generate-flex-bison.mk delete mode 100644 test/grammar/Makefile delete mode 100644 test/grammar/generate.conf delete mode 100644 test/grammar/generate.py delete mode 100644 test/grammar/grammartest-input.ebnf delete mode 100644 test/grammar/grammartest.code delete mode 100644 test/grammar/main.cpp delete mode 100644 tools/python/jwutils/grammar.py diff --git a/make/generate-flex-bison.mk b/make/generate-flex-bison.mk deleted file mode 100644 index 6f3f20c..0000000 --- a/make/generate-flex-bison.mk +++ /dev/null @@ -1,83 +0,0 @@ -NAMESPACE_IN_GENERATED ?= $(FB_NAME) - -# These types are meant to be cut off the tree and turned into hand coded flex -# regexes -TRIM_SYMBOLS ?= -CUT_SYMBOLS ?= -IRRELEVANT_SYMBOLS ?= -GENERATE_LOG_LEVEL ?= notice -FIX_EXTENSIONS ?= discard -CHECK_SYMBOLS ?= all -ifneq ($(CHECK_SYMBOLS),) - OPT_CHECK_SYMBOLS ?= --check-symbols='$(CHECK_SYMBOLS)' -endif - -ifneq ($(GENERATE_CONFIG_FILE),) - OPT_CONFIG_FILE ?= --config-file=$(GENERATE_CONFIG_FILE) -endif - -GENERATED_STD += $(FB_NAME).l $(FB_NAME).y $(FB_NAME).ebnf $(FB_COMMON_H) -GENERATED += $(FB_NAME)-dense.ebnf $(GENERATED_STD) -GRAMMAR_INPUT ?= $(FB_NAME)-input.ebnf -FB_NAME_PREFIX ?= $(FB_NAME)_ -FB_HDRDIR ?= include -FB_BISON_OUT_EXT ?= cpp -FB_FLEX_OUT_EXT ?= cpp -FB_CASE_INSENSITIVE ?= true -FB_SRC ?= $(filter %.y %.l,$(GENERATED)) -FB_COMMON_H ?= $(FB_HDRDIR)/$(FB_NAME).h - -INCLUDED_BY_GENERATED += $(FB_COMMON_H) include/lex.$(FB_NAME).h include/$(FB_NAME).tab.h - -GENERATE_PY ?= ./generate.py -GENERATE ?= python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create \ - --fix-extensions $(FIX_EXTENSIONS) \ - --unroll-lists \ - --unroll-options \ - --unroll-alternatives \ - --replace-whitespace \ - $(OPT_CHECK_SYMBOLS) \ - $(OPT_CONFIG_FILE) \ - --trim-symbols=$(shell echo $(TRIM_SYMBOLS) | sed 's/ */,/g') \ - --cut-symbols=$(shell echo $(CUT_SYMBOLS) | sed 's/ */,/g') \ - --irrelevant-symbols=$(shell echo $(IRRELEVANT_SYMBOLS) | sed 's/ */,/g') \ - --namespace=$(NAMESPACE_IN_GENERATED) \ - --includes=$(shell echo $(INCLUDED_BY_GENERATED) | sed 's/ */,/g') \ - $(CREATE_EXTRA_ARGS) -include $(TOPDIR)/make/proj.mk -include $(MODDIR)/make/flex-bison.mk -include $(MODDIR)/make/py-defs.mk - -all: -debug-all: - GENERATE_LOG_LEVEL=debug make all 2>&1 | tee run.out - -generate: $(GENERATED) -#$(FB_NAME).y: $(FB_COMMON_H) lex.$(FB_NAME).$(FB_FLEX_OUT_EXT) -#lex.$(FB_NAME).$(FB_FLEX_OUT_EXT): $(FB_NAME).l - -check: $(GRAMMAR_INPUT) $(GENERATE_PY) Makefile - python ./$(GENERATE_PY) --log-level info check --fix-extensions unroll --unroll-lists --unroll-options $(OPT_CHECK_SYMBOLS) $< - -$(FB_NAME)-dense.ebnf: $(GRAMMAR_INPUT) $(GENERATE_PY) - python ./$(GENERATE_PY) --log-level $(GENERATE_LOG_LEVEL) create --fix-extensions keep $< $(FB_NAME).ebnf > $@.tmp - mv $@.tmp $@ - -define generate_rule -$(1): $$(GRAMMAR_INPUT) $$(GENERATE_PY) Makefile $(GENERATE_CONFIG_FILE) - $$(GENERATE) $$< $$(patsubst $(FB_NAME).%,$(FB_NAME).%,$$@) > $$@.tmp - mv $$@.tmp $$@ -endef -$(foreach target,$(GENERATED_STD),$(eval $(call generate_rule,$(target)))) - -clean.generated: - rm -f $(GENERATED) -clean: clean.generated -echo-generated: - @echo GENERATED = $(GENERATED) - -help: - $(GENERATE) --help - -expand-macros: - make 2>/dev/null | sed '/g++/ !d; s/g++\|gcc//; s/-o .*//' | xargs g++ -E -C | indent diff --git a/test/grammar/Makefile b/test/grammar/Makefile deleted file mode 100644 index 8c57a37..0000000 --- a/test/grammar/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -TOPDIR = ../.. - --include local.mk - -EXE_ARGS ?= grammartest.code -PREREQ_BUILD += ytools -FB_NAME = grammartest -NAMESPACE_IN_GENERATED = gt -GENERATE_CONFIG_FILE = generate.conf -IRRELEVANT_SYMBOLS ?= white_space - -include $(TOPDIR)/make/proj.mk -include $(TOPDIR)/make/generate-flex-bison.mk -include $(MODDIR)/make/exe.mk diff --git a/test/grammar/generate.conf b/test/grammar/generate.conf deleted file mode 100644 index 8bc1e28..0000000 --- a/test/grammar/generate.conf +++ /dev/null @@ -1,20 +0,0 @@ -[symbols] - - [white_space[ - type = token - lex_extra_action = "if memchr(yytext, '\n', yyleng) context->line++;" - regex = "[ \n\t\r]+" - ] - - [all_characters[ - type = non-terminal - regex = "[[:print:]]" - #lex_as = yytext[0] - ] - - [test[ - type = token - dings = bums - regex = "bumsdings" - ] - diff --git a/test/grammar/generate.py b/test/grammar/generate.py deleted file mode 100644 index c2f7b18..0000000 --- a/test/grammar/generate.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import jwutils - -jwutils.run_sub_commands('generate Test parser files', modules = ['jwutils.grammar']) diff --git a/test/grammar/grammartest-input.ebnf b/test/grammar/grammartest-input.ebnf deleted file mode 100644 index e06d923..0000000 --- a/test/grammar/grammartest-input.ebnf +++ /dev/null @@ -1,16 +0,0 @@ - (* a simple program syntax in EBNF − Wikipedia *) - program = 'PROGRAM', white space, identifier, white space, - 'BEGIN', white space, - { assignment, ";", white space }, - 'END.', [ white space ]; - identifier = alphabetic character, { alphabetic character | digit } ; - number = [ "-" ], digit, { digit } ; - string = '"' , { all characters }, '"' ; - assignment = identifier , ":=" , ( number | identifier | string ) ; - alphabetic character = "A" | "B" | "C" | "D" | "E" | "F" | "G" - | "H" | "I" | "J" | "K" | "L" | "M" | "N" - | "O" | "P" | "Q" | "R" | "S" | "T" | "U" - | "V" | "W" | "X" | "Y" | "Z" ; - digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; - white space = ? white space characters ? ; - all characters = ? all visible characters ? ; diff --git a/test/grammar/grammartest.code b/test/grammar/grammartest.code deleted file mode 100644 index b6f8c4d..0000000 --- a/test/grammar/grammartest.code +++ /dev/null @@ -1,9 +0,0 @@ -PROGRAM DEMO1 -BEGIN - A:=3; - B:=45; - H:=-100023; - C:=A; - D123:=B34A; - BABOON:=GIRAFFE; -END. diff --git a/test/grammar/main.cpp b/test/grammar/main.cpp deleted file mode 100644 index 719f520..0000000 --- a/test/grammar/main.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include -#include - -#include "include/grammartest.h" - -using namespace std; - -int main(int argc, const char *argv[]) -{ - if (argc < 2) { - fprintf(stderr, "usage: %s input-file\n", filenotdir(argv[0])); - return 1; - } - - const char *path = argv[1]; - string content; - if (YMisc::suck_in_file(path, content)<0) { - slog(PRI_ERR, "failed to read [%s] (%s)", path, err()); - return 1; - } - - if (FB_SYM(create_ast)(content.c_str())<0) { - slog(PRI_ERR, "failed to create AST from [%s] (%s)", path, err()); - return 1; - } - - return 0; -} diff --git a/tools/python/jwutils/grammar.py b/tools/python/jwutils/grammar.py deleted file mode 100644 index c775e65..0000000 --- a/tools/python/jwutils/grammar.py +++ /dev/null @@ -1,1826 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import argparse -import sys -import re -import lxml.etree as ET -import textwrap -import itertools -import copy -from collections import OrderedDict -from abc import abstractmethod -import os.path - -import jwutils -#from jwutils.stree import StringTree, serdes -import jwutils.stree.serdes as serdes -import jwutils.stree.StringTree as StringTree - -from jwutils.log import * - -t_grammar = "grammar" -t_target_lang = "target" - -p_ruleset = "ruleset" -p_terminal = "term" -p_literal = "literal" -p_lexical = "lexical" -p_special = "special" -p_regex = "regex" - -mode_unroll = "unroll" -mode_concat = "concat" -mode_keep = "keep" -mode_discard = "discard" -fix_extensions_mode = [ mode_unroll, mode_concat, mode_keep, mode_discard ] - -c_token = "token" -c_non_terminal = "non-terminal" - -member_prefix = '' - -special_terminals = { - "`" : "BACKTICK", - "^" : "CARET", - "<" : "LT", - "<<" : "LEFT_SHIFT", - "<=" : "LTE", - "<=>" : "SPACE_SHIP", - "<>" : "NE", - "=" : "EQ", - "=>" : "EG", - ">" : "GT", - ">=" : "GE", - ">>" : "RIGHT_SHIFT", - "|" : "PIPE", - "_" : "UNDERSCORE", - "," : "COMMA", - ";" : "SEMICOLON", - ":" : "COLON", - ":=" : "DEFINE", - "?" : "QM", - "?<" : "QM_LT", - "?<=" : "QM_LE", - "?=" : "QM_EQ", - "?>" : "QM_GT", - "?>=" : "QM_GE", - "??" : "QM_QM", - "?/=" : "QM_DIV_EQ", - "/" : "DIV", - "/=" : "DIV_EQ", - "." : "DOT", - "\"" : "DQUOTE", - "'" : "QUOTE", - "(" : "LPAREN", - ")" : "RPAREN", - "[" : "LBRACKET", - "]" : "RBRACKET", - "@" : "AT", - "*" : "ASTERISK", - "**" : "DASTERISK", - "\\" : "BACKSLASH", - "&" : "AMPERSAND", - "#" : "NUMBER_SIGN", - "+" : "PLUS", - "-" : "MINUS" -} - -token_regexes = { - "PSL_Property_Declaration" : "property[ \t]+[^;]+;", - "PSL_Sequence_Declaration" : "sequence[ \t]+[^;]+;", - "PSL_Clock_Declaration" : "default[ \t]+clock[ \t]+[^;]+;", - "PSL_Directive" : "([^;]+:)*(assert|assume|restrict|restrict!|cover|fairness|strong_fairness)[ \t]+[^;]+;", - "PSL_Verification_Unit" : "(vunit|vpkg|vprop|vmode)[^{]*{[^}]*}", -} - -quotechars = [ '"', "'" ] - -def dump(obj): - for c, v in obj.iteritems(): - slog(INFO, "obj.%s = %s (=> %s)" % (str(type(c)), str(c), str(v))) - -def dump_grammar(prio, grammar): - caller = get_caller_pos() - for t, p in grammar.iteritems(): - p.dump(prio, caller=caller) - -def cleanup_token(tok): - tok = tok.strip() - if len(tok) == 0: - return None - if tok[0] == "'" and tok[-1] == "'": - tok = '"' + tok[1:-1] + '"' - return tok - -def tok2ctype(tok): - if tok in [ '{', '}', '[', ']', '<', '>', '(', ')', '?', '|' ]: - return t_grammar - return t_target_lang - -def is_terminal(tok): - size = len(tok) - if size < 2: - return None - first = tok[0] - last = tok[-1] - if (not first in quotechars) and (not last in quotechars): - return None - if first != last: - raise Exception('Token >"' + tok + '"< isn\'t symmetrically enclosed in quotes') - return tok[1:-1] - -def tok2name(tok): - tok = cleanup_token(tok) - term = is_terminal(tok) - if term is not None: - if term in special_terminals.keys(): - return special_terminals[term] - return term - return tok - -def tok2sym(tok): - tok = cleanup_token(tok) - term = is_terminal(tok) - if term is not None: - if term in special_terminals.keys(): - return "T_" + special_terminals[term].upper() - return "T_" + re.sub('[^a-zA-Z0-9]', '_', term).upper() - return tok - -def tok2regex(tok): - if tok in token_regexes.keys(): - return token_regexes[tok] - return re.escape(tok) - -def format_rule(rule): - return ' '.join(c.str() for c in rule) - -def format_rules(rules): - return ', '.join(format_rule(rule) for rule in rules) - -def format_ebnf_rule(grammar, rule): - r = "" - last = None - for comp in rule: - if last is not None: - if comp.tp == t_grammar: - if last.tp == t_grammar: - pass - else: - if comp.token in [ '[', '(', '{', '<' ]: - r += ',' - else: - if last.tp == t_grammar: - if comp.token in [ ']', ')', '}', '>' ]: - r += ',' - else: - r += ',' - r += ' ' + comp.token - last = comp - if len(r) == 0: - return r - return r.strip() - -def format_yacc_rule(rule): - r = '' - for c in rule: - if c.tp != t_target_lang: - slog(DEBUG, "ignoring non-target-language token", c.token, "in rule") - continue - r += tok2sym(c.token) + ' ' - return r[:-1] - -class SourceElement: - - def __init__(self, token, line): - self.token = token - self.line = line - -class RuleComp: - - def __init__(self, token, tp = None, line=-1): - assert(token is not None) - # assert(token != '|') - self.token = token - if tp is None: - tp = tok2ctype(token) - self.tp = tp - slog(INFO, "creating rule component >" + self.str() + "<") - assert(token != "{ assignment") - self.line = line - - def __eq__(self, rhs): - if self.token != rhs.token: - return False - if self.tp != rhs.tp: - return False - return True - - def __ne__(self, rhs): - return not self.__eq__(rhs) - - def str(self): - tp = 'u' - if self.tp == t_grammar: - tp = 'g' - elif self.tp == t_target_lang: - tp = 'l' - else: - tp = self.tp - return "{" + tp + ": " + self.token + "}" - -class State: - - def __init__(self): - self.__pair_square = ['[', ']'] - self.__pair_curly = ['{', '}'] - self.__pair_ext = ['<', '>'] - self.__pair_group = ['(', ')'] - self.__pair_comment = ['(*', '*)'] - self.__pair_special = ['?', '?'] - self.reset() - - def reset(self): - self.curly = 0 - self.square = 0 - self.ext = 0 - self.group = 0 - self.in_comment = False - self.in_special = False - self.production = None - self.rule = [] - self.rules = [] - self.things = [] - - def optional(self): - return self.square != 0 or self.curly != 0 - - def update(self, tok, line): - if not self.in_comment: - if tok == '[': - self.square += 1 - self.things.append(self.__pair_square) - elif tok == ']': - self.square -= 1 - assert(self.things.pop() == self.__pair_square) - elif tok == '{': - self.curly += 1 - self.things.append(self.__pair_curly) - elif tok == '}': - self.curly -= 1 - assert(self.things.pop() == self.__pair_curly) - elif tok == '(': - self.group += 1 - self.things.append(self.__pair_group) - elif tok == ')': - self.group -= 1 - assert(self.things.pop() == self.__pair_group) - elif tok == '<': - self.ext += 1 - self.things.append(self.__pair_ext) - elif tok == '>': - self.ext -= 1 - assert(self.things.pop() == self.__pair_ext) - elif tok == '?': - if not self.in_special: - self.in_special = True - self.things.append(self.__pair_special) - else: - self.in_special = False - assert(self.things.pop() == self.__pair_special) - elif tok == '(*': - self.in_comment = True - self.things.append(self.__pair_comment) - elif tok == '*)': - raise Exception("Unmatched closing EBNF comment mark", tok, "in line", line) - else: - if tok == '(*': - raise Exception("Nested EBNF comment", tok, "in line", line) - elif tok == '*)': - assert(self.things.pop() == self.__pair_comment) - self.in_comment = False - - if self.curly < 0 or self.square < 0 or self.ext < 0 or self.group < 0: - raise Exception("Unbalanced BNF bracket", tok, "in line", line) - return self.optional() - - def in_list(self): - return self.curly > 0 - - def in_option(self): - return self.square > 0 - - def in_group(self): - return self.group > 0 - - def in_ext(self): - return self.ext > 0 - - def in_something(self): - if len(self.things) == 0: - return None - return self.things[-1] - - def is_optional(self): - return self.in_list() or self.in_option() - -class Symbol: - - def __init__(self, token, tp = None, rules = None): - self.reset(token, tp, rules) - self.set_is_payload(True) - - def reset(self, token, tp = None, rules = None): - if tp == None: - if is_terminal(token) is not None: - tp = p_terminal - else: - tp = p_ruleset - self.tp = tp - self.token = token - self.name = tok2name(token) - self.sym = tok2sym(token) - self.term = None - self.regex = None - self.is_lexical_element = False - self.rules = [] - self.datatype = None - if rules is not None: - self.rules = rules - self.set_type(tp) - - def set_is_payload(self, onoff): - self.is_payload = onoff - - def set_type(self, tp): - if tp == p_ruleset: - self.term = None - self.regex = None - self.is_lexical_element = False - self.datatype = self.token + '_t' - elif tp == p_literal: - assert(len(self.rules) == 0) - self.term = None - self.regex = tok2regex(self.token) - self.is_lexical_element = False - self.datatype = 'std::string' - elif tp == p_special or tp == p_lexical or tp == p_regex: - if len(self.rules): - self.dump(ERR) - raise Exception("Tried to set symbol", self.token, "to special which has", len(self.rules), "rules") - self.term = None - self.regex = None - self.is_lexical_element = True - self.datatype = 'std::string' - elif tp == p_terminal: - if len(self.rules): - slog(ERR, "rules = ", self.rules) - self.dump(ERR) - raise Exception("Tried to set symbol", self.token, "to terminal which has", len(self.rules), "rules") - self.term = self.token - self.regex = tok2regex(self.token) - self.is_lexical_element = False - self.datatype = None - else: - self.dump() - raise Exception("Tried to set symbol to unknown type", tp) - self.tp = tp - - def str(self): - r = self.name + ' = ' + format_rules(self.rules) - return r - - def equals(self, rhs): - for k, v in self.__dict__.iteritems(): - if (not k in rhs.__dict__) or self.__dict__[k] != rhs.__dict__[k]: - slog(WARNING, k, self.__dict__[k], rhs.__dict__[k]) - return False - return True - - def dump(self, prio = NOTICE, msg="", caller=None): - if caller is None: - caller = get_caller_pos(1) - slog(prio, ",----------------", msg, caller=caller) - slog(prio, "| type =", self.tp, caller=caller) - slog(prio, "| name =", self.name, caller=caller) - slog(prio, "| token =", self.token, caller=caller) - slog(prio, "| sym =", self.sym, caller=caller) - slog(prio, "| term =", self.term, caller=caller) - slog(prio, "| regex =", self.regex, caller=caller) - slog(prio, "| datatype =", self.datatype, caller=caller) - slog(prio, "| is_lexical_element =", self.is_lexical_element, caller=caller) - slog(prio, "| rules =", format_rules(self.rules), caller=caller) - slog(prio, "`----------------", msg, caller=caller) - -def split_list_by(l_, tok): - l = copy.deepcopy(l_) - return [list(x[1]) for x in itertools.groupby(l, lambda x: x==tok) if not x[0]] - -def split_list_by_regex(l_, regex): - l = copy.deepcopy(l_) - return [list(x[1]) for x in itertools.groupby(l, lambda x: re.match(regex, x)) if not x[0]] - -def remove_duplicate_rules(rules): - r = [] - for rule in rules: - if rule in r: - continue - r.append(rule) - slog(DEBUG, "rules after removing duplicates >") - for rule in rules: - slog(DEBUG, "-> " + format_rule(rule)) - slog(DEBUG, "rules after removing duplicates <") - return r - -def grammar_tokenize_ebnf(content): - r = [] - c = '' - l = 0 - in_comment = False - in_quote = None - for line in content.splitlines(True): - end = len(line) - 1 - l += 1 - tok = '' - p = -1 - while p < end: - p += 1 - if p < end and in_quote == None: - cand = line[p:p+2] - if cand == '(*': - if in_comment: - raise Exception("Syntax error in line", l, ": spurious comment closure") - in_comment = True - p += 1 - continue - elif cand == '*)': - if not in_comment: - raise Exception("Syntax error in line", l, ": spurious comment opener") - in_comment = False - p += 1 - continue - if in_comment: - continue - c = line[p] - if c in [ '"', "'" ]: - if in_quote is None: - in_quote = c - else: - if in_quote == c: - in_quote = None - if in_quote is not None: - tok += c - continue - if c in [ '(', ')', '[', ']', '{', '}', ',', ';', '=', '?', '|', '\n' ]: - tok = tok.strip() - if len(tok): - r.append((tok, l)) - tok = '' - if not c.isspace(): - r.append((c, l)) - continue - tok += c - - tok = tok.strip() - if len(tok): - r.append((tok, l)) - return r - -def grammar_add_symbol(grammar, tok, rules): - assert(tok is not None) - if tok in grammar.keys(): - s = grammar[tok] - else: - s = Symbol(tok, rules=rules) - grammar[tok] = s - if rules is not None: - slog(NOTICE, "Adding rules for symbol", tok, ":", format_rules(rules)) - for rule in rules: - if not rule in s.rules: - s.rules.append(rule) - grammar[tok] = s - -def grammar_parse_ebnf_tokens(tokens): - grammar = OrderedDict() - state = State() - lhs = None - last = None - ruleset = [] - rule = [] - terminals = [] - specials = [] - for tok, line in tokens: - try: - state.update(tok, line) - if tok == '=': - lhs = last - continue - last = tok - if tok == ';': - ruleset.append(rule) - grammar_add_symbol(grammar, lhs, ruleset) - ruleset = [] - rule = [] - lhs = None - continue - if tok == ',': - continue - if tok == '|' and state.in_something() is None: - ruleset.append(rule) - rule = [] - continue - if is_terminal(tok) and tok not in terminals: - terminals.append(tok) - elif state.in_special and tok not in specials: - specials.append(tok) - if lhs is not None: - rule.append(RuleComp(tok, line=line)) - except Exception as err: - for t in tokens: - slog(ERR, t) - slog(ERR, "Unexpected error in line", line, ":", str(err)) - raise - exit(1) - for s in terminals: - grammar_add_symbol(grammar, s, None) - grammar[s].set_type(p_terminal) - for s in specials: - slog(INFO, "found special sequence symbol", s) - grammar_add_symbol(grammar, s, None) - grammar[s].set_type(p_special) - - return grammar - -def grammar_parse_ebnf(content_): - tokens = grammar_tokenize_ebnf(content_) - grammar = grammar_parse_ebnf_tokens(tokens) - return grammar - -def grammar_get_types(grammar): - types = dict() - for t, p in grammar.iteritems(): - if not len(p.rules): - continue - if p.term is not None: - continue - ruleno = 1 - rules = [] - for rule in p.rules: - members = [] - for c in rule: - if c.tp != t_target_lang: - continue - if not c.token in grammar.keys(): - p.dump(ERR) - raise Exception("Can't make type from unknown token \"" + c.token + "\" in rule", format_rule(rule)) - pp = grammar[c.token] - if pp.tp is p_terminal: - continue - if not pp.is_payload: - continue - members.append(tok2sym(c.token)) - if True or len(members): - rules.append(members) - if t in types.keys(): - raise Exception("Tried to add type", t, "twice") - types[t] = rules - return types - -def grammar_fix_extensions(grammar, mode): - for tok, p in grammar.iteritems(): - newrules = [] - for rule in p.rules: - newrule = [] - prefix = "" - paren = 0 - for c in rule: - if c.tp == t_grammar and c.token in ['<', '>']: - if c.token == '<': - paren += 1 - elif c.token == '>': - paren -= 1 - if paren <= 1: # don't add first level of control chars - continue - newrule.append(c) - continue - if paren > 0: - assert(len(c.token) != 0) - prefix += '_' + c.token - continue - if len(prefix) > 0: - prefix = prefix[1:] - slog(INFO, "Found prefix", prefix) - if mode == mode_keep: - newrule.append(RuleComp('<')) - newrule.append(RuleComp(prefix, t_target_lang)) - newrule.append(RuleComp('>')) - newrule.append(c) - elif mode == mode_discard: - prefix = '' - continue - elif mode in [ mode_unroll, mode_concat ]: - combined = RuleComp(c.token, c.tp) - combined.token = prefix + c.token - prefix = '' - newrule.append(combined) - slog(INFO, "Appended new rule return value", combined.token) - if mode == mode_unroll: - if combined.token in grammar.keys(): - continue - grammar[combined.token] = Symbol(combined.token, rules=[[c]]) - else: - raise Exception("Invalid prefix mode", mode) - prefix = '' - continue - newrule.append(c) - if len(prefix): # undigested prefix, since it was the last - newrule.append(RuleComp(prefix[1:], t_target_lang)) - newrules.append(newrule) - grammar[tok].rules = newrules # TODO: not sure if this could be done on iterator only - return grammar # TODO: not sure if this is necessary - -def grammar_unroll_lists(grammar): - delimiters = [ '","', '";"', '"|"' ] # TODO: this could be a function parameter to make it generic - newrule = None - for tok, p in grammar.iteritems(): - newrules = [] - for rule in p.rules: - newrule = [] - listrule = [] - prefix = None - s = State() - slog(INFO, "----------------- list-unrolling rule", format_rule(rule)) - for c in rule: - s.update(c.token, c.line) - if c.token == '{': - continue - if c.token == '}': - if len(listrule) == 0: - raise Exception("Rule of production", p.name, "contains empty list:", format_rule(rule)) - delpos = [] - name = "list" - for i, rule in enumerate(listrule): - if rule.token in delimiters: - delpos.append(i) - continue - if rule.tp != t_target_lang: - continue - name += "_" + tok2name(rule.token) - - # not really: there are lists without delimiters, too - #if len(delpos) != 1: - # p.dump(ERR) - # raise Exception("need exactly one delimiter in list rule:", format_rule(listrule)) - - newrule.append(RuleComp(name, t_target_lang)) - listrule.insert(0, RuleComp('(', t_grammar)) - listrule.insert(0, RuleComp(name, t_target_lang)) # enable iteration - listrule.append(RuleComp(')', t_grammar)) - #p = Symbol(name, rules=[[], listrule]) - p = Symbol(name, rules=[listrule]) - #p = Symbol(name) - #p.rules = [ [], listrule ] - listrule = [] - if name not in grammar.keys(): - grammar[name] = p - continue - if not p.equals(grammar[name]): - p.dump(ERR, "old list production") - p.dump(ERR, "new list production") - raise Exception("List production expands to already taken name", name) - continue - if s.in_list(): - listrule.append(c) - continue - newrule.append(c) - slog(DEBUG, "appending " + format_rule(newrule)) - newrules.append(newrule) - newrule = None - else: - if newrule is not None: - slog(DEBUG, "appending " + format_rule(newrule)) - newrules.append(newrule) - newrule = None - slog(DEBUG, "done processing rules for " + tok) - grammar[tok].rules = remove_duplicate_rules(newrules) - return grammar - -def rules_unroll_options(rules): - r = [] - found = False - newrule = None - slog(DEBUG, "unrolling", format_rules(rules)) - for rule in rules: - square = 0 - option = [] - newrule = [] - for i, c in enumerate(rule): - if c.tp == t_grammar: - if c.token == '[': - square += 1 - elif c.token == ']': - square -= 1 - if square == 1: - continue - if square >= 1: - option.append(c) - continue - slog(DEBUG, "square =", square) - assert(square == 0) - n = len(option) - if n == 0: - newrule.append(c) - continue - # first without option - replaced = newrule[:] - tail = rule[i+1:len(rule)] - slog(DEBUG, "i = ", i) - slog(DEBUG, "n = ", n) - slog(DEBUG, "rule = ", format_rule(rule)) - slog(DEBUG, "tail = ", format_rule(tail)) - slog(DEBUG, ",-------------------------") - slog(DEBUG, "head = ", format_rule(replaced)) - replaced.extend(tail) - slog(DEBUG, "head + tail = ", format_rule(replaced)) - r.append(replaced) - # then with option inserted - for unrolled in rules_unroll_options([ option ]): - replaced = newrule[:] - slog(DEBUG, ",-------------------------") - slog(DEBUG, "head = ", format_rule(replaced)) - slog(DEBUG, "unrolled = ", format_rule(unrolled)) - replaced.extend(unrolled) - slog(DEBUG, "head + unrolled =", format_rule(replaced)) - replaced.extend(tail) - slog(DEBUG, "head + unrolled + tail =", format_rule(replaced)) - r.append(replaced) - found = True - break - if not found: - r.append(newrule) - newrule = None - else: - if newrule is not None: - slog(DEBUG, "appending " + format_rule(newrule)) - r.append(newrule) - newrule = None - if found: - return rules_unroll_options(r) - return r - -def grammar_unroll_options(grammar): - for tok, p in grammar.iteritems(): - grammar[tok].rules = remove_duplicate_rules(rules_unroll_options(p.rules)) - return grammar - -def rules_unroll_alternatives(rules): - r = [] - found = False - slog(INFO, "unrolling alternatives in", format_rules(rules)) - sep = RuleComp('|') - for rule in rules: - if not sep in rule: - r.append(rule) - continue - found = True - state = State() - end = len(rule) - 1 - first = last = -1 - for i, c in enumerate(rule): - state.update(c.token, line=c.line) - if c.token != '|' or c.tp != t_grammar: - slog(INFO, "checking token", c.token, "of type", c.tp, "at position", i) - continue - slog(INFO, "found token at position", i) - container = state.in_something() - slog(INFO, "thing delimiters are", container) - if container is None: - raise Exception("Alternative in line", c.line, "at rule position", i, "outside container:", format_rule(rule)) - first = last = -1 - k = i - 1 - while k >= 0: - prev = rule[k] - slog(INFO, "comparing token", rule[k].token, "at position", k, "against opener", container[0]) - if prev.token == container[0]: - first = k - break - k -= 1 - if first == -1: - raise Exception("Alternative in line", c.line, "missing previous element:", format_rule(rule)) - k = i - while k <= end: - nxt = rule[k] - slog(INFO, "comparing token", rule[k].token, "at position", k, "against closer", container[1]) - if nxt.token == container[1]: - last = k - break - k += 1 - if last == i: - raise Exception("Alternative in line", c.line, "missing next element:", format_rule(rule)) - break # found what I wanted - assert(first > 0) - assert(last > 0) - assert(last <= end) - head = rule[0:first] - mid = rule[first+1:last] - tail = rule[last+1:end] - slog(INFO, "first =", first, "last =", last, "end =", end) - slog(INFO, "head = ", format_rule(head)) - slog(INFO, "mid = ", format_rule(mid)) - slog(INFO, "tail = ", format_rule(tail)) - for m in split_list_by(mid, sep): - unrolled_rule = head + m + tail - r.append(unrolled_rule) - #if found: - # return rules_unroll_alternatives(r) - return r - -def grammar_unroll_alternatives(grammar): - for tok, p in grammar.iteritems(): - grammar[tok].rules = rules_unroll_alternatives(p.rules) - return grammar - -def grammar_replace_whitespace(grammar): - r = OrderedDict() - for tok, s in grammar.iteritems(): - newrules = [] - for rule in s.rules: - newrule = [] - for c in rule: - newc = RuleComp(c.token.replace(' ', '_'), tp=c.tp, line=c.line) - newrule.append(newc) - newrules.append(newrule) - newtok = tok.replace(' ', '_') - s.reset(newtok, tp=s.tp, rules=newrules) - r[newtok] = s - slog(INFO, "added symbol", newtok) - return r - -def grammar_add_configured_types(grammar, conf): - if conf is None: - return grammar - symbols = conf.get('symbols') - if symbols is None: - return grammar - for t, c in symbols.iteritems(): - s = Symbol(t) - s.set_type(p_regex) - s.regex = c["regex"].value() - grammar[t] = s - return grammar - -def step_out(grammar, terminals, orphans, lexicals, tok, depth, checked = None, found = None): - if checked is None: - checked = set() - if found is None: - found = dict() - indent = ' ' * depth * 2 - if tok in found.keys(): - slog(INFO, indent + " + found cached", tok, "with depth", found[tok]) - return found[tok] - slog(INFO, indent + " + " + tok) - indent = indent + " " - if tok in terminals: - found[tok] = 1 - slog(INFO, indent + " + found terminal", tok, "with depth", found[tok]) - return 1 - if tok in orphans: - found[tok] = 1 - slog(INFO, indent + " + found orphan", tok, "with depth", found[tok]) - return 1 - #if tok in lexicals: - # found[tok] = 1 - # slog(INFO, indent + " + found lexical element", tok, "with depth", found[tok]) - # return 1 - if tok in checked: - slog(INFO, indent, "token", tok, "is among checked", ' '.join(checked)) - return sys.maxint - - slog(INFO, indent, "checked =", ' '.join(checked)) - checked.add(tok) - if tok not in grammar.keys(): - slog(ERR, "tried to validate unknown token \"" + tok + "\"") - return sys.maxint - p = grammar[tok] - r = sys.maxint - slog(INFO, indent, p.token, "has", len(p.rules), "rules") - only_optional = True - for rule in p.rules: - slog(INFO, indent, "testing rule", format_rule(rule)) - if tok in [ c.token for c in rule ]: - continue - mn = sys.maxint - mx = 0 - s = State() - for c in rule: - slog(INFO, indent, "testing token", c.token) - if c.tp == t_grammar and s.update(c.token, 0): - continue - if c.tp != t_target_lang: - slog(INFO, indent, " token", c.token, "is not a VHDL token") - continue - only_optional = False - # same "found" argument in next call? - rr = step_out(grammar, terminals, orphans, lexicals, c.token, depth + 1, checked.copy(), found) - slog(INFO, indent, " token", c.token, "needs", rr, "steps to escape, mn=", mn, "mx=", mx) - if rr == sys.maxint or rr is None: - slog(INFO, indent, " got error for token", c.token) - mn = sys.maxint - mx = 0 - break - if rr > mx: - slog(INFO, indent, " adjusting mx to", rr) - mx = rr - if rr < mn: - slog(INFO, indent, " adjusting mn to", rr) - mn = rr - if mn == sys.maxint or mx == 0: # unusable as escape route - slog(INFO, indent, " unusable as escape route for " + tok + ":", format_rule(rule)) - continue - slog(INFO, indent, "after checking all rules, mx is", mx) - if mx < r: - slog(INFO, indent, "setting return value to max", mx) - r = mx - if only_optional: - slog(INFO, indent, tok, "has only optional rules, accepting") - r = 0 - if r != sys.maxint: - r += 1 - slog(INFO, indent, "found way out for", tok, "at depth", depth, "with", r, "steps") - found[tok] = r - slog(INFO, indent, "returning", r, "for token", tok) - return r - -def grammar_check(grammar, check_symbols = None): - terminals = {tok for tok, p in grammar.iteritems() if p.term is not None} - orphans = {tok for tok, p in grammar.iteritems() if p.token not in grammar} - lexicals = {tok for tok, p in grammar.iteritems() if p.is_lexical_element is True} - elements = set() - if check_symbols is None: - check_symbols = [] - if len(check_symbols) == 0: - for tok, p in grammar.iteritems(): - if p.is_lexical_element: - elements.add(tok) - continue - for rule in p.rules: - for c in rule: - if c.tp == t_grammar: - continue - elements.add(c.token) - check_symbols = sorted(list(elements)) - found = dict() - for tok in check_symbols: - slog(INFO, "======= checking", tok) - rr = step_out(grammar, terminals, orphans, lexicals, tok, 0, checked=set(), found=found) - if rr == sys.maxint: - slog(ERR, "No way out for", tok) - exit(1) - if not tok in grammar.keys(): - slog(ERR, "Token", tok, "has no production") - exit(1) - slog(INFO, tok, "->", str(rr)) - -def grammar_lhss_map(grammar): - r = dict() - for t in grammar.keys(): - r[t] = set() - for t, p in grammar.iteritems(): - for rule in p.rules: - for c in rule: - if c.tp == t_target_lang: - r[c.token].add(t) - return r - -def do_grammar_lhss(dmap, stop, rhs, buf, recursive): - lhss = dmap[rhs] - for lhs in lhss: - if lhs in buf: - continue - buf.add(lhs) - if lhs in stop: - slog(INFO, " symbol", lhs, "is among stop symbols, stopping recursion") - continue - if recursive: - do_grammar_lhss(dmap, stop, lhs, buf, recursive) - -def grammar_lhss(dmap, stop, symbols, recursive = False): - r = set() - for s in symbols: - if s in r: - continue - do_grammar_lhss(dmap, stop, s, r, recursive) - return r - -def do_grammar_rhss(grammar, stop, sym, buf): - p = grammar[sym] - for rule in p.rules: - for c in rule: - if c.tp != t_target_lang: - continue - if c.token in stop: - continue - if c.token in buf: - continue - buf.add(c.token) - do_grammar_rhss(grammar, stop, c.token, buf) - -def grammar_rhss(grammar, stop, symbols): - r = set() - for s in symbols: - if s in r: - continue - do_grammar_rhss(grammar, stop, s, r) - return r - -def grammar_symbol_in_use(grammar, dmap, stop, checked, sym): - if sym in stop: - return False - # Does this have to be recursive? - defined = grammar_lhss(dmap, stop, set([sym])) - slog(INFO, " symbol", sym, "defines:", ', '.join(defined)) - if not len(defined): - return True - for d in defined: - if d in stop: - continue - if d in checked: - continue - checked.add(d) - if grammar_symbol_in_use(grammar, dmap, stop, checked, d): - return True - return False - -def do_grammar_unused(grammar, dmap, doomed): - r = set(doomed) - rhss = grammar_rhss(grammar, set(), doomed) - for rhs in rhss: - slog(INFO, "+++ checking if symbol", rhs, "is in use >>") - if not grammar_symbol_in_use(grammar, dmap, doomed, set(), rhs): - slog(INFO, " symbol", rhs, "is not in use") - r.add(rhs) - slog(INFO, "+++ checking if symbol", rhs, "is in use (yes) <<") - return r - -def grammar_unused(grammar, dmap, doomed): - r = set(doomed) - while True: - unused = do_grammar_unused(grammar, dmap, r) - slog(INFO, "unused:", ', '.join(unused)) - slog(INFO, "r: ", ', '.join(r)) - if unused == r: - break - r |= unused - return r - -# eradicate symbols from tree -def grammar_cut_symbols(grammar, symbols): - slog(INFO, "-------- removing symbols:", ', '.join(symbols)) - dmap = grammar_lhss_map(grammar) - unused = grammar_unused(grammar, dmap, symbols) - for s in unused: - slog(INFO, " + removing symbol", s) - del grammar[s] - return grammar - -# make symbol an empty literal production -def grammar_trim_symbols(grammar, symbols): - grammar_cut_symbols(grammar, symbols) - for s in symbols: - slog(INFO, " + adding empty production for symbol", s) - p = Symbol(s) - p.set_type(p_literal) - grammar[s] = p - - return grammar - -# flag symbols as non-payload -def grammar_irrelevant_symbols(grammar, symbols): - for s in symbols: - grammar[s].set_is_payload(False) - - return grammar -def grammar_create_ebnf(grammar, opts): - indent = 40 - slog(INFO, "creating ebnf from grammar of size", len(grammar)) - out = '' - for t, p in grammar.iteritems(): - slog(INFO, "formatting rule", t) - if not len(p.rules): - slog(INFO, "ignoring " + t + " (has no rules)\n") - continue - out += t + ' ' * (indent - len(t)) + " = " + format_ebnf_rule(grammar, p.rules[0]) + '\n' - for rule in p.rules[1:]: - out += ' ' * indent + " | " + format_ebnf_rule(grammar, rule) + '\n' - out += ' ' * indent + ' ;\n' - return out - -def format_token(sym, tp): - return misc.pad('%token <' + sym + '>', 27) + misc.pad(sym, 20) + '/* ' + tp + ' */' - -def grammar_create_y(grammar, opts): - indent = ' ' * 40 - width = 0 - for t, p in grammar.iteritems(): - if p.term is not None: - continue - if len(t) > width: - width = len(t) - spaces = 0 - while spaces < width: - spaces += 8 - indent = '\t' * (spaces / 8) - - conf = opts['config'] - - out = "" - - # preamble - out += textwrap.dedent("""\ - %{ - #include - #include - #include - #include - #include - - #include - #include - - #include - - """) - - for f in opts['includes']: - out += '#include "' + f + '"' + '\n' - - out += "\nusing namespace " + opts['namespace'] + ';\n' - - #out += textwrap.dedent("""\ - # using namespace std; - - # namespace { - - # typedef vector wrap_t; - # const wrap_t curly_braces{ "{", "}" }; - # const wrap_t round_braces{ "(", ")" }; - - # } - - # #ifdef __cplusplus - # // extern "C" { - # #endif - - out += textwrap.dedent("""\ - %} - - """) - - # types - out += textwrap.dedent("""\ - %union { - """) - - types = grammar_get_types(grammar) - for t in types.keys(): - s = grammar[t] - if s.tp == p_regex: - continue - out += '\n\t' + opts['namespace'] + '::' + t + '_t *' + t + ';' - out += '\n' - - out += textwrap.dedent("""\ - } - - """) - - # yydecl - out += textwrap.dedent("""\ - %{ - // int FB_SYM(lex)(YYSTYPE *lval, struct vprun *vprun, void *scanner); - YY_DECL; - %} - """) - - # terminal tokens - out += '\n' - for t, p in grammar.iteritems(): - if p.tp == p_terminal: - out += format_token(p.sym, t) +'\n' - - # special tokens - out += '\n' - for t, p in grammar.iteritems(): - if p.tp == p_special: - if p.token == '?': # TODO: why is this among the symbols anyway? - continue - out += format_token(p.sym, t) +'\n' - - # tokens from grammar - out += '\n' - for t, p in grammar.iteritems(): - if p.tp == p_literal: - out += format_token(p.sym, t) +'\n' - - # tokens from config - for t, p in grammar.iteritems(): - if p.tp == p_regex: - out += format_token(t, "blah") + '\n' - - # types - out += '\n' - for t, p in grammar.iteritems(): - if p.tp == p_regex: - continue - if p.tp == p_ruleset: - out += misc.pad('%type <' + tok2sym(p.token) + '>', 40) + misc.pad(t, 35) + '/* ' + t + ' */' +'\n' - - # options - out += textwrap.dedent("""\ - - %define parse.error verbose - // %define lr.type ielr - %define api.pure full - %param { struct context *context } { void *scanner } - """) - - if opts['start'] is not None: - out += "%start " + opts['start'] - - # productions - out += '\n%%\n\n' - for t, p in grammar.iteritems(): - - if not len(p.rules): - continue - if p.tp == p_terminal: - continue - if p.tp == p_special: - continue - if p.tp == p_regex: - continue - slog(INFO, "creating production for symbol", p.str()) - - #if p.is_lexical_element is True: - # continue - if len(p.rules) == 0: - raise Exception("Symbol ", p.str(), "has no rules") - first = True - n_rule = 0 - for rule in p.rules: - n_rule += 1 - n = 0 - s = State() - if first: - out += t + ":" + (spaces - (len(t) + 1)) * ' ' + format_yacc_rule(rule) + "\n" - first = False - else: - out += indent + "| " + format_yacc_rule(rule) + "\n" - out += indent + "{" + "\n" - out += indent + "\t" + 'slog(PRI_NOTICE, "stack size = %d, %d / %d, %d", yyssp - &yyssa[0], yyss - &yyssa[0], yyvsp - &yyvsa[0], yyvs - &yyvsa[0]);\n' - out += indent + "\t" + "$$ = new " + opts['namespace'] + '::' + t + ";\n" - out += indent + "\t" + "$$->type = " + opts['namespace'] + '::' + t + "::t_" + str(n_rule) + ";\n" - tokens = [] - for c in rule: - if c.tp == t_target_lang: - tokens.append(c.token) - idx = 0 - for c in rule: - if c.tp == t_grammar: - s.update(c.token, 0) - continue - if c.token in tokens: - continue - n += 1 - p = grammar[c.token] - #if is_terminal(c.token) is not None: - # continue - if p.tp not in [ p_ruleset ]: - continue - if not p.is_payload: - continue - tp = tok2name(c.token) - suffix = '' - if tokens.count(c.token) > 1: - idx += 1 - suffix = '_' + str(idx) - out += indent + "\t" + \ - "$$->data.r" + str(n_rule) + '.' + member_prefix + tp + suffix + \ - " = new " + p.datatype + "(*$" + str(n) + ");\n" - out += indent + "}" + "\n" - out += indent + ";\n\n" - - # tail - out += '\n%%\n\n' - - out += textwrap.dedent(""" - #ifdef __cplusplus - // } /* extern "C" */ - #endif - """) - - return out + "\n" - -def grammar_create_l(grammar, opts): - - ignore = "" - conf = opts['config'] - - out = textwrap.dedent("""\ - %option reentrant - %option bison-bridge - - %{ - #include - - #define YY_USER_ACTION \\ - context->first_line = context->last_line; \\ - context->first_column = context->last_column; \\ - for(int i = 0; yytext[i] != '\\0'; i++) { \\ - if(yytext[i] == '\\n') { \\ - context->last_line++; \\ - context->last_column = 0; \\ - } else { \\ - context->last_column++; \\ - } \\ - } - """) - - for f in opts['includes']: - out += '#include "' + f + '"' + '\n' - - out += "\nusing namespace " + opts['namespace'] + ';\n' - - #out += textwrap.dedent("""\ - - # /* This is too late in the Flex generated file to work. Still lots of - # * prototypes are spat into it above it, which end up with C++ linkage, of - # * course, causing the linkages to be inconsistent to the functions below this - # * extern "C". Only way I found was to use C++ is to use it on Bison only, and - # * have Flex use C instead. */ - # #ifdef __cplusplus - # // extern "C" { - # #endif - - # #ifdef _REMOVE_ME - # static void get_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); - # static void get_based_string(YYSTYPE *yylval_param, yyscan_t yyscanner, int skip); - # #endif - - # %} - - out += textwrap.dedent("""\ - %} - - %% - - """) - - for t, p in grammar.iteritems(): - if p.term is not None: - # \. { return T_DOT; } - assert p.term[0] in [ '"', "'" ], p.term - assert p.term[-1] in [ '"', "'" ], p.term - out += re.escape(p.term[1:-1]) + ' { slog(PRI_NOTICE, "found terminal ' + p.sym + '"); return ' + p.sym + '; }\n' - - for t, p in grammar.iteritems(): - if p.tp == p_regex: - c = conf['symbols'][t] - lex_as = c.get('lex_as') - if lex_as is not None: - retval = lex_as.value() - else: - retval = t - regex = c['regex'].value() - out += regex + ' { slog(PRI_NOTICE, "found regex \\"' + regex + '\\" for ' + t + '"); return ' + retval + '; }\n' - - #out += textwrap.dedent("""\ - # - # %{/* basic_identifier */%} - # %{/* extended_identifier */%} - # %{/* based_integer */%} - # %{/* bit_value */%} - # %{/* numeric_literal */%} - # %{/* enumeration_literal */%} - # %{/* string_literal */%} - # %{/* bit_string_literal */%} - # %{/* character_literal */%} - # %{/* graphic_character */%} - # %{/* basic_character */%} - # %{/* integer */%} - # - # """) - - ignore += textwrap.dedent("""\ - - %{ /* not sure how to handle literals >> */ %} - \\"[ \\!#-~]*\\" | - \\'[0-1]\\' { - // get_string(yylval_param, yyscanner, 1); - /* Gets a string excluding " or ' */ - int skip = 1; - int i; - - for (i=skip; yytext[i]!='"' && yytext[i]!='\\'' && yytext[i]!=0; i++); - yytext[i] = 0; - YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); - lv->txt=(char *)malloc(i+1); - strcpy(lv->txt, yytext+skip); - - return STRING; - } - - #[0-9a-f]*# { - // get_based_string(yylval_param, yyscanner, 1); /* skip leading # */ - /* Gets a string excluding # */ - int i; - int skip = 1; - - for(i=skip; yytext[i] !='#' && yytext[i]!=0; i++); - yytext[i] = 0; - YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); - lv->txt = (char *)malloc(i+1); - strcpy(lv->txt, yytext + skip); - - return BASED; - } - - [a-zA-Z_$][a-zA-Z0-9_$.]* { - YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); - lv->txt=(char *)malloc(strlen(yytext)+1); - strcpy(lv->txt, yytext); - return NAME; - } - - [0-9]+ { - YYSTYPE *lv = FB_SYM(get_lval(yyscanner)); - sscanf(yytext, "%d", &lv->n); - return NATURAL; - } - - """) - - out += textwrap.dedent("""\ - . { - slog(PRI_NOTICE, "returning character '%c'", yytext[0]); - return yytext[0]; - } - - %{/* not sure how to handle literals << */%} - - %% - - void FB_SYM(error)(struct context *context, void *scanner, const char *msg) - { - struct yyguts_t *yyg =(struct yyguts_t*)scanner; - set_error(PRI_ERR, EINVAL, "%s at \\"%s\\" in line %d:%d", msg, yytext, context->last_line, context->last_column); - } - - int FB_SYM(wrap)(void *scanner) - { - return 1; - } - - struct vp_scanner { - YY_BUFFER_STATE buf; - void *scanner; - char *str; - }; - - /* utilities which need to be placed here, because I can't find - * yylex_init() / _destroy() in any generated header file (??) */ - struct vp_scanner *FB_SYM(init_scanner)(const char *str) - { - struct vp_scanner *r = (struct vp_scanner *)calloc(1, sizeof(*r)); - - yylex_init(&r->scanner); - r->str = strdup(str); - r->buf = yy_scan_string(r->str, r->scanner); - FB_SYM(set_extra)(r, r->scanner); - // yyset_in(stdin, r->scanner); - // yyset_out(stdout, r->scanner); - return r; - } - - void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner) - { - return scanner->scanner; - } - - void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner) - { - free(scanner->str); - yy_delete_buffer(scanner->buf, scanner->scanner); - yylex_destroy(scanner->scanner); - free(scanner); - } - - int FB_SYM(create_ast)(const char *str) - { - // TODO: Initialize this in a generated function - struct context context = { - first_line: 1, - last_line: 1, - first_column: 0, - last_column: 0 - }; - - FB_SYM(debug) = 1; - - struct vp_scanner *scanner = FB_SYM(init_scanner)(str); - int status = FB_SYM(parse)(&context, FB_SYM(scanner_get_data)(scanner)); - FB_SYM(cleanup_scanner)(scanner); - if (status) { - slog(PRI_ERR, "failed to parse (%s)", err()); - return -1; - } - - return 0; - } - - """) - - # #ifdef __cplusplus - # // } // extern "C" - # #endif - # - # """) - - return out - -def grammar_create_h(grammar, opts): - out = "#ifndef " + opts['mip'] + '\n#define ' + opts['mip'] + '\n\n' - ns = opts['namespace'] - - out += textwrap.dedent("""\ - - #define YY_NO_INPUT - #define YY_NO_UNPUT - // #define YY_NO_UNISTD_H - - struct context { - int first_line; - int last_line; - int first_column; - int last_column; - }; - - union YYSTYPE; - - #ifdef __cplusplus - extern "C" { - #endif - - struct vp_scanner; - - struct vp_scanner *FB_SYM(init_scanner)(const char *str); - void *FB_SYM(scanner_get_data)(const struct vp_scanner *scanner); - void FB_SYM(cleanup_scanner)(struct vp_scanner *scanner); - int FB_SYM(create_ast)(const char *str); - - void FB_SYM(error)(struct context *context, void *scanner, const char *s); - - #ifdef __cplusplus - } // extern "C" - #endif - - #define YY_DECL int FB_SYM(lex)(YYSTYPE *yylval_param, struct context *context, void *yyscanner) - - """) - - if ns is not None: - out += 'namespace ' + ns + '{\n\n' - - types = grammar_get_types(grammar) - - # struct forward declarations - for t, members in types.iteritems(): - s = grammar[t] - if s.tp == p_regex: - continue - if len(members): - out += '\nstruct ' + t + ';' - out += '\n' - - # struct / non-struct typedefs - for t, members in types.iteritems(): - s = grammar[t] - if s.tp == p_regex: - continue - if not len(members): - out += '\ntypedef const char ' + t + '_t;' - continue - out += '\ntypedef struct ' + t + ' ' + t + '_t;' - out += '\n' - - # struct definitions - for t, rules in types.iteritems(): - s = grammar[t] - if s.tp == p_regex: - continue - if not len(rules): - continue - out += '\n\nstruct ' + t + ' {\n' - - # rule structs - n = 0 - for rule in rules: - n += 1 - idx = 0 - out += '\n\tstruct ' + 'r' + str(n) + '_t {' - for m in rule: - suffix = '' - if rule.count(m) > 1: - idx += 1 - suffix = '_' + str(idx) - ms = grammar[m] - if ms.tp == p_regex: - continue - p = grammar[m] - out += '\n\t\t' + p.datatype + ' *' + member_prefix + m + suffix + ';' - out += '\n\t};' - - # type enum - n = 0 - out += '\n\n\tenum {' - for rule in rules: - n += 1 - out += '\n\t\tt_' + str(n) + ',' - out += '\n\t} type;' - out += '\n' - - # data union - n = 0 - out += '\n\tunion {' - for rule in rules: - n += 1 - out += '\n\t\tstruct ' + 'r' + str(n) + '_t r' + str(n) + ';' - out += '\n\t} data;' - - # struct done - out += '\n};' - - out += '\n' - - if ns is not None: - out += '\n} /* namespace ' + ns + '*/' - - out += '\n\n#endif /* #ifndef + ' + opts['mip'] + ' */' - - return out - -def grammar_fix_list_recursion(grammar): - dead_end = set() - for kl, l in grammar.iteritems(): - if not re.match('^list_', kl): - continue - for ks, s in grammar.iteritems(): - for rule in s.rules: - state = State() - for c in rule: - state.update(c.token, c.line) - if c.token == kl: - if state.is_optional(): - continue - dead_end.add(c.token) - for t in dead_end: - grammar[t].rules.insert(0, []) - return grammar - -class GrammarCmd(jwutils.Cmd): - - def __init__(self, name, help): - super(GrammarCmd, self).__init__(name, help=help) - - def add_parser(self, parsers): - p = super(GrammarCmd, self).add_parser(parsers) - p.add_argument("input", help="input file") - p.add_argument('-l', '--unroll-lists', help='unroll EBNF lists', action='store_true', default=False) - p.add_argument('-e', '--fix-extensions', help='fix EBNF prefix extensions (' + '|'.join(fix_extensions_mode) + ')', default=mode_concat) - p.add_argument('-o', '--unroll-options', help='unroll EBNF options', action='store_true', default=False) - p.add_argument('-a', '--unroll-alternatives', help='unroll EBNF alternatives', action='store_true', default=False) - p.add_argument('-w', '--replace-whitespace', help='replace white space in tokens by underscore characters', action='store_true', default=False) - p.add_argument('--check-symbols', help='check symbols, comma-separated or "all"', nargs='?', default='') - p.add_argument('-t', '--trim-symbols', help='trim grammar tree at symbol', nargs='?', default='') - p.add_argument('-r', '--irrelevant-symbols', help='exclude symbol from output payload', nargs='?', default='') - p.add_argument('-c', '--cut-symbols', help='cut grammar tree at symbol', nargs='?', default='') - p.add_argument('-s', '--start-symbols', help='use start-symbols', nargs='?', default=None) - p.add_argument('-f', '--config-file', help='config file', nargs='?', default=None) - return p - - def processGrammar(self, args, grammar): - - if args.config_file is not None: - config = serdes.read(args.config_file) - #config.dump(ERR) - grammar = grammar_add_configured_types(grammar, config) - if args.fix_extensions not in fix_extensions_mode: - raise Exception("Illegal argument ", args.fix_extensions, "to --fix-extensions") - grammar = grammar_fix_extensions(grammar, args.fix_extensions) - if args.unroll_lists: - grammar = grammar_unroll_lists(grammar) - if args.unroll_alternatives: - grammar = grammar_unroll_alternatives(grammar) - if args.unroll_options: - grammar = grammar_unroll_options(grammar) - grammar = grammar_fix_list_recursion(grammar) - #grammar['logical_expression'].dump(ERR) - if len(args.check_symbols): - check_symbols = [] - if args.check_symbols == 'all': - args.check_symbols = '' - check_symbols = args.check_symbols.split() - grammar_check(grammar, check_symbols) - if args.replace_whitespace: - grammar = grammar_replace_whitespace(grammar) - if len(args.trim_symbols): - grammar = grammar_trim_symbols(grammar, args.trim_symbols.split(',')) - if len(args.cut_symbols): - grammar = grammar_cut_symbols(grammar, args.cut_symbols.split(',')) - if len(args.irrelevant_symbols): - grammar = grammar_irrelevant_symbols(grammar, args.irrelevant_symbols.split(',')) - return grammar - -# ------------------------------------------------- TODO: clean this up > - -class DerivedGrammarCmd(GrammarCmd): - - def __init__(self, name, help): - super(DerivedGrammarCmd, self).__init__(name, help=help) - - @abstractmethod - def _run(self, grammar): - pass - - def _parse(self, contents): - return grammar_parse_ebnf(contents) - - def add_parser(self, parsers): - p = super(DerivedGrammarCmd, self).add_parser(parsers) - return p - - def run(self, args): - with open(args.input, 'r') as infile: - contents = infile.read() - grammar = self._parse(contents) - grammar = super(DerivedGrammarCmd, self).processGrammar(args, grammar) - self._run(args, grammar) - -class CmdCreate(DerivedGrammarCmd): - - def __init__(self): - super(CmdCreate, self).__init__("create", help="Create a file") - - def add_parser(self, parsers): - p = super(CmdCreate, self).add_parser(parsers) - p.add_argument("output", help="output file") - p.add_argument('--namespace', help='namespace of generated AST', default='parser') - p.add_argument('--includes', help='list of header files to be #included in C/C++ implementation files', default='') - return p - - def _run(self, args, grammar): - name, ext = os.path.splitext(args.output) - ext = ext[1:] - #cmd = getattr(sys.modules[__name__], 'create_' + re.sub(r'[-./]', '_', args.output)) - mip = None - if ext == 'h': - mip = args.namespace + re.sub(r'[-./]', '_', args.output).upper() - - includes = args.includes.split(',') - - config = None - if args.config_file is not None: - config = serdes.read(args.config_file) - - # generated code breaks without this, not sure why - if ext == 'l': - tmp = [] - for f in includes: - if not re.match('.*lex\..*\.h', f): - tmp.append(f) - includes = tmp - - opts = { - "namespace" : args.namespace, - "includes" : includes, - "mip" : mip, - "config" : config, - "start" : args.start_symbols - } - - cmd = getattr(sys.modules[__name__], 'grammar_create_' + ext) - out = cmd(grammar, opts) - print(out) - -class CmdCheck(DerivedGrammarCmd): - - def __init__(self): - super(CmdCheck, self).__init__("check", help="Check grammar") - - def add_parser(self, parsers): - p = super(CmdCheck, self).add_parser(parsers) - return p - - def _run(self, args, grammar): - pass