aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile4
-rw-r--r--deps.mk54
-rw-r--r--src/urubu/compressor.go (renamed from src/urubu/compressor/compressor.go)0
-rw-r--r--src/urubu/driver/lexer.go (renamed from src/urubu/driver/lexer/template.go)398
-rw-r--r--src/urubu/driver/lexer/lexer.go335
-rw-r--r--src/urubu/driver/lexer/spec.go71
-rw-r--r--src/urubu/driver/parser.go1439
-rw-r--r--src/urubu/driver/parser/parser.go416
-rw-r--r--src/urubu/driver/parser/semantic_action.go371
-rw-r--r--src/urubu/driver/parser/spec.go73
-rw-r--r--src/urubu/driver/parser/template.go535
-rw-r--r--src/urubu/driver/parser/token_stream.go65
-rw-r--r--src/urubu/error.go (renamed from src/urubu/error/error.go)0
-rw-r--r--src/urubu/grammar.go2911
-rw-r--r--src/urubu/grammar/first.go148
-rw-r--r--src/urubu/grammar/grammar.go1390
-rw-r--r--src/urubu/grammar/item.go206
-rw-r--r--src/urubu/grammar/lalr1.go318
-rw-r--r--src/urubu/grammar/lexical.go (renamed from src/urubu/grammar/lexical/compiler.go)164
-rw-r--r--src/urubu/grammar/lexical/dfa.go (renamed from src/urubu/grammar/lexical/dfa/tree.go)343
-rw-r--r--src/urubu/grammar/lexical/dfa/dfa.go173
-rw-r--r--src/urubu/grammar/lexical/dfa/symbol_position.go182
-rw-r--r--src/urubu/grammar/lexical/entry.go171
-rw-r--r--src/urubu/grammar/lexical/parser.go1668
-rw-r--r--src/urubu/grammar/lexical/parser/error.go36
-rw-r--r--src/urubu/grammar/lexical/parser/fragment.go72
-rw-r--r--src/urubu/grammar/lexical/parser/lexer.go594
-rw-r--r--src/urubu/grammar/lexical/parser/parser.go531
-rw-r--r--src/urubu/grammar/lexical/parser/tree.go459
-rw-r--r--src/urubu/grammar/lr0.go197
-rw-r--r--src/urubu/grammar/parsing_table.go553
-rw-r--r--src/urubu/grammar/production.go117
-rw-r--r--src/urubu/grammar/semantic_error.go30
-rw-r--r--src/urubu/grammar/symbol.go (renamed from src/urubu/grammar/symbol/symbol.go)0
-rw-r--r--src/urubu/spec/grammar.go (renamed from src/urubu/spec/grammar/grammar.go)93
-rw-r--r--src/urubu/spec/grammar/clexspec.json (renamed from src/urubu/spec/grammar/parser/clexspec.json)0
-rw-r--r--src/urubu/spec/grammar/description.go71
-rw-r--r--src/urubu/spec/grammar/lexspec.json (renamed from src/urubu/spec/grammar/parser/lexspec.json)0
-rw-r--r--src/urubu/spec/grammar/parser.go (renamed from src/urubu/spec/grammar/parser/vartan_lexer.go)911
-rw-r--r--src/urubu/spec/grammar/parser/lexer.go297
-rw-r--r--src/urubu/spec/grammar/parser/parser.go582
-rw-r--r--src/urubu/spec/grammar/parser/syntax_error.go45
-rw-r--r--src/urubu/spec/grammar/util.go21
-rw-r--r--src/urubu/spec/test.go (renamed from src/urubu/spec/test/tree_lexer.go)1402
-rw-r--r--src/urubu/spec/test/parser.go336
-rw-r--r--src/urubu/spec/test/tree_parser.go716
-rw-r--r--src/urubu/spec/test/tree_semantic_action.go367
-rw-r--r--src/urubu/spec/tree-report.json (renamed from src/urubu/spec/test/tree-report.json)0
-rw-r--r--src/urubu/spec/tree.json (renamed from src/urubu/spec/test/tree.json)0
-rw-r--r--src/urubu/spec/tree.vartan (renamed from src/urubu/spec/test/tree.vartan)0
-rw-r--r--src/urubu/tester.go (renamed from src/urubu/tester/tester.go)0
-rw-r--r--src/urubu/ucd.go (renamed from src/urubu/ucd/codepoint.go)649
-rw-r--r--src/urubu/ucd/api.go180
-rw-r--r--src/urubu/ucd/codepoint.go.tmpl65
-rw-r--r--src/urubu/ucd/parser.go155
-rw-r--r--src/urubu/ucd/prop_list.go50
-rw-r--r--src/urubu/ucd/property.go95
-rw-r--r--src/urubu/ucd/property_value_aliases.go82
-rw-r--r--src/urubu/ucd/scripts.go52
-rw-r--r--src/urubu/ucd/unicode_data.go56
-rw-r--r--src/urubu/utf8.go (renamed from src/urubu/utf8/utf8.go)0
61 files changed, 10003 insertions, 10246 deletions
diff --git a/Makefile b/Makefile
index 2e7680a..cbb1225 100644
--- a/Makefile
+++ b/Makefile
@@ -125,11 +125,11 @@ $(libs.a): src/$(NAME).go src/version.go
$(existing.a):
- go tool compile -I src -o $@ -p `echo $* | sed 's,^src/,,'` `find $*/*.go`
+ go tool compile -I src -o $@ -p `echo $* | sed 's,^src/,,'` $*.go
$(xtests.a):
p="`echo $* | sed 's,^tests/unit/,urubu/,'`"; \
- go tool compile -I src -o $@ -p $$p src/$$p/*.go $*/*.go
+ go tool compile -I src -o $@ -p $$p src/$$p.go $*/*.go
$(xmains.a):
go tool compile -I src -o $@ -p main $*/*.go
diff --git a/deps.mk b/deps.mk
index 0aab710..8f598fd 100644
--- a/deps.mk
+++ b/deps.mk
@@ -100,21 +100,21 @@ src/urubu/cmd/vartan-go.bin: src/urubu/cmd/vartan-go.a
-src/urubu/compressor.a: src/urubu/compressor/compressor.go
-src/urubu/error.a: src/urubu/error/error.go
-src/urubu/grammar/symbol.a: src/urubu/grammar/symbol/symbol.go
-src/urubu/spec/grammar.a: src/urubu/spec/grammar/description.go src/urubu/spec/grammar/grammar.go src/urubu/spec/grammar/util.go
-src/urubu/spec/test.a: src/urubu/spec/test/parser.go src/urubu/spec/test/tree_lexer.go src/urubu/spec/test/tree_parser.go src/urubu/spec/test/tree_semantic_action.go
-src/urubu/ucd.a: src/urubu/ucd/api.go src/urubu/ucd/codepoint.go src/urubu/ucd/parser.go src/urubu/ucd/property.go src/urubu/ucd/property_value_aliases.go src/urubu/ucd/prop_list.go src/urubu/ucd/scripts.go src/urubu/ucd/unicode_data.go
-src/urubu/utf8.a: src/urubu/utf8/utf8.go
-src/urubu/spec/grammar/parser.a: src/urubu/spec/grammar/parser/lexer.go src/urubu/spec/grammar/parser/parser.go src/urubu/spec/grammar/parser/syntax_error.go src/urubu/spec/grammar/parser/vartan_lexer.go
-src/urubu/grammar.a: src/urubu/grammar/first.go src/urubu/grammar/grammar.go src/urubu/grammar/item.go src/urubu/grammar/lalr1.go src/urubu/grammar/lr0.go src/urubu/grammar/parsing_table.go src/urubu/grammar/production.go src/urubu/grammar/semantic_error.go
-src/urubu/tester.a: src/urubu/tester/tester.go
-src/urubu/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa/dfa.go src/urubu/grammar/lexical/dfa/symbol_position.go src/urubu/grammar/lexical/dfa/tree.go
-src/urubu/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser/error.go src/urubu/grammar/lexical/parser/fragment.go src/urubu/grammar/lexical/parser/lexer.go src/urubu/grammar/lexical/parser/parser.go src/urubu/grammar/lexical/parser/tree.go
-src/urubu/grammar/lexical.a: src/urubu/grammar/lexical/compiler.go src/urubu/grammar/lexical/entry.go
-src/urubu/driver/lexer.a: src/urubu/driver/lexer/lexer.go src/urubu/driver/lexer/spec.go src/urubu/driver/lexer/spec.go
-src/urubu/driver/parser.a: src/urubu/driver/parser/parser.go src/urubu/driver/parser/semantic_action.go src/urubu/driver/parser/spec.go src/urubu/driver/parser/template.go src/urubu/driver/parser/token_stream.go
+src/urubu/compressor.a: src/urubu/compressor.go
+src/urubu/error.a: src/urubu/error.go
+src/urubu/grammar/symbol.a: src/urubu/grammar/symbol.go
+src/urubu/spec/grammar.a: src/urubu/spec/grammar.go
+src/urubu/spec/test.a: src/urubu/spec/test.go
+src/urubu/ucd.a: src/urubu/ucd.go
+src/urubu/utf8.a: src/urubu/utf8.go
+src/urubu/spec/grammar/parser.a: src/urubu/spec/grammar/parser.go
+src/urubu/grammar.a: src/urubu/grammar.go
+src/urubu/tester.a: src/urubu/tester.go
+src/urubu/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa.go
+src/urubu/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser.go
+src/urubu/grammar/lexical.a: src/urubu/grammar/lexical.go
+src/urubu/driver/lexer.a: src/urubu/driver/lexer.go
+src/urubu/driver/parser.a: src/urubu/driver/parser.go
src/urubu/cmd/ucdgen.a: src/urubu/cmd/ucdgen/main.go
src/urubu/cmd/vartan.a: src/urubu/cmd/vartan/compile.go src/urubu/cmd/vartan/main.go src/urubu/cmd/vartan/parse.go src/urubu/cmd/vartan/root.go src/urubu/cmd/vartan/show.go src/urubu/cmd/vartan/test.go
@@ -122,15 +122,15 @@ src/urubu/cmd/vartan-go.a: src/urubu/cmd/vartan-go/generate.go src/urubu/cmd/var
-tests/unit/compressor.a: src/urubu/compressor/compressor.go tests/unit/compressor/compressor_test.go
-tests/unit/grammar/symbol.a: src/urubu/grammar/symbol/symbol.go tests/unit/grammar/symbol/symbol_test.go
-tests/unit/spec/test.a: src/urubu/spec/test/parser.go src/urubu/spec/test/tree_lexer.go src/urubu/spec/test/tree_parser.go src/urubu/spec/test/tree_semantic_action.go tests/unit/spec/test/parser_test.go
-tests/unit/utf8.a: src/urubu/utf8/utf8.go tests/unit/utf8/utf8_test.go
-tests/unit/spec/grammar/parser.a: src/urubu/spec/grammar/parser/lexer.go src/urubu/spec/grammar/parser/parser.go src/urubu/spec/grammar/parser/syntax_error.go src/urubu/spec/grammar/parser/vartan_lexer.go tests/unit/spec/grammar/parser/lexer_test.go tests/unit/spec/grammar/parser/parser_test.go
-tests/unit/grammar.a: src/urubu/grammar/first.go src/urubu/grammar/grammar.go src/urubu/grammar/item.go src/urubu/grammar/lalr1.go src/urubu/grammar/lr0.go src/urubu/grammar/parsing_table.go src/urubu/grammar/production.go src/urubu/grammar/semantic_error.go tests/unit/grammar/first_test.go tests/unit/grammar/grammar_test.go tests/unit/grammar/lalr1_test.go tests/unit/grammar/lr0_test.go tests/unit/grammar/parsing_table_test.go tests/unit/grammar/test_helper_test.go
-tests/unit/tester.a: src/urubu/tester/tester.go tests/unit/tester/tester_test.go
-tests/unit/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa/dfa.go src/urubu/grammar/lexical/dfa/symbol_position.go src/urubu/grammar/lexical/dfa/tree.go tests/unit/grammar/lexical/dfa/dfa_test.go tests/unit/grammar/lexical/dfa/symbol_position_test.go tests/unit/grammar/lexical/dfa/tree_test.go
-tests/unit/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser/error.go src/urubu/grammar/lexical/parser/fragment.go src/urubu/grammar/lexical/parser/lexer.go src/urubu/grammar/lexical/parser/parser.go src/urubu/grammar/lexical/parser/tree.go tests/unit/grammar/lexical/parser/lexer_test.go tests/unit/grammar/lexical/parser/parser_test.go
-tests/unit/grammar/lexical.a: src/urubu/grammar/lexical/compiler.go src/urubu/grammar/lexical/entry.go tests/unit/grammar/lexical/compiler_test.go
-tests/unit/driver/lexer.a: src/urubu/driver/lexer/lexer.go src/urubu/driver/lexer/spec.go src/urubu/driver/lexer/spec.go tests/unit/driver/lexer/lexer_test.go
-tests/unit/driver/parser.a: src/urubu/driver/parser/parser.go src/urubu/driver/parser/semantic_action.go src/urubu/driver/parser/spec.go src/urubu/driver/parser/template.go src/urubu/driver/parser/token_stream.go tests/unit/driver/parser/conflict_test.go tests/unit/driver/parser/lac_test.go tests/unit/driver/parser/parser_test.go tests/unit/driver/parser/semantic_action_test.go tests/unit/driver/parser/syntax_error_test.go
+tests/unit/compressor.a: src/urubu/compressor.go tests/unit/compressor/compressor_test.go
+tests/unit/grammar/symbol.a: src/urubu/grammar/symbol.go tests/unit/grammar/symbol/symbol_test.go
+tests/unit/spec/test.a: src/urubu/spec/test.go tests/unit/spec/test/parser_test.go
+tests/unit/utf8.a: src/urubu/utf8.go tests/unit/utf8/utf8_test.go
+tests/unit/spec/grammar/parser.a: src/urubu/spec/grammar/parser.go tests/unit/spec/grammar/parser/lexer_test.go tests/unit/spec/grammar/parser/parser_test.go
+tests/unit/grammar.a: src/urubu/grammar.go tests/unit/grammar/first_test.go tests/unit/grammar/grammar_test.go tests/unit/grammar/lalr1_test.go tests/unit/grammar/lr0_test.go tests/unit/grammar/parsing_table_test.go tests/unit/grammar/test_helper_test.go
+tests/unit/tester.a: src/urubu/tester.go tests/unit/tester/tester_test.go
+tests/unit/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa.go tests/unit/grammar/lexical/dfa/dfa_test.go tests/unit/grammar/lexical/dfa/symbol_position_test.go tests/unit/grammar/lexical/dfa/tree_test.go
+tests/unit/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser.go tests/unit/grammar/lexical/parser/lexer_test.go tests/unit/grammar/lexical/parser/parser_test.go
+tests/unit/grammar/lexical.a: src/urubu/grammar/lexical.go tests/unit/grammar/lexical/compiler_test.go
+tests/unit/driver/lexer.a: src/urubu/driver/lexer.go tests/unit/driver/lexer/lexer_test.go
+tests/unit/driver/parser.a: src/urubu/driver/parser.go tests/unit/driver/parser/conflict_test.go tests/unit/driver/parser/lac_test.go tests/unit/driver/parser/parser_test.go tests/unit/driver/parser/semantic_action_test.go tests/unit/driver/parser/syntax_error_test.go
diff --git a/src/urubu/compressor/compressor.go b/src/urubu/compressor.go
index cdfeacb..cdfeacb 100644
--- a/src/urubu/compressor/compressor.go
+++ b/src/urubu/compressor.go
diff --git a/src/urubu/driver/lexer/template.go b/src/urubu/driver/lexer.go
index 35dfd93..7423668 100644
--- a/src/urubu/driver/lexer/template.go
+++ b/src/urubu/driver/lexer.go
@@ -8,6 +8,7 @@ import (
"go/format"
"go/parser"
"go/token"
+ "io"
"strings"
"text/template"
@@ -15,6 +16,403 @@ import (
spec "urubu/spec/grammar"
)
+type ModeID int
+
+func (id ModeID) Int() int {
+ return int(id)
+}
+
+type StateID int
+
+func (id StateID) Int() int {
+ return int(id)
+}
+
+type KindID int
+
+func (id KindID) Int() int {
+ return int(id)
+}
+
+type ModeKindID int
+
+func (id ModeKindID) Int() int {
+ return int(id)
+}
+
+type LexSpec interface {
+ InitialMode() ModeID
+ Pop(mode ModeID, modeKind ModeKindID) bool
+ Push(mode ModeID, modeKind ModeKindID) (ModeID, bool)
+ ModeName(mode ModeID) string
+ InitialState(mode ModeID) StateID
+ NextState(mode ModeID, state StateID, v int) (StateID, bool)
+ Accept(mode ModeID, state StateID) (ModeKindID, bool)
+ KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string)
+}
+
+// Token represents a token.
+type Token struct {
+ // ModeID is an ID of a lex mode.
+ ModeID ModeID
+
+ // KindID is an ID of a kind. This is unique among all modes.
+ KindID KindID
+
+ // ModeKindID is an ID of a lexical kind. This is unique only within a mode.
+ // Note that you need to use KindID field if you want to identify a kind across all modes.
+ ModeKindID ModeKindID
+
+ // BytePos is a byte position where a token appears.
+ BytePos int
+
+ // ByteLen is a length of a token.
+ ByteLen int
+
+ // Row is a row number where a token appears.
+ Row int
+
+ // Col is a column number where a token appears.
+ // Note that Col is counted in code points, not bytes.
+ Col int
+
+ // Lexeme is a byte sequence matched a pattern of a lexical specification.
+ Lexeme []byte
+
+ // When this field is true, it means the token is the EOF token.
+ EOF bool
+
+ // When this field is true, it means the token is an error token.
+ Invalid bool
+}
+
+type LexerOption func(l *Lexer) error
+
+// DisableModeTransition disables the active mode transition. Thus, even if the lexical specification has the push and pop
+// operations, the lexer doesn't perform these operations. When the lexical specification has multiple modes, and this option is
+// enabled, you need to call the Lexer.Push and Lexer.Pop methods to perform the mode transition. You can use the Lexer.Mode method
+// to know the current lex mode.
+func DisableModeTransition() LexerOption {
+ return func(l *Lexer) error {
+ l.passiveModeTran = true
+ return nil
+ }
+}
+
+type lexerState struct {
+ srcPtr int
+ row int
+ col int
+}
+
+type Lexer struct {
+ spec LexSpec
+ src []byte
+ state lexerState
+ lastAcceptedState lexerState
+ tokBuf []*Token
+ modeStack []ModeID
+ passiveModeTran bool
+}
+
+// NewLexer returns a new lexer.
+func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) {
+ b, err := io.ReadAll(src)
+ if err != nil {
+ return nil, err
+ }
+ l := &Lexer{
+ spec: spec,
+ src: b,
+ state: lexerState{
+ srcPtr: 0,
+ row: 0,
+ col: 0,
+ },
+ lastAcceptedState: lexerState{
+ srcPtr: 0,
+ row: 0,
+ col: 0,
+ },
+ modeStack: []ModeID{
+ spec.InitialMode(),
+ },
+ passiveModeTran: false,
+ }
+ for _, opt := range opts {
+ err := opt(l)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return l, nil
+}
+
+// Next returns a next token.
+func (l *Lexer) Next() (*Token, error) {
+ if len(l.tokBuf) > 0 {
+ tok := l.tokBuf[0]
+ l.tokBuf = l.tokBuf[1:]
+ return tok, nil
+ }
+
+ tok, err := l.nextAndTransition()
+ if err != nil {
+ return nil, err
+ }
+ if !tok.Invalid {
+ return tok, nil
+ }
+ errTok := tok
+ for {
+ tok, err = l.nextAndTransition()
+ if err != nil {
+ return nil, err
+ }
+ if !tok.Invalid {
+ break
+ }
+ errTok.ByteLen += tok.ByteLen
+ errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...)
+ }
+ l.tokBuf = append(l.tokBuf, tok)
+
+ return errTok, nil
+}
+
+func (l *Lexer) nextAndTransition() (*Token, error) {
+ tok, err := l.next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.EOF || tok.Invalid {
+ return tok, nil
+ }
+ if l.passiveModeTran {
+ return tok, nil
+ }
+ mode := l.Mode()
+ if l.spec.Pop(mode, tok.ModeKindID) {
+ err := l.PopMode()
+ if err != nil {
+ return nil, err
+ }
+ }
+ if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok {
+ l.PushMode(mode)
+ }
+	// The length of the mode stack must be checked after the pop and push operations because those operations can be performed
+ // at the same time. When the mode stack has just one element and popped it, the mode stack will be temporarily emptied.
+ // However, since a push operation may be performed immediately after it, the lexer allows the stack to be temporarily empty.
+ if len(l.modeStack) == 0 {
+ return nil, fmt.Errorf("a mode stack must have at least one element")
+ }
+ return tok, nil
+}
+
+func (l *Lexer) next() (*Token, error) {
+ mode := l.Mode()
+ state := l.spec.InitialState(mode)
+ buf := []byte{}
+ startPos := l.state.srcPtr
+ row := l.state.row
+ col := l.state.col
+ var tok *Token
+ for {
+ v, eof := l.read()
+ if eof {
+ if tok != nil {
+ l.revert()
+ return tok, nil
+ }
+ // When `buf` has unaccepted data and reads the EOF, the lexer treats the buffered data as an invalid token.
+ if len(buf) > 0 {
+ return &Token{
+ ModeID: mode,
+ ModeKindID: 0,
+ BytePos: startPos,
+ ByteLen: l.state.srcPtr - startPos,
+ Lexeme: buf,
+ Row: row,
+ Col: col,
+ Invalid: true,
+ }, nil
+ }
+ return &Token{
+ ModeID: mode,
+ ModeKindID: 0,
+ BytePos: startPos,
+ Row: row,
+ Col: col,
+ EOF: true,
+ }, nil
+ }
+ buf = append(buf, v)
+ nextState, ok := l.spec.NextState(mode, state, int(v))
+ if !ok {
+ if tok != nil {
+ l.revert()
+ return tok, nil
+ }
+ return &Token{
+ ModeID: mode,
+ ModeKindID: 0,
+ BytePos: startPos,
+ ByteLen: l.state.srcPtr - startPos,
+ Lexeme: buf,
+ Row: row,
+ Col: col,
+ Invalid: true,
+ }, nil
+ }
+ state = nextState
+ if modeKindID, ok := l.spec.Accept(mode, state); ok {
+ kindID, _ := l.spec.KindIDAndName(mode, modeKindID)
+ tok = &Token{
+ ModeID: mode,
+ KindID: kindID,
+ ModeKindID: modeKindID,
+ BytePos: startPos,
+ ByteLen: l.state.srcPtr - startPos,
+ Lexeme: buf,
+ Row: row,
+ Col: col,
+ }
+ l.accept()
+ }
+ }
+}
+
+// Mode returns the current lex mode.
+func (l *Lexer) Mode() ModeID {
+ return l.modeStack[len(l.modeStack)-1]
+}
+
+// PushMode adds a lex mode onto the mode stack.
+func (l *Lexer) PushMode(mode ModeID) {
+ l.modeStack = append(l.modeStack, mode)
+}
+
+// PopMode removes a lex mode from the top of the mode stack.
+func (l *Lexer) PopMode() error {
+ sLen := len(l.modeStack)
+ if sLen == 0 {
+ return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more")
+ }
+ l.modeStack = l.modeStack[:sLen-1]
+ return nil
+}
+
+func (l *Lexer) read() (byte, bool) {
+ if l.state.srcPtr >= len(l.src) {
+ return 0, true
+ }
+
+ b := l.src[l.state.srcPtr]
+ l.state.srcPtr++
+
+ // Count the token positions.
+ // The driver treats LF as the end of lines and counts columns in code points, not bytes.
+ // To count in code points, we refer to the First Byte column in the Table 3-6.
+ //
+ // Reference:
+ // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. UTF-8 Bit Distribution
+ if b < 128 {
+ // 0x0A is LF.
+ if b == 0x0A {
+ l.state.row++
+ l.state.col = 0
+ } else {
+ l.state.col++
+ }
+ } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 {
+ l.state.col++
+ }
+
+ return b, false
+}
+
+// accept saves the current state.
+func (l *Lexer) accept() {
+ l.lastAcceptedState = l.state
+}
+
+// revert reverts the lexer state to the last accepted state.
+//
+// We must not call this function consecutively.
+func (l *Lexer) revert() {
+ l.state = l.lastAcceptedState
+}
+
+type lexSpec struct {
+ spec *spec.LexicalSpec
+}
+
+func NewLexSpec(spec *spec.LexicalSpec) *lexSpec {
+ return &lexSpec{
+ spec: spec,
+ }
+}
+
+func (s *lexSpec) InitialMode() ModeID {
+ return ModeID(s.spec.InitialModeID.Int())
+}
+
+func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool {
+ return s.spec.Specs[mode].Pop[modeKind] == 1
+}
+
+func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) {
+ modeID := s.spec.Specs[mode].Push[modeKind]
+ return ModeID(modeID.Int()), !modeID.IsNil()
+}
+
+func (s *lexSpec) ModeName(mode ModeID) string {
+ return s.spec.ModeNames[mode].String()
+}
+
+func (s *lexSpec) InitialState(mode ModeID) StateID {
+ return StateID(s.spec.Specs[mode].DFA.InitialStateID.Int())
+}
+
+func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) {
+ switch s.spec.CompressionLevel {
+ case 2:
+ tran := s.spec.Specs[mode].DFA.Transition
+ rowNum := tran.RowNums[state]
+ d := tran.UniqueEntries.RowDisplacement[rowNum]
+ if tran.UniqueEntries.Bounds[d+v] != rowNum {
+ return StateID(tran.UniqueEntries.EmptyValue.Int()), false
+ }
+ return StateID(tran.UniqueEntries.Entries[d+v].Int()), true
+ case 1:
+ tran := s.spec.Specs[mode].DFA.Transition
+ next := tran.UncompressedUniqueEntries[tran.RowNums[state]*tran.OriginalColCount+v]
+ if next == spec.StateIDNil {
+ return StateID(spec.StateIDNil.Int()), false
+ }
+ return StateID(next.Int()), true
+ }
+
+ modeSpec := s.spec.Specs[mode]
+ next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v]
+ if next == spec.StateIDNil {
+ return StateID(spec.StateIDNil), false
+ }
+ return StateID(next.Int()), true
+}
+
+func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) {
+ modeKindID := s.spec.Specs[mode].DFA.AcceptingStates[state]
+ return ModeKindID(modeKindID.Int()), modeKindID != spec.LexModeKindIDNil
+}
+
+func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) {
+ kindID := s.spec.KindIDs[mode][modeKind]
+ return KindID(kindID.Int()), s.spec.KindNames[kindID].String()
+}
+
// go:embed lexer.go
var lexerCoreSrc string
diff --git a/src/urubu/driver/lexer/lexer.go b/src/urubu/driver/lexer/lexer.go
deleted file mode 100644
index 3f9712e..0000000
--- a/src/urubu/driver/lexer/lexer.go
+++ /dev/null
@@ -1,335 +0,0 @@
-package lexer
-
-import (
- "fmt"
- "io"
-)
-
-type ModeID int
-
-func (id ModeID) Int() int {
- return int(id)
-}
-
-type StateID int
-
-func (id StateID) Int() int {
- return int(id)
-}
-
-type KindID int
-
-func (id KindID) Int() int {
- return int(id)
-}
-
-type ModeKindID int
-
-func (id ModeKindID) Int() int {
- return int(id)
-}
-
-type LexSpec interface {
- InitialMode() ModeID
- Pop(mode ModeID, modeKind ModeKindID) bool
- Push(mode ModeID, modeKind ModeKindID) (ModeID, bool)
- ModeName(mode ModeID) string
- InitialState(mode ModeID) StateID
- NextState(mode ModeID, state StateID, v int) (StateID, bool)
- Accept(mode ModeID, state StateID) (ModeKindID, bool)
- KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string)
-}
-
-// Token representes a token.
-type Token struct {
- // ModeID is an ID of a lex mode.
- ModeID ModeID
-
- // KindID is an ID of a kind. This is unique among all modes.
- KindID KindID
-
- // ModeKindID is an ID of a lexical kind. This is unique only within a mode.
- // Note that you need to use KindID field if you want to identify a kind across all modes.
- ModeKindID ModeKindID
-
- // BytePos is a byte position where a token appears.
- BytePos int
-
- // ByteLen is a length of a token.
- ByteLen int
-
- // Row is a row number where a token appears.
- Row int
-
- // Col is a column number where a token appears.
- // Note that Col is counted in code points, not bytes.
- Col int
-
- // Lexeme is a byte sequence matched a pattern of a lexical specification.
- Lexeme []byte
-
- // When this field is true, it means the token is the EOF token.
- EOF bool
-
- // When this field is true, it means the token is an error token.
- Invalid bool
-}
-
-type LexerOption func(l *Lexer) error
-
-// DisableModeTransition disables the active mode transition. Thus, even if the lexical specification has the push and pop
-// operations, the lexer doesn't perform these operations. When the lexical specification has multiple modes, and this option is
-// enabled, you need to call the Lexer.Push and Lexer.Pop methods to perform the mode transition. You can use the Lexer.Mode method
-// to know the current lex mode.
-func DisableModeTransition() LexerOption {
- return func(l *Lexer) error {
- l.passiveModeTran = true
- return nil
- }
-}
-
-type lexerState struct {
- srcPtr int
- row int
- col int
-}
-
-type Lexer struct {
- spec LexSpec
- src []byte
- state lexerState
- lastAcceptedState lexerState
- tokBuf []*Token
- modeStack []ModeID
- passiveModeTran bool
-}
-
-// NewLexer returns a new lexer.
-func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) {
- b, err := io.ReadAll(src)
- if err != nil {
- return nil, err
- }
- l := &Lexer{
- spec: spec,
- src: b,
- state: lexerState{
- srcPtr: 0,
- row: 0,
- col: 0,
- },
- lastAcceptedState: lexerState{
- srcPtr: 0,
- row: 0,
- col: 0,
- },
- modeStack: []ModeID{
- spec.InitialMode(),
- },
- passiveModeTran: false,
- }
- for _, opt := range opts {
- err := opt(l)
- if err != nil {
- return nil, err
- }
- }
-
- return l, nil
-}
-
-// Next returns a next token.
-func (l *Lexer) Next() (*Token, error) {
- if len(l.tokBuf) > 0 {
- tok := l.tokBuf[0]
- l.tokBuf = l.tokBuf[1:]
- return tok, nil
- }
-
- tok, err := l.nextAndTransition()
- if err != nil {
- return nil, err
- }
- if !tok.Invalid {
- return tok, nil
- }
- errTok := tok
- for {
- tok, err = l.nextAndTransition()
- if err != nil {
- return nil, err
- }
- if !tok.Invalid {
- break
- }
- errTok.ByteLen += tok.ByteLen
- errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...)
- }
- l.tokBuf = append(l.tokBuf, tok)
-
- return errTok, nil
-}
-
-func (l *Lexer) nextAndTransition() (*Token, error) {
- tok, err := l.next()
- if err != nil {
- return nil, err
- }
- if tok.EOF || tok.Invalid {
- return tok, nil
- }
- if l.passiveModeTran {
- return tok, nil
- }
- mode := l.Mode()
- if l.spec.Pop(mode, tok.ModeKindID) {
- err := l.PopMode()
- if err != nil {
- return nil, err
- }
- }
- if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok {
- l.PushMode(mode)
- }
- // The checking length of the mode stack must be at after pop and push operations because those operations can be performed
- // at the same time. When the mode stack has just one element and popped it, the mode stack will be temporarily emptied.
- // However, since a push operation may be performed immediately after it, the lexer allows the stack to be temporarily empty.
- if len(l.modeStack) == 0 {
- return nil, fmt.Errorf("a mode stack must have at least one element")
- }
- return tok, nil
-}
-
-func (l *Lexer) next() (*Token, error) {
- mode := l.Mode()
- state := l.spec.InitialState(mode)
- buf := []byte{}
- startPos := l.state.srcPtr
- row := l.state.row
- col := l.state.col
- var tok *Token
- for {
- v, eof := l.read()
- if eof {
- if tok != nil {
- l.revert()
- return tok, nil
- }
- // When `buf` has unaccepted data and reads the EOF, the lexer treats the buffered data as an invalid token.
- if len(buf) > 0 {
- return &Token{
- ModeID: mode,
- ModeKindID: 0,
- BytePos: startPos,
- ByteLen: l.state.srcPtr - startPos,
- Lexeme: buf,
- Row: row,
- Col: col,
- Invalid: true,
- }, nil
- }
- return &Token{
- ModeID: mode,
- ModeKindID: 0,
- BytePos: startPos,
- Row: row,
- Col: col,
- EOF: true,
- }, nil
- }
- buf = append(buf, v)
- nextState, ok := l.spec.NextState(mode, state, int(v))
- if !ok {
- if tok != nil {
- l.revert()
- return tok, nil
- }
- return &Token{
- ModeID: mode,
- ModeKindID: 0,
- BytePos: startPos,
- ByteLen: l.state.srcPtr - startPos,
- Lexeme: buf,
- Row: row,
- Col: col,
- Invalid: true,
- }, nil
- }
- state = nextState
- if modeKindID, ok := l.spec.Accept(mode, state); ok {
- kindID, _ := l.spec.KindIDAndName(mode, modeKindID)
- tok = &Token{
- ModeID: mode,
- KindID: kindID,
- ModeKindID: modeKindID,
- BytePos: startPos,
- ByteLen: l.state.srcPtr - startPos,
- Lexeme: buf,
- Row: row,
- Col: col,
- }
- l.accept()
- }
- }
-}
-
-// Mode returns the current lex mode.
-func (l *Lexer) Mode() ModeID {
- return l.modeStack[len(l.modeStack)-1]
-}
-
-// PushMode adds a lex mode onto the mode stack.
-func (l *Lexer) PushMode(mode ModeID) {
- l.modeStack = append(l.modeStack, mode)
-}
-
-// PopMode removes a lex mode from the top of the mode stack.
-func (l *Lexer) PopMode() error {
- sLen := len(l.modeStack)
- if sLen == 0 {
- return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more")
- }
- l.modeStack = l.modeStack[:sLen-1]
- return nil
-}
-
-func (l *Lexer) read() (byte, bool) {
- if l.state.srcPtr >= len(l.src) {
- return 0, true
- }
-
- b := l.src[l.state.srcPtr]
- l.state.srcPtr++
-
- // Count the token positions.
- // The driver treats LF as the end of lines and counts columns in code points, not bytes.
- // To count in code points, we refer to the First Byte column in the Table 3-6.
- //
- // Reference:
- // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. UTF-8 Bit Distribution
- if b < 128 {
- // 0x0A is LF.
- if b == 0x0A {
- l.state.row++
- l.state.col = 0
- } else {
- l.state.col++
- }
- } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 {
- l.state.col++
- }
-
- return b, false
-}
-
-// accept saves the current state.
-func (l *Lexer) accept() {
- l.lastAcceptedState = l.state
-}
-
-// revert reverts the lexer state to the last accepted state.
-//
-// We must not call this function consecutively.
-func (l *Lexer) revert() {
- l.state = l.lastAcceptedState
-}
diff --git a/src/urubu/driver/lexer/spec.go b/src/urubu/driver/lexer/spec.go
deleted file mode 100644
index 75c74af..0000000
--- a/src/urubu/driver/lexer/spec.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package lexer
-
-import spec "urubu/spec/grammar"
-
-type lexSpec struct {
- spec *spec.LexicalSpec
-}
-
-func NewLexSpec(spec *spec.LexicalSpec) *lexSpec {
- return &lexSpec{
- spec: spec,
- }
-}
-
-func (s *lexSpec) InitialMode() ModeID {
- return ModeID(s.spec.InitialModeID.Int())
-}
-
-func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool {
- return s.spec.Specs[mode].Pop[modeKind] == 1
-}
-
-func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) {
- modeID := s.spec.Specs[mode].Push[modeKind]
- return ModeID(modeID.Int()), !modeID.IsNil()
-}
-
-func (s *lexSpec) ModeName(mode ModeID) string {
- return s.spec.ModeNames[mode].String()
-}
-
-func (s *lexSpec) InitialState(mode ModeID) StateID {
- return StateID(s.spec.Specs[mode].DFA.InitialStateID.Int())
-}
-
-func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) {
- switch s.spec.CompressionLevel {
- case 2:
- tran := s.spec.Specs[mode].DFA.Transition
- rowNum := tran.RowNums[state]
- d := tran.UniqueEntries.RowDisplacement[rowNum]
- if tran.UniqueEntries.Bounds[d+v] != rowNum {
- return StateID(tran.UniqueEntries.EmptyValue.Int()), false
- }
- return StateID(tran.UniqueEntries.Entries[d+v].Int()), true
- case 1:
- tran := s.spec.Specs[mode].DFA.Transition
- next := tran.UncompressedUniqueEntries[tran.RowNums[state]*tran.OriginalColCount+v]
- if next == spec.StateIDNil {
- return StateID(spec.StateIDNil.Int()), false
- }
- return StateID(next.Int()), true
- }
-
- modeSpec := s.spec.Specs[mode]
- next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v]
- if next == spec.StateIDNil {
- return StateID(spec.StateIDNil), false
- }
- return StateID(next.Int()), true
-}
-
-func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) {
- modeKindID := s.spec.Specs[mode].DFA.AcceptingStates[state]
- return ModeKindID(modeKindID.Int()), modeKindID != spec.LexModeKindIDNil
-}
-
-func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) {
- kindID := s.spec.KindIDs[mode][modeKind]
- return KindID(kindID.Int()), s.spec.KindNames[kindID].String()
-}
diff --git a/src/urubu/driver/parser.go b/src/urubu/driver/parser.go
new file mode 100644
index 0000000..89cb240
--- /dev/null
+++ b/src/urubu/driver/parser.go
@@ -0,0 +1,1439 @@
+package parser
+
+import (
+ "bytes"
+ _ "embed"
+ "encoding/json"
+ "fmt"
+ "go/ast"
+ "go/format"
+ "go/parser"
+ "go/token"
+ goToken "go/token"
+ "io"
+ "strconv"
+ "strings"
+ "text/template"
+
+ "urubu/driver/lexer"
+ spec "urubu/spec/grammar"
+)
+
+type Grammar interface {
+ // InitialState returns the initial state of a parser.
+ InitialState() int
+
+ // StartProduction returns the start production of grammar.
+ StartProduction() int
+
+ // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair.
+ Action(state int, terminal int) int
+
+ // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair.
+ GoTo(state int, lhs int) int
+
+ // ErrorTrapperState returns true when a state can shift the error symbol.
+ ErrorTrapperState(state int) bool
+
+ // LHS returns a LHS symbol of a production.
+ LHS(prod int) int
+
+ // AlternativeSymbolCount returns a symbol count of p production.
+ AlternativeSymbolCount(prod int) int
+
+ // RecoverProduction returns true when a production has the recover directive.
+ RecoverProduction(prod int) bool
+
+ // NonTerminal retuns a string representaion of a non-terminal symbol.
+ NonTerminal(nonTerminal int) string
+
+ // TerminalCount returns a terminal symbol count of grammar.
+ TerminalCount() int
+
+ // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis.
+ SkipTerminal(terminal int) bool
+
+ // EOF returns the EOF symbol.
+ EOF() int
+
+ // Error returns the error symbol.
+ Error() int
+
+ // Terminal retuns a string representaion of a terminal symbol.
+ Terminal(terminal int) string
+
+ // ASTAction returns an AST action entries.
+ ASTAction(prod int) []int
+}
+
+type VToken interface {
+ // TerminalID returns a terminal ID.
+ TerminalID() int
+
+ // Lexeme returns a lexeme.
+ Lexeme() []byte
+
+ // EOF returns true when a token represents EOF.
+ EOF() bool
+
+ // Invalid returns true when a token is invalid.
+ Invalid() bool
+
+ // BytePosition returns (position, length) pair.
+ // `position` is a byte position where a token appears and `length` is a length in bytes.
+ BytePosition() (int, int)
+
+ // Position returns (row, column) pair.
+ Position() (int, int)
+}
+
+type TokenStream interface {
+ Next() (VToken, error)
+}
+
+type SyntaxError struct {
+ Row int
+ Col int
+ Message string
+ Token VToken
+ ExpectedTerminals []string
+}
+
+type ParserOption func(p *Parser) error
+
+// DisableLAC disables LAC (lookahead correction). LAC is enabled by default.
+func DisableLAC() ParserOption {
+ return func(p *Parser) error {
+ p.disableLAC = true
+ return nil
+ }
+}
+
+func SemanticAction(semAct SemanticActionSet) ParserOption {
+ return func(p *Parser) error {
+ p.semAct = semAct
+ return nil
+ }
+}
+
+type Parser struct {
+ toks TokenStream
+ gram Grammar
+ stateStack *stateStack
+ semAct SemanticActionSet
+ disableLAC bool
+ onError bool
+ shiftCount int
+ synErrs []*SyntaxError
+}
+
+func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) {
+ p := &Parser{
+ toks: toks,
+ gram: gram,
+ stateStack: &stateStack{},
+ }
+
+ for _, opt := range opts {
+ err := opt(p)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return p, nil
+}
+
+func (p *Parser) Parse() error {
+ p.stateStack.push(p.gram.InitialState())
+ tok, err := p.nextToken()
+ if err != nil {
+ return err
+ }
+
+ACTION_LOOP:
+ for {
+ act := p.lookupAction(tok)
+
+ switch {
+ case act < 0: // Shift
+ nextState := act * -1
+
+ recovered := false
+ if p.onError {
+ p.shiftCount++
+
+ // When the parser performs shift three times, the parser recovers from the error state.
+ if p.shiftCount >= 3 {
+ p.onError = false
+ p.shiftCount = 0
+ recovered = true
+ }
+ }
+
+ p.shift(nextState)
+
+ if p.semAct != nil {
+ p.semAct.Shift(tok, recovered)
+ }
+
+ tok, err = p.nextToken()
+ if err != nil {
+ return err
+ }
+ case act > 0: // Reduce
+ prodNum := act
+
+ recovered := false
+ if p.onError && p.gram.RecoverProduction(prodNum) {
+ p.onError = false
+ p.shiftCount = 0
+ recovered = true
+ }
+
+ accepted := p.reduce(prodNum)
+ if accepted {
+ if p.semAct != nil {
+ p.semAct.Accept()
+ }
+
+ return nil
+ }
+
+ if p.semAct != nil {
+ p.semAct.Reduce(prodNum, recovered)
+ }
+ default: // Error
+ if p.onError {
+ tok, err = p.nextToken()
+ if err != nil {
+ return err
+ }
+ if tok.EOF() {
+ if p.semAct != nil {
+ p.semAct.MissError(tok)
+ }
+
+ return nil
+ }
+
+ continue ACTION_LOOP
+ }
+
+ row, col := tok.Position()
+ p.synErrs = append(p.synErrs, &SyntaxError{
+ Row: row,
+ Col: col,
+ Message: "unexpected token",
+ Token: tok,
+ ExpectedTerminals: p.searchLookahead(p.stateStack.top()),
+ })
+
+ count, ok := p.trapError()
+ if !ok {
+ if p.semAct != nil {
+ p.semAct.MissError(tok)
+ }
+
+ return nil
+ }
+
+ p.onError = true
+ p.shiftCount = 0
+
+ act, err := p.lookupActionOnError()
+ if err != nil {
+ return err
+ }
+
+ p.shift(act * -1)
+
+ if p.semAct != nil {
+ p.semAct.TrapAndShiftError(tok, count)
+ }
+ }
+ }
+}
+
+// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid,
+// this method returns `true`.
+func (p *Parser) validateLookahead(term int) bool {
+ p.stateStack.enableExploratoryMode()
+ defer p.stateStack.disableExploratoryMode()
+
+ for {
+ act := p.gram.Action(p.stateStack.topExploratorily(), term)
+
+ switch {
+ case act < 0: // Shift
+ return true
+ case act > 0: // Reduce
+ prodNum := act
+
+ lhs := p.gram.LHS(prodNum)
+ if lhs == p.gram.LHS(p.gram.StartProduction()) {
+ return true
+ }
+ n := p.gram.AlternativeSymbolCount(prodNum)
+ p.stateStack.popExploratorily(n)
+ state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs)
+ p.stateStack.pushExploratorily(state)
+ default: // Error
+ return false
+ }
+ }
+}
+
+func (p *Parser) nextToken() (VToken, error) {
+ for {
+ // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0,
+ // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect
+ // a syntax error because the parser cannot find an entry corresponding to the invalid token.
+ tok, err := p.toks.Next()
+ if err != nil {
+ return nil, err
+ }
+
+ if p.gram.SkipTerminal(tok.TerminalID()) {
+ continue
+ }
+
+ return tok, nil
+ }
+}
+
+func (p *Parser) tokenToTerminal(tok VToken) int {
+ if tok.EOF() {
+ return p.gram.EOF()
+ }
+
+ return tok.TerminalID()
+}
+
+func (p *Parser) lookupAction(tok VToken) int {
+ if !p.disableLAC {
+ term := p.tokenToTerminal(tok)
+ if !p.validateLookahead(term) {
+ return 0
+ }
+ }
+
+ return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok))
+}
+
+func (p *Parser) lookupActionOnError() (int, error) {
+ act := p.gram.Action(p.stateStack.top(), p.gram.Error())
+ if act >= 0 {
+ return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error()))
+ }
+
+ return act, nil
+}
+
+func (p *Parser) shift(nextState int) {
+ p.stateStack.push(nextState)
+}
+
+func (p *Parser) reduce(prodNum int) bool {
+ lhs := p.gram.LHS(prodNum)
+ if lhs == p.gram.LHS(p.gram.StartProduction()) {
+ return true
+ }
+ n := p.gram.AlternativeSymbolCount(prodNum)
+ p.stateStack.pop(n)
+ nextState := p.gram.GoTo(p.stateStack.top(), lhs)
+ p.stateStack.push(nextState)
+ return false
+}
+
+func (p *Parser) trapError() (int, bool) {
+ count := 0
+ for {
+ if p.gram.ErrorTrapperState(p.stateStack.top()) {
+ return count, true
+ }
+
+ if p.stateStack.top() != p.gram.InitialState() {
+ p.stateStack.pop(1)
+ count++
+ } else {
+ return 0, false
+ }
+ }
+}
+
+func (p *Parser) SyntaxErrors() []*SyntaxError {
+ return p.synErrs
+}
+
+func (p *Parser) searchLookahead(state int) []string {
+ kinds := []string{}
+ termCount := p.gram.TerminalCount()
+ for term := 0; term < termCount; term++ {
+ if p.disableLAC {
+ if p.gram.Action(p.stateStack.top(), term) == 0 {
+ continue
+ }
+ } else {
+ if !p.validateLookahead(term) {
+ continue
+ }
+ }
+
+ // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol
+ // intentionally.
+ if term == p.gram.Error() {
+ continue
+ }
+
+ kinds = append(kinds, p.gram.Terminal(term))
+ }
+
+ return kinds
+}
+
+type stateStack struct {
+ items []int
+ itemsExp []int
+}
+
+func (s *stateStack) enableExploratoryMode() {
+ s.itemsExp = make([]int, len(s.items))
+ copy(s.itemsExp, s.items)
+}
+
+func (s *stateStack) disableExploratoryMode() {
+ s.itemsExp = nil
+}
+
+func (s *stateStack) top() int {
+ return s.items[len(s.items)-1]
+}
+
+func (s *stateStack) topExploratorily() int {
+ return s.itemsExp[len(s.itemsExp)-1]
+}
+
+func (s *stateStack) push(state int) {
+ s.items = append(s.items, state)
+}
+
+func (s *stateStack) pushExploratorily(state int) {
+ s.itemsExp = append(s.itemsExp, state)
+}
+
+func (s *stateStack) pop(n int) {
+ s.items = s.items[:len(s.items)-n]
+}
+
+func (s *stateStack) popExploratorily(n int) {
+ s.itemsExp = s.itemsExp[:len(s.itemsExp)-n]
+}
+
+// SemanticActionSet is a set of semantic actions a parser calls.
+type SemanticActionSet interface {
+ // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol.
+ // When the parser recovered from an error state by shifting the token, `recovered` is true.
+ Shift(tok VToken, recovered bool)
+
+ // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production.
+ // When the parser recovered from an error state by reducing the production, `recovered` is true.
+ Reduce(prodNum int, recovered bool)
+
+ // Accept runs when the parser accepts an input.
+ Accept()
+
+ // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack.
+ // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards
+ // from the state stack.
+ // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token
+ // corresponding to the error symbol doesn't exist.
+ TrapAndShiftError(cause VToken, popped int)
+
+ // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error.
+ MissError(cause VToken)
+}
+
+var _ SemanticActionSet = &SyntaxTreeActionSet{}
+
+// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface.
+type SyntaxTreeNode interface {
+ // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast`
+ // directive with `...` operator.
+ ChildCount() int
+
+ // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast`
+ // directive with `...` operator.
+ ExpandChildren() []SyntaxTreeNode
+}
+
+var _ SyntaxTreeNode = &Node{}
+
+// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types.
+// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface.
+type SyntaxTreeBuilder interface {
+ Shift(kindName string, tok VToken) SyntaxTreeNode
+ ShiftError(kindName string) SyntaxTreeNode
+ Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode
+ Accept(f SyntaxTreeNode)
+}
+
+var _ SyntaxTreeBuilder = &DefaultSyntaxTreeBuilder{}
+
+// DefaultSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder.
+type DefaultSyntaxTreeBuilder struct {
+ tree *Node
+}
+
+// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder.
+func NewDefaultSyntaxTreeBuilder() *DefaultSyntaxTreeBuilder {
+ return &DefaultSyntaxTreeBuilder{}
+}
+
+// Shift is a implementation of SyntaxTreeBuilder.Shift.
+func (b *DefaultSyntaxTreeBuilder) Shift(kindName string, tok VToken) SyntaxTreeNode {
+ bytePos, byteLen := tok.BytePosition()
+ row, col := tok.Position()
+ return &Node{
+ Type: NodeTypeTerminal,
+ KindName: kindName,
+ Text: string(tok.Lexeme()),
+ BytePos: bytePos,
+ ByteLen: byteLen,
+ Row: row,
+ Col: col,
+ }
+}
+
+// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError.
+func (b *DefaultSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode {
+ return &Node{
+ Type: NodeTypeError,
+ KindName: kindName,
+ }
+}
+
+// Reduce is a implementation of SyntaxTreeBuilder.Reduce.
+func (b *DefaultSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode {
+ cNodes := make([]*Node, len(children))
+ for i, c := range children {
+ cNodes[i] = c.(*Node)
+ }
+ return &Node{
+ Type: NodeTypeNonTerminal,
+ KindName: kindName,
+ Children: cNodes,
+ }
+}
+
+// Accept is a implementation of SyntaxTreeBuilder.Accept.
+func (b *DefaultSyntaxTreeBuilder) Accept(f SyntaxTreeNode) {
+ b.tree = f.(*Node)
+}
+
+// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil.
+func (b *DefaultSyntaxTreeBuilder) Tree() *Node {
+ return b.tree
+}
+
+// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree.
+type SyntaxTreeActionSet struct {
+ gram Grammar
+ builder SyntaxTreeBuilder
+ semStack *semanticStack
+ disableASTAction bool
+}
+
+// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree).
+// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them.
+func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
+ return &SyntaxTreeActionSet{
+ gram: gram,
+ builder: builder,
+ semStack: newSemanticStack(),
+ }
+}
+
+// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree).
+// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them.
+func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
+ return &SyntaxTreeActionSet{
+ gram: gram,
+ builder: builder,
+ semStack: newSemanticStack(),
+ disableASTAction: true,
+ }
+}
+
+// Shift is a implementation of SemanticActionSet.Shift method.
+func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) {
+ term := a.tokenToTerminal(tok)
+ a.semStack.push(a.builder.Shift(a.gram.Terminal(term), tok))
+}
+
+// Reduce is a implementation of SemanticActionSet.Reduce method.
+func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
+ lhs := a.gram.LHS(prodNum)
+
+ // When an alternative is empty, `n` will be 0, and `handle` will be empty slice.
+ n := a.gram.AlternativeSymbolCount(prodNum)
+ handle := a.semStack.pop(n)
+
+ var astAct []int
+ if !a.disableASTAction {
+ astAct = a.gram.ASTAction(prodNum)
+ }
+ var children []SyntaxTreeNode
+ if astAct != nil {
+ // Count the number of children in advance to avoid frequent growth in a slice for children.
+ {
+ l := 0
+ for _, e := range astAct {
+ if e > 0 {
+ l++
+ } else {
+ offset := e*-1 - 1
+ l += handle[offset].ChildCount()
+ }
+ }
+
+ children = make([]SyntaxTreeNode, l)
+ }
+
+ p := 0
+ for _, e := range astAct {
+ if e > 0 {
+ offset := e - 1
+ children[p] = handle[offset]
+ p++
+ } else {
+ offset := e*-1 - 1
+ for _, c := range handle[offset].ExpandChildren() {
+ children[p] = c
+ p++
+ }
+ }
+ }
+ } else {
+ // If an alternative has no AST action, a driver generates
+ // a node with the same structure as a CST.
+ children = handle
+ }
+
+ a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children))
+}
+
+// Accept is a implementation of SemanticActionSet.Accept method.
+func (a *SyntaxTreeActionSet) Accept() {
+ top := a.semStack.pop(1)
+ a.builder.Accept(top[0])
+}
+
+// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method.
+func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) {
+ a.semStack.pop(popped)
+ a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error())))
+}
+
+// MissError is a implementation of SemanticActionSet.MissError method.
+func (a *SyntaxTreeActionSet) MissError(cause VToken) {
+}
+
+func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int {
+ if tok.EOF() {
+ return a.gram.EOF()
+ }
+
+ return tok.TerminalID()
+}
+
+type semanticStack struct {
+ frames []SyntaxTreeNode
+}
+
+func newSemanticStack() *semanticStack {
+ return &semanticStack{
+ frames: make([]SyntaxTreeNode, 0, 100),
+ }
+}
+
+func (s *semanticStack) push(f SyntaxTreeNode) {
+ s.frames = append(s.frames, f)
+}
+
+func (s *semanticStack) pop(n int) []SyntaxTreeNode {
+ fs := s.frames[len(s.frames)-n:]
+ s.frames = s.frames[:len(s.frames)-n]
+
+ return fs
+}
+
+type NodeType int
+
+const (
+ NodeTypeError = 0
+ NodeTypeTerminal = 1
+ NodeTypeNonTerminal = 2
+)
+
+// Node is a implementation of SyntaxTreeNode interface.
+type Node struct {
+ Type NodeType
+ KindName string
+ Text string
+ BytePos int
+ ByteLen int
+ Row int
+ Col int
+ Children []*Node
+}
+
+func (n *Node) MarshalJSON() ([]byte, error) {
+ switch n.Type {
+ case NodeTypeError:
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ })
+ case NodeTypeTerminal:
+ if n.KindName == "" {
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ Text string `json:"text"`
+ Row int `json:"row"`
+ Col int `json:"col"`
+ }{
+ Type: n.Type,
+ Text: n.Text,
+ Row: n.Row,
+ Col: n.Col,
+ })
+ }
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ Text string `json:"text"`
+ Row int `json:"row"`
+ Col int `json:"col"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ Text: n.Text,
+ Row: n.Row,
+ Col: n.Col,
+ })
+ case NodeTypeNonTerminal:
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ Children []*Node `json:"children"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ Children: n.Children,
+ })
+ default:
+ return nil, fmt.Errorf("invalid node type: %v", n.Type)
+ }
+}
+
+// ChildCount is a implementation of SyntaxTreeNode.ChildCount.
+func (n *Node) ChildCount() int {
+ return len(n.Children)
+}
+
+// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren.
+func (n *Node) ExpandChildren() []SyntaxTreeNode {
+ fs := make([]SyntaxTreeNode, len(n.Children))
+ for i, n := range n.Children {
+ fs[i] = n
+ }
+ return fs
+}
+
+// PrintTree prints a syntax tree whose root is `node`.
+func PrintTree(w io.Writer, node *Node) {
+ printTree(w, node, "", "")
+}
+
+func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) {
+ if node == nil {
+ return
+ }
+
+ switch node.Type {
+ case NodeTypeError:
+ fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
+ case NodeTypeTerminal:
+ fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text))
+ case NodeTypeNonTerminal:
+ fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
+
+ num := len(node.Children)
+ for i, child := range node.Children {
+ var line string
+ if num > 1 && i < num-1 {
+ line = "├─ "
+ } else {
+ line = "└─ "
+ }
+
+ var prefix string
+ if i >= num-1 {
+ prefix = " "
+ } else {
+ prefix = "│ "
+ }
+
+ printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+ }
+}
+
+type grammarImpl struct {
+ g *spec.CompiledGrammar
+}
+
+func NewGrammar(g *spec.CompiledGrammar) *grammarImpl {
+ return &grammarImpl{
+ g: g,
+ }
+}
+
+func (g *grammarImpl) InitialState() int {
+ return g.g.Syntactic.InitialState
+}
+
+func (g *grammarImpl) StartProduction() int {
+ return g.g.Syntactic.StartProduction
+}
+
+func (g *grammarImpl) RecoverProduction(prod int) bool {
+ return g.g.Syntactic.RecoverProductions[prod] != 0
+}
+
+func (g *grammarImpl) Action(state int, terminal int) int {
+ return g.g.Syntactic.Action[state*g.g.Syntactic.TerminalCount+terminal]
+}
+
+func (g *grammarImpl) GoTo(state int, lhs int) int {
+ return g.g.Syntactic.GoTo[state*g.g.Syntactic.NonTerminalCount+lhs]
+}
+
+func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
+ return g.g.Syntactic.AlternativeSymbolCounts[prod]
+}
+
+func (g *grammarImpl) TerminalCount() int {
+ return g.g.Syntactic.TerminalCount
+}
+
+func (g *grammarImpl) SkipTerminal(terminal int) bool {
+ return g.g.Syntactic.TerminalSkip[terminal] == 1
+}
+
+func (g *grammarImpl) ErrorTrapperState(state int) bool {
+ return g.g.Syntactic.ErrorTrapperStates[state] != 0
+}
+
+func (g *grammarImpl) NonTerminal(nonTerminal int) string {
+ return g.g.Syntactic.NonTerminals[nonTerminal]
+}
+
+func (g *grammarImpl) LHS(prod int) int {
+ return g.g.Syntactic.LHSSymbols[prod]
+}
+
+func (g *grammarImpl) EOF() int {
+ return g.g.Syntactic.EOFSymbol
+}
+
+func (g *grammarImpl) Error() int {
+ return g.g.Syntactic.ErrorSymbol
+}
+
+func (g *grammarImpl) Terminal(terminal int) string {
+ return g.g.Syntactic.Terminals[terminal]
+}
+
+func (g *grammarImpl) ASTAction(prod int) []int {
+ return g.g.ASTAction.Entries[prod]
+}
+
+// go:embed parser.go
+var parserCoreSrc string
+
+// go:embed semantic_action.go
+var semActSrc string
+
+func GenParser(cgram *spec.CompiledGrammar, pkgName string) ([]byte, error) {
+ var parserSrc string
+ {
+ fset := goToken.NewFileSet()
+ f, err := parser.ParseFile(fset, "parser.go", parserCoreSrc, parser.ParseComments)
+ if err != nil {
+ return nil, err
+ }
+
+ var b strings.Builder
+ err = format.Node(&b, fset, f)
+ if err != nil {
+ return nil, err
+ }
+
+ parserSrc = b.String()
+ }
+
+ var grammarSrc string
+ {
+ t, err := template.New("").Funcs(genGrammarTemplateFuncs(cgram)).Parse(grammarSrcTmplate)
+ if err != nil {
+ return nil, err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, map[string]interface{}{
+ "initialState": cgram.Syntactic.InitialState,
+ "startProduction": cgram.Syntactic.StartProduction,
+ "terminalCount": cgram.Syntactic.TerminalCount,
+ "nonTerminalCount": cgram.Syntactic.NonTerminalCount,
+ "eofSymbol": cgram.Syntactic.EOFSymbol,
+ "errorSymbol": cgram.Syntactic.ErrorSymbol,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ grammarSrc = b.String()
+ }
+
+ var lexerSrc string
+ {
+ t, err := template.New("").Funcs(genLexerTemplateFuncs(cgram)).Parse(lexerSrcTmplate)
+ if err != nil {
+ return nil, err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, nil)
+ if err != nil {
+ return nil, err
+ }
+
+ lexerSrc = b.String()
+ }
+
+ var src string
+ {
+ tmpl := `// Code generated by vartan-go. DO NOT EDIT.
+{{ .parserSrc }}
+
+{{ .grammarSrc }}
+
+{{ .lexerSrc }}
+`
+ t, err := template.New("").Parse(tmpl)
+ if err != nil {
+ return nil, err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, map[string]string{
+ "parserSrc": parserSrc,
+ "grammarSrc": grammarSrc,
+ "lexerSrc": lexerSrc,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ src = b.String()
+ }
+
+ fset := goToken.NewFileSet()
+ f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+ if err != nil {
+ return nil, err
+ }
+
+ f.Name = ast.NewIdent(pkgName)
+
+ // Complete an import statement.
+ for _, d := range f.Decls {
+ gd, ok := d.(*ast.GenDecl)
+ if !ok || gd.Tok != token.IMPORT {
+ continue
+ }
+ gd.Specs = append(gd.Specs, &ast.ImportSpec{
+ Path: &ast.BasicLit{
+ Value: `"io"`,
+ },
+ })
+ break
+ }
+
+ var b bytes.Buffer
+ err = format.Node(&b, fset, f)
+ if err != nil {
+ return nil, err
+ }
+
+ return b.Bytes(), nil
+}
+
+const grammarSrcTmplate = `
+type grammarImpl struct {
+ recoverProductions []int
+ action []int
+ goTo []int
+ alternativeSymbolCounts []int
+ errorTrapperStates []int
+ nonTerminals []string
+ lhsSymbols []int
+ terminals []string
+ terminalSkip []int
+ astActions [][]int
+}
+
+func NewGrammar() *grammarImpl {
+ return &grammarImpl{
+ recoverProductions: {{ genRecoverProductions }},
+ action: {{ genAction }},
+ goTo: {{ genGoTo }},
+ alternativeSymbolCounts: {{ genAlternativeSymbolCounts }},
+ errorTrapperStates: {{ genErrorTrapperStates }},
+ nonTerminals: {{ genNonTerminals }},
+ lhsSymbols: {{ genLHSSymbols }},
+ terminals: {{ genTerminals }},
+ terminalSkip: {{ genTerminalSkip }},
+ astActions: {{ genASTActions }},
+ }
+}
+
+func (g *grammarImpl) InitialState() int {
+ return {{ .initialState }}
+}
+
+func (g *grammarImpl) StartProduction() int {
+ return {{ .startProduction }}
+}
+
+func (g *grammarImpl) RecoverProduction(prod int) bool {
+ return g.recoverProductions[prod] != 0
+}
+
+func (g *grammarImpl) Action(state int, terminal int) int {
+ return g.action[state*{{ .terminalCount }}+terminal]
+}
+
+func (g *grammarImpl) GoTo(state int, lhs int) int {
+ return g.goTo[state*{{ .nonTerminalCount }}+lhs]
+}
+
+func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
+ return g.alternativeSymbolCounts[prod]
+}
+
+func (g *grammarImpl) TerminalCount() int {
+ return {{ .terminalCount }}
+}
+
+func (g *grammarImpl) SkipTerminal(terminal int) bool {
+ return g.terminalSkip[terminal] == 1
+}
+
+func (g *grammarImpl) ErrorTrapperState(state int) bool {
+ return g.errorTrapperStates[state] != 0
+}
+
+func (g *grammarImpl) NonTerminal(nonTerminal int) string {
+ return g.nonTerminals[nonTerminal]
+}
+
+func (g *grammarImpl) LHS(prod int) int {
+ return g.lhsSymbols[prod]
+}
+
+func (g *grammarImpl) EOF() int {
+ return {{ .eofSymbol }}
+}
+
+func (g *grammarImpl) Error() int {
+ return {{ .errorSymbol }}
+}
+
+func (g *grammarImpl) Terminal(terminal int) string {
+ return g.terminals[terminal]
+}
+
+func (g *grammarImpl) ASTAction(prod int) []int {
+ return g.astActions[prod]
+}
+`
+
+func genGrammarTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
+ return template.FuncMap{
+ "genRecoverProductions": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.RecoverProductions {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genAction": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.Action {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genGoTo": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.GoTo {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genAlternativeSymbolCounts": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.AlternativeSymbolCounts {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genErrorTrapperStates": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.ErrorTrapperStates {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genNonTerminals": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]string{\n")
+ for _, v := range cgram.Syntactic.NonTerminals {
+ fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genLHSSymbols": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.LHSSymbols {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genTerminals": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]string{\n")
+ for _, v := range cgram.Syntactic.Terminals {
+ fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genTerminalSkip": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.TerminalSkip {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genASTActions": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]int{\n")
+ for _, entries := range cgram.ASTAction.Entries {
+ if len(entries) == 0 {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{\n")
+ c := 1
+ for _, v := range entries {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ }
+}
+
+const lexerSrcTmplate = `
+type vToken struct {
+ terminalID int
+ tok *Token
+}
+
+func (t *vToken) TerminalID() int {
+ return t.terminalID
+}
+
+func (t *vToken) Lexeme() []byte {
+ return t.tok.Lexeme
+}
+
+func (t *vToken) EOF() bool {
+ return t.tok.EOF
+}
+
+func (t *vToken) Invalid() bool {
+ return t.tok.Invalid
+}
+
+func (t *vToken) BytePosition() (int, int) {
+ return t.tok.BytePos, t.tok.ByteLen
+}
+
+func (t *vToken) Position() (int, int) {
+ return t.tok.Row, t.tok.Col
+}
+
+var kindToTerminal = {{ genKindToTerminal }}
+
+type tokenStream struct {
+ lex *Lexer
+ kindToTerminal []int
+}
+
+func NewTokenStream(src io.Reader) (*tokenStream, error) {
+ lex, err := NewLexer(NewLexSpec(), src)
+ if err != nil {
+ return nil, err
+ }
+
+ return &tokenStream{
+ lex: lex,
+ }, nil
+}
+
+func (t *tokenStream) Next() (VToken, error) {
+ tok, err := t.lex.Next()
+ if err != nil {
+ return nil, err
+ }
+ return &vToken{
+ terminalID: kindToTerminal[tok.KindID],
+ tok: tok,
+ }, nil
+}
+`
+
+func genLexerTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
+ return template.FuncMap{
+ "genKindToTerminal": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ c := 1
+ for _, v := range cgram.Syntactic.KindToTerminal {
+ fmt.Fprintf(&b, "%v, ", v)
+ if c == 20 {
+ fmt.Fprintf(&b, "\n")
+ c = 1
+ } else {
+ c++
+ }
+ }
+ if c > 1 {
+ fmt.Fprintf(&b, "\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ }
+}
+
+func GenSemanticAction(pkgName string) ([]byte, error) {
+ var src string
+ {
+ tmpl := `// Code generated by vartan-go. DO NOT EDIT.
+{{ .semActSrc }}
+`
+ t, err := template.New("").Parse(tmpl)
+ if err != nil {
+ return nil, err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, map[string]string{
+ "semActSrc": semActSrc,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ src = b.String()
+ }
+
+ fset := goToken.NewFileSet()
+ f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+ if err != nil {
+ return nil, err
+ }
+
+ f.Name = ast.NewIdent(pkgName)
+
+ var b bytes.Buffer
+ err = format.Node(&b, fset, f)
+ if err != nil {
+ return nil, err
+ }
+
+ return b.Bytes(), nil
+}
+
+type vToken struct {
+ terminalID int
+ tok *lexer.Token
+}
+
+func (t *vToken) TerminalID() int {
+ return t.terminalID
+}
+
+func (t *vToken) Lexeme() []byte {
+ return t.tok.Lexeme
+}
+
+func (t *vToken) EOF() bool {
+ return t.tok.EOF
+}
+
+func (t *vToken) Invalid() bool {
+ return t.tok.Invalid
+}
+
+func (t *vToken) BytePosition() (int, int) {
+ return t.tok.BytePos, t.tok.ByteLen
+}
+
+func (t *vToken) Position() (int, int) {
+ return t.tok.Row, t.tok.Col
+}
+
+type tokenStream struct {
+ lex *lexer.Lexer
+ kindToTerminal []int
+}
+
+func NewTokenStream(g *spec.CompiledGrammar, src io.Reader) (TokenStream, error) {
+ lex, err := lexer.NewLexer(lexer.NewLexSpec(g.Lexical), src)
+ if err != nil {
+ return nil, err
+ }
+
+ return &tokenStream{
+ lex: lex,
+ kindToTerminal: g.Syntactic.KindToTerminal,
+ }, nil
+}
+
+func (l *tokenStream) Next() (VToken, error) {
+ tok, err := l.lex.Next()
+ if err != nil {
+ return nil, err
+ }
+ return &vToken{
+ terminalID: l.kindToTerminal[tok.KindID],
+ tok: tok,
+ }, nil
+}
diff --git a/src/urubu/driver/parser/parser.go b/src/urubu/driver/parser/parser.go
deleted file mode 100644
index 2eaa678..0000000
--- a/src/urubu/driver/parser/parser.go
+++ /dev/null
@@ -1,416 +0,0 @@
-package parser
-
-import (
- "fmt"
-)
-
-type Grammar interface {
- // InitialState returns the initial state of a parser.
- InitialState() int
-
- // StartProduction returns the start production of grammar.
- StartProduction() int
-
- // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair.
- Action(state int, terminal int) int
-
- // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair.
- GoTo(state int, lhs int) int
-
- // ErrorTrapperState returns true when a state can shift the error symbol.
- ErrorTrapperState(state int) bool
-
- // LHS returns a LHS symbol of a production.
- LHS(prod int) int
-
- // AlternativeSymbolCount returns a symbol count of p production.
- AlternativeSymbolCount(prod int) int
-
- // RecoverProduction returns true when a production has the recover directive.
- RecoverProduction(prod int) bool
-
- // NonTerminal retuns a string representaion of a non-terminal symbol.
- NonTerminal(nonTerminal int) string
-
- // TerminalCount returns a terminal symbol count of grammar.
- TerminalCount() int
-
- // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis.
- SkipTerminal(terminal int) bool
-
- // EOF returns the EOF symbol.
- EOF() int
-
- // Error returns the error symbol.
- Error() int
-
- // Terminal retuns a string representaion of a terminal symbol.
- Terminal(terminal int) string
-
- // ASTAction returns an AST action entries.
- ASTAction(prod int) []int
-}
-
-type VToken interface {
- // TerminalID returns a terminal ID.
- TerminalID() int
-
- // Lexeme returns a lexeme.
- Lexeme() []byte
-
- // EOF returns true when a token represents EOF.
- EOF() bool
-
- // Invalid returns true when a token is invalid.
- Invalid() bool
-
- // BytePosition returns (position, length) pair.
- // `position` is a byte position where a token appears and `length` is a length in bytes.
- BytePosition() (int, int)
-
- // Position returns (row, column) pair.
- Position() (int, int)
-}
-
-type TokenStream interface {
- Next() (VToken, error)
-}
-
-type SyntaxError struct {
- Row int
- Col int
- Message string
- Token VToken
- ExpectedTerminals []string
-}
-
-type ParserOption func(p *Parser) error
-
-// DisableLAC disables LAC (lookahead correction). LAC is enabled by default.
-func DisableLAC() ParserOption {
- return func(p *Parser) error {
- p.disableLAC = true
- return nil
- }
-}
-
-func SemanticAction(semAct SemanticActionSet) ParserOption {
- return func(p *Parser) error {
- p.semAct = semAct
- return nil
- }
-}
-
-type Parser struct {
- toks TokenStream
- gram Grammar
- stateStack *stateStack
- semAct SemanticActionSet
- disableLAC bool
- onError bool
- shiftCount int
- synErrs []*SyntaxError
-}
-
-func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) {
- p := &Parser{
- toks: toks,
- gram: gram,
- stateStack: &stateStack{},
- }
-
- for _, opt := range opts {
- err := opt(p)
- if err != nil {
- return nil, err
- }
- }
-
- return p, nil
-}
-
-func (p *Parser) Parse() error {
- p.stateStack.push(p.gram.InitialState())
- tok, err := p.nextToken()
- if err != nil {
- return err
- }
-
-ACTION_LOOP:
- for {
- act := p.lookupAction(tok)
-
- switch {
- case act < 0: // Shift
- nextState := act * -1
-
- recovered := false
- if p.onError {
- p.shiftCount++
-
- // When the parser performs shift three times, the parser recovers from the error state.
- if p.shiftCount >= 3 {
- p.onError = false
- p.shiftCount = 0
- recovered = true
- }
- }
-
- p.shift(nextState)
-
- if p.semAct != nil {
- p.semAct.Shift(tok, recovered)
- }
-
- tok, err = p.nextToken()
- if err != nil {
- return err
- }
- case act > 0: // Reduce
- prodNum := act
-
- recovered := false
- if p.onError && p.gram.RecoverProduction(prodNum) {
- p.onError = false
- p.shiftCount = 0
- recovered = true
- }
-
- accepted := p.reduce(prodNum)
- if accepted {
- if p.semAct != nil {
- p.semAct.Accept()
- }
-
- return nil
- }
-
- if p.semAct != nil {
- p.semAct.Reduce(prodNum, recovered)
- }
- default: // Error
- if p.onError {
- tok, err = p.nextToken()
- if err != nil {
- return err
- }
- if tok.EOF() {
- if p.semAct != nil {
- p.semAct.MissError(tok)
- }
-
- return nil
- }
-
- continue ACTION_LOOP
- }
-
- row, col := tok.Position()
- p.synErrs = append(p.synErrs, &SyntaxError{
- Row: row,
- Col: col,
- Message: "unexpected token",
- Token: tok,
- ExpectedTerminals: p.searchLookahead(p.stateStack.top()),
- })
-
- count, ok := p.trapError()
- if !ok {
- if p.semAct != nil {
- p.semAct.MissError(tok)
- }
-
- return nil
- }
-
- p.onError = true
- p.shiftCount = 0
-
- act, err := p.lookupActionOnError()
- if err != nil {
- return err
- }
-
- p.shift(act * -1)
-
- if p.semAct != nil {
- p.semAct.TrapAndShiftError(tok, count)
- }
- }
- }
-}
-
-// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid,
-// this method returns `true`.
-func (p *Parser) validateLookahead(term int) bool {
- p.stateStack.enableExploratoryMode()
- defer p.stateStack.disableExploratoryMode()
-
- for {
- act := p.gram.Action(p.stateStack.topExploratorily(), term)
-
- switch {
- case act < 0: // Shift
- return true
- case act > 0: // Reduce
- prodNum := act
-
- lhs := p.gram.LHS(prodNum)
- if lhs == p.gram.LHS(p.gram.StartProduction()) {
- return true
- }
- n := p.gram.AlternativeSymbolCount(prodNum)
- p.stateStack.popExploratorily(n)
- state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs)
- p.stateStack.pushExploratorily(state)
- default: // Error
- return false
- }
- }
-}
-
-func (p *Parser) nextToken() (VToken, error) {
- for {
- // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0,
- // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect
- // a syntax error because the parser cannot find an entry corresponding to the invalid token.
- tok, err := p.toks.Next()
- if err != nil {
- return nil, err
- }
-
- if p.gram.SkipTerminal(tok.TerminalID()) {
- continue
- }
-
- return tok, nil
- }
-}
-
-func (p *Parser) tokenToTerminal(tok VToken) int {
- if tok.EOF() {
- return p.gram.EOF()
- }
-
- return tok.TerminalID()
-}
-
-func (p *Parser) lookupAction(tok VToken) int {
- if !p.disableLAC {
- term := p.tokenToTerminal(tok)
- if !p.validateLookahead(term) {
- return 0
- }
- }
-
- return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok))
-}
-
-func (p *Parser) lookupActionOnError() (int, error) {
- act := p.gram.Action(p.stateStack.top(), p.gram.Error())
- if act >= 0 {
- return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error()))
- }
-
- return act, nil
-}
-
-func (p *Parser) shift(nextState int) {
- p.stateStack.push(nextState)
-}
-
-func (p *Parser) reduce(prodNum int) bool {
- lhs := p.gram.LHS(prodNum)
- if lhs == p.gram.LHS(p.gram.StartProduction()) {
- return true
- }
- n := p.gram.AlternativeSymbolCount(prodNum)
- p.stateStack.pop(n)
- nextState := p.gram.GoTo(p.stateStack.top(), lhs)
- p.stateStack.push(nextState)
- return false
-}
-
-func (p *Parser) trapError() (int, bool) {
- count := 0
- for {
- if p.gram.ErrorTrapperState(p.stateStack.top()) {
- return count, true
- }
-
- if p.stateStack.top() != p.gram.InitialState() {
- p.stateStack.pop(1)
- count++
- } else {
- return 0, false
- }
- }
-}
-
-func (p *Parser) SyntaxErrors() []*SyntaxError {
- return p.synErrs
-}
-
-func (p *Parser) searchLookahead(state int) []string {
- kinds := []string{}
- termCount := p.gram.TerminalCount()
- for term := 0; term < termCount; term++ {
- if p.disableLAC {
- if p.gram.Action(p.stateStack.top(), term) == 0 {
- continue
- }
- } else {
- if !p.validateLookahead(term) {
- continue
- }
- }
-
- // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol
- // intentionally.
- if term == p.gram.Error() {
- continue
- }
-
- kinds = append(kinds, p.gram.Terminal(term))
- }
-
- return kinds
-}
-
-type stateStack struct {
- items []int
- itemsExp []int
-}
-
-func (s *stateStack) enableExploratoryMode() {
- s.itemsExp = make([]int, len(s.items))
- copy(s.itemsExp, s.items)
-}
-
-func (s *stateStack) disableExploratoryMode() {
- s.itemsExp = nil
-}
-
-func (s *stateStack) top() int {
- return s.items[len(s.items)-1]
-}
-
-func (s *stateStack) topExploratorily() int {
- return s.itemsExp[len(s.itemsExp)-1]
-}
-
-func (s *stateStack) push(state int) {
- s.items = append(s.items, state)
-}
-
-func (s *stateStack) pushExploratorily(state int) {
- s.itemsExp = append(s.itemsExp, state)
-}
-
-func (s *stateStack) pop(n int) {
- s.items = s.items[:len(s.items)-n]
-}
-
-func (s *stateStack) popExploratorily(n int) {
- s.itemsExp = s.itemsExp[:len(s.itemsExp)-n]
-}
diff --git a/src/urubu/driver/parser/semantic_action.go b/src/urubu/driver/parser/semantic_action.go
deleted file mode 100644
index 6bb78cf..0000000
--- a/src/urubu/driver/parser/semantic_action.go
+++ /dev/null
@@ -1,371 +0,0 @@
-package parser
-
-import (
- "encoding/json"
- "fmt"
- "io"
- "strconv"
-)
-
-// SemanticActionSet is a set of semantic actions a parser calls.
-type SemanticActionSet interface {
- // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol.
- // When the parser recovered from an error state by shifting the token, `recovered` is true.
- Shift(tok VToken, recovered bool)
-
- // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production.
- // When the parser recovered from an error state by reducing the production, `recovered` is true.
- Reduce(prodNum int, recovered bool)
-
- // Accept runs when the parser accepts an input.
- Accept()
-
- // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack.
- // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards
- // from the state stack.
- // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token
- // corresponding to the error symbol doesn't exist.
- TrapAndShiftError(cause VToken, popped int)
-
- // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error.
- MissError(cause VToken)
-}
-
-var _ SemanticActionSet = &SyntaxTreeActionSet{}
-
-// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface.
-type SyntaxTreeNode interface {
- // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast`
- // directive with `...` operator.
- ChildCount() int
-
- // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast`
- // directive with `...` operator.
- ExpandChildren() []SyntaxTreeNode
-}
-
-var _ SyntaxTreeNode = &Node{}
-
-// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types.
-// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface.
-type SyntaxTreeBuilder interface {
- Shift(kindName string, tok VToken) SyntaxTreeNode
- ShiftError(kindName string) SyntaxTreeNode
- Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode
- Accept(f SyntaxTreeNode)
-}
-
-var _ SyntaxTreeBuilder = &DefaultSyntaxTreeBuilder{}
-
-// DefaultSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder.
-type DefaultSyntaxTreeBuilder struct {
- tree *Node
-}
-
-// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder.
-func NewDefaultSyntaxTreeBuilder() *DefaultSyntaxTreeBuilder {
- return &DefaultSyntaxTreeBuilder{}
-}
-
-// Shift is a implementation of SyntaxTreeBuilder.Shift.
-func (b *DefaultSyntaxTreeBuilder) Shift(kindName string, tok VToken) SyntaxTreeNode {
- bytePos, byteLen := tok.BytePosition()
- row, col := tok.Position()
- return &Node{
- Type: NodeTypeTerminal,
- KindName: kindName,
- Text: string(tok.Lexeme()),
- BytePos: bytePos,
- ByteLen: byteLen,
- Row: row,
- Col: col,
- }
-}
-
-// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError.
-func (b *DefaultSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode {
- return &Node{
- Type: NodeTypeError,
- KindName: kindName,
- }
-}
-
-// Reduce is a implementation of SyntaxTreeBuilder.Reduce.
-func (b *DefaultSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode {
- cNodes := make([]*Node, len(children))
- for i, c := range children {
- cNodes[i] = c.(*Node)
- }
- return &Node{
- Type: NodeTypeNonTerminal,
- KindName: kindName,
- Children: cNodes,
- }
-}
-
-// Accept is a implementation of SyntaxTreeBuilder.Accept.
-func (b *DefaultSyntaxTreeBuilder) Accept(f SyntaxTreeNode) {
- b.tree = f.(*Node)
-}
-
-// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil.
-func (b *DefaultSyntaxTreeBuilder) Tree() *Node {
- return b.tree
-}
-
-// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree.
-type SyntaxTreeActionSet struct {
- gram Grammar
- builder SyntaxTreeBuilder
- semStack *semanticStack
- disableASTAction bool
-}
-
-// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree).
-// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them.
-func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
- return &SyntaxTreeActionSet{
- gram: gram,
- builder: builder,
- semStack: newSemanticStack(),
- }
-}
-
-// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree).
-// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them.
-func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
- return &SyntaxTreeActionSet{
- gram: gram,
- builder: builder,
- semStack: newSemanticStack(),
- disableASTAction: true,
- }
-}
-
-// Shift is a implementation of SemanticActionSet.Shift method.
-func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) {
- term := a.tokenToTerminal(tok)
- a.semStack.push(a.builder.Shift(a.gram.Terminal(term), tok))
-}
-
-// Reduce is a implementation of SemanticActionSet.Reduce method.
-func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
- lhs := a.gram.LHS(prodNum)
-
- // When an alternative is empty, `n` will be 0, and `handle` will be empty slice.
- n := a.gram.AlternativeSymbolCount(prodNum)
- handle := a.semStack.pop(n)
-
- var astAct []int
- if !a.disableASTAction {
- astAct = a.gram.ASTAction(prodNum)
- }
- var children []SyntaxTreeNode
- if astAct != nil {
- // Count the number of children in advance to avoid frequent growth in a slice for children.
- {
- l := 0
- for _, e := range astAct {
- if e > 0 {
- l++
- } else {
- offset := e*-1 - 1
- l += handle[offset].ChildCount()
- }
- }
-
- children = make([]SyntaxTreeNode, l)
- }
-
- p := 0
- for _, e := range astAct {
- if e > 0 {
- offset := e - 1
- children[p] = handle[offset]
- p++
- } else {
- offset := e*-1 - 1
- for _, c := range handle[offset].ExpandChildren() {
- children[p] = c
- p++
- }
- }
- }
- } else {
- // If an alternative has no AST action, a driver generates
- // a node with the same structure as a CST.
- children = handle
- }
-
- a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children))
-}
-
-// Accept is a implementation of SemanticActionSet.Accept method.
-func (a *SyntaxTreeActionSet) Accept() {
- top := a.semStack.pop(1)
- a.builder.Accept(top[0])
-}
-
-// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method.
-func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) {
- a.semStack.pop(popped)
- a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error())))
-}
-
-// MissError is a implementation of SemanticActionSet.MissError method.
-func (a *SyntaxTreeActionSet) MissError(cause VToken) {
-}
-
-func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int {
- if tok.EOF() {
- return a.gram.EOF()
- }
-
- return tok.TerminalID()
-}
-
-type semanticStack struct {
- frames []SyntaxTreeNode
-}
-
-func newSemanticStack() *semanticStack {
- return &semanticStack{
- frames: make([]SyntaxTreeNode, 0, 100),
- }
-}
-
-func (s *semanticStack) push(f SyntaxTreeNode) {
- s.frames = append(s.frames, f)
-}
-
-func (s *semanticStack) pop(n int) []SyntaxTreeNode {
- fs := s.frames[len(s.frames)-n:]
- s.frames = s.frames[:len(s.frames)-n]
-
- return fs
-}
-
-type NodeType int
-
-const (
- NodeTypeError = 0
- NodeTypeTerminal = 1
- NodeTypeNonTerminal = 2
-)
-
-// Node is a implementation of SyntaxTreeNode interface.
-type Node struct {
- Type NodeType
- KindName string
- Text string
- BytePos int
- ByteLen int
- Row int
- Col int
- Children []*Node
-}
-
-func (n *Node) MarshalJSON() ([]byte, error) {
- switch n.Type {
- case NodeTypeError:
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- })
- case NodeTypeTerminal:
- if n.KindName == "" {
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- Text string `json:"text"`
- Row int `json:"row"`
- Col int `json:"col"`
- }{
- Type: n.Type,
- Text: n.Text,
- Row: n.Row,
- Col: n.Col,
- })
- }
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- Text string `json:"text"`
- Row int `json:"row"`
- Col int `json:"col"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- Text: n.Text,
- Row: n.Row,
- Col: n.Col,
- })
- case NodeTypeNonTerminal:
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- Children []*Node `json:"children"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- Children: n.Children,
- })
- default:
- return nil, fmt.Errorf("invalid node type: %v", n.Type)
- }
-}
-
-// ChildCount is a implementation of SyntaxTreeNode.ChildCount.
-func (n *Node) ChildCount() int {
- return len(n.Children)
-}
-
-// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren.
-func (n *Node) ExpandChildren() []SyntaxTreeNode {
- fs := make([]SyntaxTreeNode, len(n.Children))
- for i, n := range n.Children {
- fs[i] = n
- }
- return fs
-}
-
-// PrintTree prints a syntax tree whose root is `node`.
-func PrintTree(w io.Writer, node *Node) {
- printTree(w, node, "", "")
-}
-
-func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) {
- if node == nil {
- return
- }
-
- switch node.Type {
- case NodeTypeError:
- fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
- case NodeTypeTerminal:
- fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text))
- case NodeTypeNonTerminal:
- fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
-
- num := len(node.Children)
- for i, child := range node.Children {
- var line string
- if num > 1 && i < num-1 {
- line = "├─ "
- } else {
- line = "└─ "
- }
-
- var prefix string
- if i >= num-1 {
- prefix = " "
- } else {
- prefix = "│ "
- }
-
- printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
- }
- }
-}
diff --git a/src/urubu/driver/parser/spec.go b/src/urubu/driver/parser/spec.go
deleted file mode 100644
index 6dc7c3f..0000000
--- a/src/urubu/driver/parser/spec.go
+++ /dev/null
@@ -1,73 +0,0 @@
-package parser
-
-import spec "urubu/spec/grammar"
-
-type grammarImpl struct {
- g *spec.CompiledGrammar
-}
-
-func NewGrammar(g *spec.CompiledGrammar) *grammarImpl {
- return &grammarImpl{
- g: g,
- }
-}
-
-func (g *grammarImpl) InitialState() int {
- return g.g.Syntactic.InitialState
-}
-
-func (g *grammarImpl) StartProduction() int {
- return g.g.Syntactic.StartProduction
-}
-
-func (g *grammarImpl) RecoverProduction(prod int) bool {
- return g.g.Syntactic.RecoverProductions[prod] != 0
-}
-
-func (g *grammarImpl) Action(state int, terminal int) int {
- return g.g.Syntactic.Action[state*g.g.Syntactic.TerminalCount+terminal]
-}
-
-func (g *grammarImpl) GoTo(state int, lhs int) int {
- return g.g.Syntactic.GoTo[state*g.g.Syntactic.NonTerminalCount+lhs]
-}
-
-func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
- return g.g.Syntactic.AlternativeSymbolCounts[prod]
-}
-
-func (g *grammarImpl) TerminalCount() int {
- return g.g.Syntactic.TerminalCount
-}
-
-func (g *grammarImpl) SkipTerminal(terminal int) bool {
- return g.g.Syntactic.TerminalSkip[terminal] == 1
-}
-
-func (g *grammarImpl) ErrorTrapperState(state int) bool {
- return g.g.Syntactic.ErrorTrapperStates[state] != 0
-}
-
-func (g *grammarImpl) NonTerminal(nonTerminal int) string {
- return g.g.Syntactic.NonTerminals[nonTerminal]
-}
-
-func (g *grammarImpl) LHS(prod int) int {
- return g.g.Syntactic.LHSSymbols[prod]
-}
-
-func (g *grammarImpl) EOF() int {
- return g.g.Syntactic.EOFSymbol
-}
-
-func (g *grammarImpl) Error() int {
- return g.g.Syntactic.ErrorSymbol
-}
-
-func (g *grammarImpl) Terminal(terminal int) string {
- return g.g.Syntactic.Terminals[terminal]
-}
-
-func (g *grammarImpl) ASTAction(prod int) []int {
- return g.g.ASTAction.Entries[prod]
-}
diff --git a/src/urubu/driver/parser/template.go b/src/urubu/driver/parser/template.go
deleted file mode 100644
index 33d097c..0000000
--- a/src/urubu/driver/parser/template.go
+++ /dev/null
@@ -1,535 +0,0 @@
-package parser
-
-import (
- "bytes"
- _ "embed"
- "fmt"
- "go/ast"
- "go/format"
- "go/parser"
- "go/token"
- goToken "go/token"
- "strconv"
- "strings"
- "text/template"
-
- spec "urubu/spec/grammar"
-)
-
-// go:embed parser.go
-var parserCoreSrc string
-
-// go:embed semantic_action.go
-var semActSrc string
-
-func GenParser(cgram *spec.CompiledGrammar, pkgName string) ([]byte, error) {
- var parserSrc string
- {
- fset := goToken.NewFileSet()
- f, err := parser.ParseFile(fset, "parser.go", parserCoreSrc, parser.ParseComments)
- if err != nil {
- return nil, err
- }
-
- var b strings.Builder
- err = format.Node(&b, fset, f)
- if err != nil {
- return nil, err
- }
-
- parserSrc = b.String()
- }
-
- var grammarSrc string
- {
- t, err := template.New("").Funcs(genGrammarTemplateFuncs(cgram)).Parse(grammarSrcTmplate)
- if err != nil {
- return nil, err
- }
-
- var b strings.Builder
- err = t.Execute(&b, map[string]interface{}{
- "initialState": cgram.Syntactic.InitialState,
- "startProduction": cgram.Syntactic.StartProduction,
- "terminalCount": cgram.Syntactic.TerminalCount,
- "nonTerminalCount": cgram.Syntactic.NonTerminalCount,
- "eofSymbol": cgram.Syntactic.EOFSymbol,
- "errorSymbol": cgram.Syntactic.ErrorSymbol,
- })
- if err != nil {
- return nil, err
- }
-
- grammarSrc = b.String()
- }
-
- var lexerSrc string
- {
- t, err := template.New("").Funcs(genLexerTemplateFuncs(cgram)).Parse(lexerSrcTmplate)
- if err != nil {
- return nil, err
- }
-
- var b strings.Builder
- err = t.Execute(&b, nil)
- if err != nil {
- return nil, err
- }
-
- lexerSrc = b.String()
- }
-
- var src string
- {
- tmpl := `// Code generated by vartan-go. DO NOT EDIT.
-{{ .parserSrc }}
-
-{{ .grammarSrc }}
-
-{{ .lexerSrc }}
-`
- t, err := template.New("").Parse(tmpl)
- if err != nil {
- return nil, err
- }
-
- var b strings.Builder
- err = t.Execute(&b, map[string]string{
- "parserSrc": parserSrc,
- "grammarSrc": grammarSrc,
- "lexerSrc": lexerSrc,
- })
- if err != nil {
- return nil, err
- }
-
- src = b.String()
- }
-
- fset := goToken.NewFileSet()
- f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
- if err != nil {
- return nil, err
- }
-
- f.Name = ast.NewIdent(pkgName)
-
- // Complete an import statement.
- for _, d := range f.Decls {
- gd, ok := d.(*ast.GenDecl)
- if !ok || gd.Tok != token.IMPORT {
- continue
- }
- gd.Specs = append(gd.Specs, &ast.ImportSpec{
- Path: &ast.BasicLit{
- Value: `"io"`,
- },
- })
- break
- }
-
- var b bytes.Buffer
- err = format.Node(&b, fset, f)
- if err != nil {
- return nil, err
- }
-
- return b.Bytes(), nil
-}
-
-const grammarSrcTmplate = `
-type grammarImpl struct {
- recoverProductions []int
- action []int
- goTo []int
- alternativeSymbolCounts []int
- errorTrapperStates []int
- nonTerminals []string
- lhsSymbols []int
- terminals []string
- terminalSkip []int
- astActions [][]int
-}
-
-func NewGrammar() *grammarImpl {
- return &grammarImpl{
- recoverProductions: {{ genRecoverProductions }},
- action: {{ genAction }},
- goTo: {{ genGoTo }},
- alternativeSymbolCounts: {{ genAlternativeSymbolCounts }},
- errorTrapperStates: {{ genErrorTrapperStates }},
- nonTerminals: {{ genNonTerminals }},
- lhsSymbols: {{ genLHSSymbols }},
- terminals: {{ genTerminals }},
- terminalSkip: {{ genTerminalSkip }},
- astActions: {{ genASTActions }},
- }
-}
-
-func (g *grammarImpl) InitialState() int {
- return {{ .initialState }}
-}
-
-func (g *grammarImpl) StartProduction() int {
- return {{ .startProduction }}
-}
-
-func (g *grammarImpl) RecoverProduction(prod int) bool {
- return g.recoverProductions[prod] != 0
-}
-
-func (g *grammarImpl) Action(state int, terminal int) int {
- return g.action[state*{{ .terminalCount }}+terminal]
-}
-
-func (g *grammarImpl) GoTo(state int, lhs int) int {
- return g.goTo[state*{{ .nonTerminalCount }}+lhs]
-}
-
-func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
- return g.alternativeSymbolCounts[prod]
-}
-
-func (g *grammarImpl) TerminalCount() int {
- return {{ .terminalCount }}
-}
-
-func (g *grammarImpl) SkipTerminal(terminal int) bool {
- return g.terminalSkip[terminal] == 1
-}
-
-func (g *grammarImpl) ErrorTrapperState(state int) bool {
- return g.errorTrapperStates[state] != 0
-}
-
-func (g *grammarImpl) NonTerminal(nonTerminal int) string {
- return g.nonTerminals[nonTerminal]
-}
-
-func (g *grammarImpl) LHS(prod int) int {
- return g.lhsSymbols[prod]
-}
-
-func (g *grammarImpl) EOF() int {
- return {{ .eofSymbol }}
-}
-
-func (g *grammarImpl) Error() int {
- return {{ .errorSymbol }}
-}
-
-func (g *grammarImpl) Terminal(terminal int) string {
- return g.terminals[terminal]
-}
-
-func (g *grammarImpl) ASTAction(prod int) []int {
- return g.astActions[prod]
-}
-`
-
-func genGrammarTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
- return template.FuncMap{
- "genRecoverProductions": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.RecoverProductions {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genAction": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.Action {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genGoTo": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.GoTo {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genAlternativeSymbolCounts": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.AlternativeSymbolCounts {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genErrorTrapperStates": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.ErrorTrapperStates {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genNonTerminals": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]string{\n")
- for _, v := range cgram.Syntactic.NonTerminals {
- fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genLHSSymbols": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.LHSSymbols {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genTerminals": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]string{\n")
- for _, v := range cgram.Syntactic.Terminals {
- fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genTerminalSkip": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.TerminalSkip {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- "genASTActions": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[][]int{\n")
- for _, entries := range cgram.ASTAction.Entries {
- if len(entries) == 0 {
- fmt.Fprintf(&b, "nil,\n")
- continue
- }
-
- fmt.Fprintf(&b, "{\n")
- c := 1
- for _, v := range entries {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "},\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- }
-}
-
-const lexerSrcTmplate = `
-type vToken struct {
- terminalID int
- tok *Token
-}
-
-func (t *vToken) TerminalID() int {
- return t.terminalID
-}
-
-func (t *vToken) Lexeme() []byte {
- return t.tok.Lexeme
-}
-
-func (t *vToken) EOF() bool {
- return t.tok.EOF
-}
-
-func (t *vToken) Invalid() bool {
- return t.tok.Invalid
-}
-
-func (t *vToken) BytePosition() (int, int) {
- return t.tok.BytePos, t.tok.ByteLen
-}
-
-func (t *vToken) Position() (int, int) {
- return t.tok.Row, t.tok.Col
-}
-
-var kindToTerminal = {{ genKindToTerminal }}
-
-type tokenStream struct {
- lex *Lexer
- kindToTerminal []int
-}
-
-func NewTokenStream(src io.Reader) (*tokenStream, error) {
- lex, err := NewLexer(NewLexSpec(), src)
- if err != nil {
- return nil, err
- }
-
- return &tokenStream{
- lex: lex,
- }, nil
-}
-
-func (t *tokenStream) Next() (VToken, error) {
- tok, err := t.lex.Next()
- if err != nil {
- return nil, err
- }
- return &vToken{
- terminalID: kindToTerminal[tok.KindID],
- tok: tok,
- }, nil
-}
-`
-
-func genLexerTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
- return template.FuncMap{
- "genKindToTerminal": func() string {
- var b strings.Builder
- fmt.Fprintf(&b, "[]int{\n")
- c := 1
- for _, v := range cgram.Syntactic.KindToTerminal {
- fmt.Fprintf(&b, "%v, ", v)
- if c == 20 {
- fmt.Fprintf(&b, "\n")
- c = 1
- } else {
- c++
- }
- }
- if c > 1 {
- fmt.Fprintf(&b, "\n")
- }
- fmt.Fprintf(&b, "}")
- return b.String()
- },
- }
-}
-
-func GenSemanticAction(pkgName string) ([]byte, error) {
- var src string
- {
- tmpl := `// Code generated by vartan-go. DO NOT EDIT.
-{{ .semActSrc }}
-`
- t, err := template.New("").Parse(tmpl)
- if err != nil {
- return nil, err
- }
-
- var b strings.Builder
- err = t.Execute(&b, map[string]string{
- "semActSrc": semActSrc,
- })
- if err != nil {
- return nil, err
- }
-
- src = b.String()
- }
-
- fset := goToken.NewFileSet()
- f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
- if err != nil {
- return nil, err
- }
-
- f.Name = ast.NewIdent(pkgName)
-
- var b bytes.Buffer
- err = format.Node(&b, fset, f)
- if err != nil {
- return nil, err
- }
-
- return b.Bytes(), nil
-}
diff --git a/src/urubu/driver/parser/token_stream.go b/src/urubu/driver/parser/token_stream.go
deleted file mode 100644
index 788e521..0000000
--- a/src/urubu/driver/parser/token_stream.go
+++ /dev/null
@@ -1,65 +0,0 @@
-package parser
-
-import (
- "io"
-
- "urubu/driver/lexer"
- spec "urubu/spec/grammar"
-)
-
-type vToken struct {
- terminalID int
- tok *lexer.Token
-}
-
-func (t *vToken) TerminalID() int {
- return t.terminalID
-}
-
-func (t *vToken) Lexeme() []byte {
- return t.tok.Lexeme
-}
-
-func (t *vToken) EOF() bool {
- return t.tok.EOF
-}
-
-func (t *vToken) Invalid() bool {
- return t.tok.Invalid
-}
-
-func (t *vToken) BytePosition() (int, int) {
- return t.tok.BytePos, t.tok.ByteLen
-}
-
-func (t *vToken) Position() (int, int) {
- return t.tok.Row, t.tok.Col
-}
-
-type tokenStream struct {
- lex *lexer.Lexer
- kindToTerminal []int
-}
-
-func NewTokenStream(g *spec.CompiledGrammar, src io.Reader) (TokenStream, error) {
- lex, err := lexer.NewLexer(lexer.NewLexSpec(g.Lexical), src)
- if err != nil {
- return nil, err
- }
-
- return &tokenStream{
- lex: lex,
- kindToTerminal: g.Syntactic.KindToTerminal,
- }, nil
-}
-
-func (l *tokenStream) Next() (VToken, error) {
- tok, err := l.lex.Next()
- if err != nil {
- return nil, err
- }
- return &vToken{
- terminalID: l.kindToTerminal[tok.KindID],
- tok: tok,
- }, nil
-}
diff --git a/src/urubu/error/error.go b/src/urubu/error.go
index 0e5d3af..0e5d3af 100644
--- a/src/urubu/error/error.go
+++ b/src/urubu/error.go
diff --git a/src/urubu/grammar.go b/src/urubu/grammar.go
new file mode 100644
index 0000000..6059210
--- /dev/null
+++ b/src/urubu/grammar.go
@@ -0,0 +1,2911 @@
+package grammar
+
+import (
+ "crypto/sha256"
+ "encoding/binary"
+ "encoding/hex"
+ "errors"
+ "fmt"
+ "io"
+ "sort"
+ "strconv"
+ "strings"
+
+ verr "urubu/error"
+ "urubu/grammar/lexical"
+ "urubu/grammar/symbol"
+ spec "urubu/spec/grammar"
+ "urubu/spec/grammar/parser"
+)
+
+type firstEntry struct {
+ symbols map[symbol.Symbol]struct{}
+ empty bool
+}
+
+func newFirstEntry() *firstEntry {
+ return &firstEntry{
+ symbols: map[symbol.Symbol]struct{}{},
+ empty: false,
+ }
+}
+
+func (e *firstEntry) add(sym symbol.Symbol) bool {
+ if _, ok := e.symbols[sym]; ok {
+ return false
+ }
+ e.symbols[sym] = struct{}{}
+ return true
+}
+
+func (e *firstEntry) addEmpty() bool {
+ if !e.empty {
+ e.empty = true
+ return true
+ }
+ return false
+}
+
+func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool {
+ if target == nil {
+ return false
+ }
+ changed := false
+ for sym := range target.symbols {
+ added := e.add(sym)
+ if added {
+ changed = true
+ }
+ }
+ return changed
+}
+
+type firstSet struct {
+ set map[symbol.Symbol]*firstEntry
+}
+
+func newFirstSet(prods *productionSet) *firstSet {
+ fst := &firstSet{
+ set: map[symbol.Symbol]*firstEntry{},
+ }
+ for _, prod := range prods.getAllProductions() {
+ if _, ok := fst.set[prod.lhs]; ok {
+ continue
+ }
+ fst.set[prod.lhs] = newFirstEntry()
+ }
+
+ return fst
+}
+
+func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) {
+ entry := newFirstEntry()
+ if prod.rhsLen <= head {
+ entry.addEmpty()
+ return entry, nil
+ }
+ for _, sym := range prod.rhs[head:] {
+ if sym.IsTerminal() {
+ entry.add(sym)
+ return entry, nil
+ }
+
+ e := fst.findBySymbol(sym)
+ if e == nil {
+ return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym)
+ }
+ for s := range e.symbols {
+ entry.add(s)
+ }
+ if !e.empty {
+ return entry, nil
+ }
+ }
+ entry.addEmpty()
+ return entry, nil
+}
+
+func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry {
+ return fst.set[sym]
+}
+
+type firstComContext struct {
+ first *firstSet
+}
+
+func newFirstComContext(prods *productionSet) *firstComContext {
+ return &firstComContext{
+ first: newFirstSet(prods),
+ }
+}
+
+func genFirstSet(prods *productionSet) (*firstSet, error) {
+ cc := newFirstComContext(prods)
+ for {
+ more := false
+ for _, prod := range prods.getAllProductions() {
+ e := cc.first.findBySymbol(prod.lhs)
+ changed, err := genProdFirstEntry(cc, e, prod)
+ if err != nil {
+ return nil, err
+ }
+ if changed {
+ more = true
+ }
+ }
+ if !more {
+ break
+ }
+ }
+ return cc.first, nil
+}
+
+func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) {
+ if prod.isEmpty() {
+ return acc.addEmpty(), nil
+ }
+
+ for _, sym := range prod.rhs {
+ if sym.IsTerminal() {
+ return acc.add(sym), nil
+ }
+
+ e := cc.first.findBySymbol(sym)
+ changed := acc.mergeExceptEmpty(e)
+ if !e.empty {
+ return changed, nil
+ }
+ }
+ return acc.addEmpty(), nil
+}
+
+type astActionEntry struct {
+ position int
+ expansion bool
+}
+
+type assocType string
+
+const (
+ assocTypeNil = assocType("")
+ assocTypeLeft = assocType("left")
+ assocTypeRight = assocType("right")
+)
+
+const (
+ precNil = 0
+ precMin = 1
+)
+
+// precAndAssoc represents precedence and associativities of terminal symbols and productions.
+// We use the priority of the production to resolve shift/reduce conflicts.
+type precAndAssoc struct {
+ // termPrec and termAssoc represent the precedence of the terminal symbols.
+ termPrec map[symbol.SymbolNum]int
+ termAssoc map[symbol.SymbolNum]assocType
+
+ // prodPrec and prodAssoc represent the precedence and the associativities of the production.
+ // These values are inherited from the right-most terminal symbols in the RHS of the productions.
+ prodPrec map[productionNum]int
+ prodAssoc map[productionNum]assocType
+}
+
+func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int {
+ prec, ok := pa.termPrec[sym]
+ if !ok {
+ return precNil
+ }
+
+ return prec
+}
+
+func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType {
+ assoc, ok := pa.termAssoc[sym]
+ if !ok {
+ return assocTypeNil
+ }
+
+ return assoc
+}
+
+func (pa *precAndAssoc) productionPredence(prod productionNum) int {
+ prec, ok := pa.prodPrec[prod]
+ if !ok {
+ return precNil
+ }
+
+ return prec
+}
+
+func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType {
+ assoc, ok := pa.prodAssoc[prod]
+ if !ok {
+ return assocTypeNil
+ }
+
+ return assoc
+}
+
+const reservedSymbolNameError = "error"
+
+type Grammar struct {
+ name string
+ lexSpec *lexical.LexSpec
+ skipSymbols []symbol.Symbol
+ productionSet *productionSet
+ augmentedStartSymbol symbol.Symbol
+ errorSymbol symbol.Symbol
+ symbolTable *symbol.SymbolTableReader
+ astActions map[productionID][]*astActionEntry
+ precAndAssoc *precAndAssoc
+
+ // recoverProductions is a set of productions having the recover directive.
+ recoverProductions map[productionID]struct{}
+}
+
+type buildConfig struct {
+ isReportingEnabled bool
+}
+
+type BuildOption func(config *buildConfig)
+
+func EnableReporting() BuildOption {
+ return func(config *buildConfig) {
+ config.isReportingEnabled = true
+ }
+}
+
+type GrammarBuilder struct {
+ AST *parser.RootNode
+
+ errs verr.SpecErrors
+}
+
+func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
+ gram, err := b.build()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return compile(gram, opts...)
+}
+
+func (b *GrammarBuilder) build() (*Grammar, error) {
+ var specName string
+ {
+ errOccurred := false
+ for _, dir := range b.AST.Directives {
+ if dir.Name != "name" {
+ continue
+ }
+
+ if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'name' takes just one ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+
+ errOccurred = true
+ break
+ }
+
+ specName = dir.Parameters[0].ID
+ break
+ }
+
+ if specName == "" && !errOccurred {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrNoGrammarName,
+ })
+ }
+ }
+
+ b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST)
+ if len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ symTab, ss, err := b.genSymbolTable(b.AST)
+ if err != nil {
+ return nil, err
+ }
+
+ lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST)
+ if err != nil {
+ return nil, err
+ }
+
+ prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym)
+ if err != nil {
+ return nil, err
+ }
+ if prodsAndActs == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs)
+ if err != nil {
+ return nil, err
+ }
+ if pa == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ syms := findUsedAndUnusedSymbols(b.AST)
+ if syms == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+	// When a terminal symbol that cannot be reached from the start symbol has the skip directive,
+	// the compiler treats the terminal as a used symbol, not an unused one.
+ {
+ r := symTab.Reader()
+ for _, sym := range skip {
+ s, _ := r.ToText(sym)
+ if _, ok := syms.unusedTerminals[s]; !ok {
+ prod := syms.usedTerminals[s]
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrTermCannotBeSkipped,
+ Detail: s,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ continue
+ }
+
+ delete(syms.unusedTerminals, s)
+ }
+ }
+
+ for sym, prod := range syms.unusedProductions {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUnusedProduction,
+ Detail: sym,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ for sym, prod := range syms.unusedTerminals {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUnusedTerminal,
+ Detail: sym,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ if len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ return &Grammar{
+ name: specName,
+ lexSpec: lexSpec,
+ skipSymbols: skip,
+ productionSet: prodsAndActs.prods,
+ augmentedStartSymbol: prodsAndActs.augStartSym,
+ errorSymbol: ss.errSym,
+ symbolTable: symTab.Reader(),
+ astActions: prodsAndActs.astActs,
+ recoverProductions: prodsAndActs.recoverProds,
+ precAndAssoc: pa,
+ }, nil
+}
+
+type usedAndUnusedSymbols struct {
+ unusedProductions map[string]*parser.ProductionNode
+ unusedTerminals map[string]*parser.ProductionNode
+ usedTerminals map[string]*parser.ProductionNode
+}
+
+func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols {
+ prods := map[string]*parser.ProductionNode{}
+ lexProds := map[string]*parser.ProductionNode{}
+ mark := map[string]bool{}
+ {
+ for _, p := range root.Productions {
+ prods[p.LHS] = p
+ mark[p.LHS] = false
+ for _, alt := range p.RHS {
+ for _, e := range alt.Elements {
+ if e.ID == "" {
+ continue
+ }
+ mark[e.ID] = false
+ }
+ }
+ }
+
+ for _, p := range root.LexProductions {
+ lexProds[p.LHS] = p
+ mark[p.LHS] = false
+ }
+
+ start := root.Productions[0]
+ mark[start.LHS] = true
+ markUsedSymbols(mark, map[string]bool{}, prods, start)
+
+ // We don't have to check the error symbol because the error symbol doesn't have a production.
+ delete(mark, reservedSymbolNameError)
+ }
+
+ usedTerms := make(map[string]*parser.ProductionNode, len(lexProds))
+ unusedProds := map[string]*parser.ProductionNode{}
+ unusedTerms := map[string]*parser.ProductionNode{}
+ for sym, used := range mark {
+ if p, ok := prods[sym]; ok {
+ if used {
+ continue
+ }
+ unusedProds[sym] = p
+ continue
+ }
+ if p, ok := lexProds[sym]; ok {
+ if used {
+ usedTerms[sym] = p
+ } else {
+ unusedTerms[sym] = p
+ }
+ continue
+ }
+
+ // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error
+ // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can
+ // ignore it here.
+ }
+
+ return &usedAndUnusedSymbols{
+ usedTerminals: usedTerms,
+ unusedProductions: unusedProds,
+ unusedTerminals: unusedTerms,
+ }
+}
+
+func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) {
+ if marked[prod.LHS] {
+ return
+ }
+
+ for _, alt := range prod.RHS {
+ for _, e := range alt.Elements {
+ if e.ID == "" {
+ continue
+ }
+
+ mark[e.ID] = true
+
+ p, ok := prods[e.ID]
+ if !ok {
+ continue
+ }
+
+			// Mark the production as visited to avoid infinite recursion.
+ marked[prod.LHS] = true
+
+ markUsedSymbols(mark, marked, prods, p)
+ }
+ }
+}
+
+func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) {
+ var ids []string
+ {
+ for _, prod := range root.Productions {
+ ids = append(ids, prod.LHS)
+ for _, alt := range prod.RHS {
+ for _, elem := range alt.Elements {
+ if elem.Label != nil {
+ ids = append(ids, elem.Label.Name)
+ }
+ }
+ }
+ }
+ for _, prod := range root.LexProductions {
+ ids = append(ids, prod.LHS)
+ }
+ for _, dir := range root.Directives {
+ dirIDs := collectUserDefinedIDsFromDirective(dir)
+ if len(dirIDs) > 0 {
+ ids = append(ids, dirIDs...)
+ }
+ }
+ }
+
+ duplicated := lexical.FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return
+ }
+
+ for _, dup := range duplicated {
+ var s string
+ {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ s = b.String()
+ }
+
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrSpellingInconsistency,
+ Detail: s,
+ })
+ }
+}
+
+func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string {
+ var ids []string
+ for _, param := range dir.Parameters {
+ if param.Group != nil {
+ for _, d := range param.Group {
+ dIDs := collectUserDefinedIDsFromDirective(d)
+ if len(dIDs) > 0 {
+ ids = append(ids, dIDs...)
+ }
+ }
+ }
+ if param.OrderedSymbol != "" {
+ ids = append(ids, param.OrderedSymbol)
+ }
+ }
+ return ids
+}
+
+type symbols struct {
+ errSym symbol.Symbol
+ augStartSym symbol.Symbol
+ startSym symbol.Symbol
+}
+
+func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) {
+ symTab := symbol.NewSymbolTable()
+ w := symTab.Writer()
+ r := symTab.Reader()
+
+ // We need to register the reserved symbol before registering others.
+ var errSym symbol.Symbol
+ {
+ sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError)
+ if err != nil {
+ return nil, nil, err
+ }
+ errSym = sym
+ }
+
+ for _, prod := range root.LexProductions {
+ if sym, exist := r.ToSymbol(prod.LHS); exist {
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateTerminal,
+ Detail: prod.LHS,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ continue
+ }
+
+ _, err := w.RegisterTerminalSymbol(prod.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+
+ startProd := root.Productions[0]
+ augStartText := fmt.Sprintf("%s'", startProd.LHS)
+ var err error
+ augStartSym, err := w.RegisterStartSymbol(augStartText)
+ if err != nil {
+ return nil, nil, err
+ }
+ if augStartSym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: startProd.Pos.Row,
+ Col: startProd.Pos.Col,
+ })
+ }
+
+ startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ if startSym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: startProd.Pos.Row,
+ Col: startProd.Pos.Col,
+ })
+ }
+
+ for _, prod := range root.Productions {
+ sym, err := w.RegisterNonTerminalSymbol(prod.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ if sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateName,
+ Detail: prod.LHS,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+ }
+
+ return symTab, &symbols{
+ errSym: errSym,
+ augStartSym: augStartSym,
+ startSym: startSym,
+ }, nil
+}
+
+func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) {
+ entries := []*lexical.LexEntry{}
+ skipSyms := []symbol.Symbol{}
+ for _, prod := range root.LexProductions {
+ entry, skip, specErr, err := genLexEntry(prod)
+ if err != nil {
+ return nil, nil, err
+ }
+ if specErr != nil {
+ b.errs = append(b.errs, specErr)
+ continue
+ }
+ if skip {
+ sym, _ := symTab.ToSymbol(prod.LHS)
+ skipSyms = append(skipSyms, sym)
+ }
+ entries = append(entries, entry)
+ }
+
+ checkedFragments := map[string]struct{}{}
+ for _, fragment := range root.Fragments {
+ if _, exist := checkedFragments[fragment.LHS]; exist {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateFragment,
+ Detail: fragment.LHS,
+ Row: fragment.Pos.Row,
+ Col: fragment.Pos.Col,
+ })
+ continue
+ }
+ checkedFragments[fragment.LHS] = struct{}{}
+
+ entries = append(entries, &lexical.LexEntry{
+ Fragment: true,
+ Kind: spec.LexKindName(fragment.LHS),
+ Pattern: fragment.RHS,
+ })
+ }
+
+ return &lexical.LexSpec{
+ Entries: entries,
+ }, skipSyms, nil
+}
+
+func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) {
+ alt := prod.RHS[0]
+ elem := alt.Elements[0]
+
+ var pattern string
+ if elem.Literally {
+ pattern = spec.EscapePattern(elem.Pattern)
+ } else {
+ pattern = elem.Pattern
+ }
+
+ var modes []spec.LexModeName
+ var skip bool
+ var push spec.LexModeName
+ var pop bool
+ dirConsumed := map[string]struct{}{}
+ for _, dir := range prod.Directives {
+ if _, consumed := dirConsumed[dir.Name]; consumed {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDuplicateDir,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ dirConsumed[dir.Name] = struct{}{}
+
+ switch dir.Name {
+ case "mode":
+ if len(dir.Parameters) == 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'mode' directive needs an ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ for _, param := range dir.Parameters {
+ if param.ID == "" {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'mode' directive needs an ID parameter",
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ }, nil
+ }
+ modes = append(modes, spec.LexModeName(param.ID))
+ }
+ case "skip":
+ if len(dir.Parameters) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'skip' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ skip = true
+ case "push":
+ if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'push' directive needs an ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ push = spec.LexModeName(dir.Parameters[0].ID)
+ case "pop":
+ if len(dir.Parameters) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'pop' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ pop = true
+ default:
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ }
+
+ if len(alt.Directives) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrInvalidAltDir,
+ Detail: "a lexical production cannot have alternative directives",
+ Row: alt.Directives[0].Pos.Row,
+ Col: alt.Directives[0].Pos.Col,
+ }, nil
+ }
+
+ return &lexical.LexEntry{
+ Modes: modes,
+ Kind: spec.LexKindName(prod.LHS),
+ Pattern: pattern,
+ Push: push,
+ Pop: pop,
+ }, skip, nil, nil
+}
+
+type productionsAndActions struct {
+ prods *productionSet
+ augStartSym symbol.Symbol
+ astActs map[productionID][]*astActionEntry
+ prodPrecsTerm map[productionID]symbol.Symbol
+ prodPrecsOrdSym map[productionID]string
+ prodPrecPoss map[productionID]*parser.Position
+ recoverProds map[productionID]struct{}
+}
+
+func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) {
+ if len(root.Productions) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrNoProduction,
+ })
+ return nil, nil
+ }
+
+ prods := newProductionSet()
+ astActs := map[productionID][]*astActionEntry{}
+ prodPrecsTerm := map[productionID]symbol.Symbol{}
+ prodPrecsOrdSym := map[productionID]string{}
+ prodPrecPoss := map[productionID]*parser.Position{}
+ recoverProds := map[productionID]struct{}{}
+
+ p, err := newProduction(augStartSym, []symbol.Symbol{
+ startSym,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ prods.append(p)
+
+ for _, prod := range root.Productions {
+ lhsSym, ok := symTab.ToSymbol(prod.LHS)
+ if !ok {
+ // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here.
+ return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS)
+ }
+
+ if len(prod.Directives) > 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrInvalidProdDir,
+ Detail: "a production cannot have production directives",
+ Row: prod.Directives[0].Pos.Row,
+ Col: prod.Directives[0].Pos.Col,
+ })
+ continue
+ }
+
+ LOOP_RHS:
+ for _, alt := range prod.RHS {
+ altSyms := make([]symbol.Symbol, len(alt.Elements))
+ offsets := map[string]int{}
+ ambiguousIDOffsets := map[string]struct{}{}
+ for i, elem := range alt.Elements {
+ sym, ok := symTab.ToSymbol(elem.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedSym,
+ Detail: elem.ID,
+ Row: elem.Pos.Row,
+ Col: elem.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ altSyms[i] = sym
+
+ if elem.Label != nil {
+ if _, added := offsets[elem.Label.Name]; added {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateLabel,
+ Detail: elem.Label.Name,
+ Row: elem.Label.Pos.Row,
+ Col: elem.Label.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if _, found := symTab.ToSymbol(elem.Label.Name); found {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrInvalidLabel,
+ Detail: elem.Label.Name,
+ Row: elem.Label.Pos.Row,
+ Col: elem.Label.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ offsets[elem.Label.Name] = i
+ }
+ // A symbol having a label can be specified by both the label and the symbol name.
+ // So record the symbol's position, whether or not it has a label.
+ if elem.ID != "" {
+ if _, exist := offsets[elem.ID]; exist {
+ // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need
+ // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify
+ // a label to resolve the ambiguity.
+ delete(offsets, elem.ID)
+ ambiguousIDOffsets[elem.ID] = struct{}{}
+ } else {
+ offsets[elem.ID] = i
+ }
+ }
+ }
+
+ p, err := newProduction(lhsSym, altSyms)
+ if err != nil {
+ return nil, err
+ }
+ if _, exist := prods.findByID(p.id); exist {
+ // Report the line number of a duplicate alternative.
+ // When the alternative is empty, we report the position of its LHS.
+ var row int
+ var col int
+ if len(alt.Elements) > 0 {
+ row = alt.Elements[0].Pos.Row
+ col = alt.Elements[0].Pos.Col
+ } else {
+ row = prod.Pos.Row
+ col = prod.Pos.Col
+ }
+
+ var detail string
+ {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v →", prod.LHS)
+ for _, elem := range alt.Elements {
+ switch {
+ case elem.ID != "":
+ fmt.Fprintf(&b, " %v", elem.ID)
+ case elem.Pattern != "":
+ fmt.Fprintf(&b, ` "%v"`, elem.Pattern)
+ }
+ }
+ if len(alt.Elements) == 0 {
+ fmt.Fprintf(&b, " ε")
+ }
+
+ detail = b.String()
+ }
+
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateProduction,
+ Detail: detail,
+ Row: row,
+ Col: col,
+ })
+ continue LOOP_RHS
+ }
+ prods.append(p)
+
+ dirConsumed := map[string]struct{}{}
+ for _, dir := range alt.Directives {
+ if _, consumed := dirConsumed[dir.Name]; consumed {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateDir,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ }
+ dirConsumed[dir.Name] = struct{}{}
+
+ switch dir.Name {
+ case "ast":
+ if len(dir.Parameters) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'ast' directive needs at least one parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ astAct := make([]*astActionEntry, len(dir.Parameters))
+ consumedOffsets := map[int]struct{}{}
+ for i, param := range dir.Parameters {
+ if param.ID == "" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'ast' directive can take only ID parameters",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+
+ if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrAmbiguousElem,
+ Detail: fmt.Sprintf("'%v' is ambiguous", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+
+ offset, ok := offsets[param.ID]
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if _, consumed := consumedOffsets[offset]; consumed {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateElem,
+ Detail: param.ID,
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ consumedOffsets[offset] = struct{}{}
+
+ if param.Expansion {
+ elem := alt.Elements[offset]
+ if elem.Pattern != "" {
+ // Currently, it is a bug to reach here because it is
+ // forbidden to have anything other than ID appear in
+ // production rules.
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ elemSym, ok := symTab.ToSymbol(elem.ID)
+ if !ok {
+ // If the symbol was not found, it's a bug.
+ return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID)
+ }
+ if elemSym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ }
+
+ astAct[i] = &astActionEntry{
+ position: offset + 1,
+ expansion: param.Expansion,
+ }
+ }
+ astActs[p.id] = astAct
+ case "prec":
+ if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'prec' directive needs just one ID parameter or ordered symbol",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ param := dir.Parameters[0]
+ switch {
+ case param.ID != "":
+ sym, ok := symTab.ToSymbol(param.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ }
+ if !sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ prodPrecsTerm[p.id] = sym
+ prodPrecPoss[p.id] = &param.Pos
+ case param.OrderedSymbol != "":
+ prodPrecsOrdSym[p.id] = param.OrderedSymbol
+ prodPrecPoss[p.id] = &param.Pos
+ }
+ case "recover":
+ if len(dir.Parameters) > 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'recover' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ recoverProds[p.id] = struct{}{}
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name),
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ }
+ }
+ }
+
+ return &productionsAndActions{
+ prods: prods,
+ augStartSym: augStartSym,
+ astActs: astActs,
+ prodPrecsTerm: prodPrecsTerm,
+ prodPrecsOrdSym: prodPrecsOrdSym,
+ prodPrecPoss: prodPrecPoss,
+ recoverProds: recoverProds,
+ }, nil
+}
+
+func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) {
+ termPrec := map[symbol.SymbolNum]int{}
+ termAssoc := map[symbol.SymbolNum]assocType{}
+ ordSymPrec := map[string]int{}
+ {
+ var precGroup []*parser.DirectiveNode
+ for _, dir := range b.AST.Directives {
+ if dir.Name == "prec" {
+ if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'prec' needs just one directive group",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue
+ }
+ precGroup = dir.Parameters[0].Group
+ continue
+ }
+
+ if dir.Name != "name" && dir.Name != "prec" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue
+ }
+ }
+
+ precN := precMin
+ for _, dir := range precGroup {
+ var assocTy assocType
+ switch dir.Name {
+ case "left":
+ assocTy = assocTypeLeft
+ case "right":
+ assocTy = assocTypeRight
+ case "assign":
+ assocTy = assocTypeNil
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ return nil, nil
+ }
+
+ if len(dir.Parameters) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "associativity needs at least one symbol",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ return nil, nil
+ }
+ ASSOC_PARAM_LOOP:
+ for _, p := range dir.Parameters {
+ switch {
+ case p.ID != "":
+ sym, ok := symTab.ToSymbol(p.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' is undefined", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if !sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if prec, alreadySet := termPrec[sym.Num()]; alreadySet {
+ if prec == precN {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else if assoc := termAssoc[sym.Num()]; assoc == assocTy {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has different precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ }
+ break ASSOC_PARAM_LOOP
+ }
+
+ termPrec[sym.Num()] = precN
+ termAssoc[sym.Num()] = assocTy
+ case p.OrderedSymbol != "":
+ if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet {
+ if prec == precN {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ }
+ break ASSOC_PARAM_LOOP
+ }
+
+ ordSymPrec[p.OrderedSymbol] = precN
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "a parameter must be an ID or an ordered symbol",
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ }
+
+ precN++
+ }
+ }
+ if len(b.errs) > 0 {
+ return nil, nil
+ }
+
+ prodPrec := map[productionNum]int{}
+ prodAssoc := map[productionNum]assocType{}
+ for _, prod := range prodsAndActs.prods.getAllProductions() {
+ // A #prec directive changes only precedence, not associativity.
+ if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok {
+ if prec, ok := termPrec[term.Num()]; ok {
+ prodPrec[prod.num] = prec
+ prodAssoc[prod.num] = assocTypeNil
+ } else {
+ text, _ := symTab.ToText(term)
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedPrec,
+ Detail: text,
+ Row: prodsAndActs.prodPrecPoss[prod.id].Row,
+ Col: prodsAndActs.prodPrecPoss[prod.id].Col,
+ })
+ }
+ } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok {
+ if prec, ok := ordSymPrec[ordSym]; ok {
+ prodPrec[prod.num] = prec
+ prodAssoc[prod.num] = assocTypeNil
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedOrdSym,
+ Detail: fmt.Sprintf("$%v", ordSym),
+ Row: prodsAndActs.prodPrecPoss[prod.id].Row,
+ Col: prodsAndActs.prodPrecPoss[prod.id].Col,
+ })
+ }
+ } else {
+ // A production inherits precedence and associativity from the right-most terminal symbol.
+ mostrightTerm := symbol.SymbolNil
+ for _, sym := range prod.rhs {
+ if !sym.IsTerminal() {
+ continue
+ }
+ mostrightTerm = sym
+ }
+ if !mostrightTerm.IsNil() {
+ prodPrec[prod.num] = termPrec[mostrightTerm.Num()]
+ prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()]
+ }
+ }
+ }
+ if len(b.errs) > 0 {
+ return nil, nil
+ }
+
+ return &precAndAssoc{
+ termPrec: termPrec,
+ termAssoc: termAssoc,
+ prodPrec: prodPrec,
+ prodAssoc: prodAssoc,
+ }, nil
+}
+
+func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
+ config := &buildConfig{}
+ for _, opt := range opts {
+ opt(config)
+ }
+
+ lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax)
+ if err != nil {
+ if len(cErrs) > 0 {
+ var b strings.Builder
+ writeCompileError(&b, cErrs[0])
+ for _, cerr := range cErrs[1:] {
+ fmt.Fprintf(&b, "\n")
+ writeCompileError(&b, cerr)
+ }
+ return nil, nil, fmt.Errorf(b.String())
+ }
+ return nil, nil, err
+ }
+
+ kind2Term := make([]int, len(lexSpec.KindNames))
+ for i, k := range lexSpec.KindNames {
+ if k == spec.LexKindNameNil {
+ kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int()
+ continue
+ }
+
+ sym, ok := gram.symbolTable.ToSymbol(k.String())
+ if !ok {
+ return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k)
+ }
+ kind2Term[i] = sym.Num().Int()
+ }
+
+ termTexts, err := gram.symbolTable.TerminalTexts()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var termSkip []int
+ {
+ r := gram.symbolTable.Reader()
+ // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(),
+ // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts
+ // because it does not contain a predefined symbol, like EOF.
+ // Therefore, we use terminalTexts, although it takes more time to lookup for symbols.
+ termSkip = make([]int, len(termTexts))
+ for _, t := range termTexts {
+ s, _ := r.ToSymbol(t)
+ for _, sk := range gram.skipSymbols {
+ if s != sk {
+ continue
+ }
+ termSkip[s.Num()] = 1
+ break
+ }
+ }
+ }
+
+ nonTerms, err := gram.symbolTable.NonTerminalTexts()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ firstSet, err := genFirstSet(gram.productionSet)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var tab *ParsingTable
+ var report *spec.Report
+ {
+ lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ b := &lrTableBuilder{
+ automaton: lalr1.lr0Automaton,
+ prods: gram.productionSet,
+ termCount: len(termTexts),
+ nonTermCount: len(nonTerms),
+ symTab: gram.symbolTable,
+ precAndAssoc: gram.precAndAssoc,
+ }
+ tab, err = b.build()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ if config.isReportingEnabled {
+ report, err = b.genReport(tab, gram)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ }
+
+ action := make([]int, len(tab.actionTable))
+ for i, e := range tab.actionTable {
+ action[i] = int(e)
+ }
+ goTo := make([]int, len(tab.goToTable))
+ for i, e := range tab.goToTable {
+ goTo[i] = int(e)
+ }
+
+ lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1)
+ altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1)
+ recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1)
+ astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1)
+ for _, p := range gram.productionSet.getAllProductions() {
+ lhsSyms[p.num] = p.lhs.Num().Int()
+ altSymCounts[p.num] = p.rhsLen
+
+ if _, ok := gram.recoverProductions[p.id]; ok {
+ recoverProds[p.num] = 1
+ }
+
+ astAct, ok := gram.astActions[p.id]
+ if !ok {
+ continue
+ }
+ astActEntry := make([]int, len(astAct))
+ for i, e := range astAct {
+ if e.expansion {
+ astActEntry[i] = e.position * -1
+ } else {
+ astActEntry[i] = e.position
+ }
+ }
+ astActEnties[p.num] = astActEntry
+ }
+
+ return &spec.CompiledGrammar{
+ Name: gram.name,
+ Lexical: lexSpec,
+ Syntactic: &spec.SyntacticSpec{
+ Action: action,
+ GoTo: goTo,
+ StateCount: tab.stateCount,
+ InitialState: tab.InitialState.Int(),
+ StartProduction: productionNumStart.Int(),
+ LHSSymbols: lhsSyms,
+ AlternativeSymbolCounts: altSymCounts,
+ Terminals: termTexts,
+ TerminalCount: tab.terminalCount,
+ TerminalSkip: termSkip,
+ KindToTerminal: kind2Term,
+ NonTerminals: nonTerms,
+ NonTerminalCount: tab.nonTerminalCount,
+ EOFSymbol: symbol.SymbolEOF.Num().Int(),
+ ErrorSymbol: gram.errorSymbol.Num().Int(),
+ ErrorTrapperStates: tab.errorTrapperStates,
+ RecoverProductions: recoverProds,
+ },
+ ASTAction: &spec.ASTAction{
+ Entries: astActEnties,
+ },
+ }, report, nil
+}
+
+func writeCompileError(w io.Writer, cErr *lexical.CompileError) {
+ if cErr.Fragment {
+ fmt.Fprintf(w, "fragment ")
+ }
+ fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause)
+ if cErr.Detail != "" {
+ fmt.Fprintf(w, ": %v", cErr.Detail)
+ }
+}
+
// lrItemID identifies an LR item by the SHA-256 digest of its production ID
// and dot position.
type lrItemID [32]byte

// String renders the item ID as the lowercase hex of its leading bytes.
func (id lrItemID) String() string {
	return fmt.Sprintf("%x", uint64(id.num()))
}

// num folds the first four bytes of the digest into a uint32; it is used for
// deterministic ordering and for display.
func (id lrItemID) num() uint32 {
	head := id[:4]
	return binary.LittleEndian.Uint32(head)
}
+
// lookAhead holds the set of look-ahead terminal symbols attached to an LR item.
type lookAhead struct {
	symbols map[symbol.Symbol]struct{}

	// When propagation is true, an item propagates look-ahead symbols to other items.
	propagation bool
}

// lrItem is a single LR item: a production plus a dot position and, for
// LALR(1), a set of look-ahead symbols.
type lrItem struct {
	id   lrItemID
	prod productionID

	// E → E + T
	//
	// Dot | Dotted Symbol | Item
	// ----+---------------+------------
	// 0   | E             | E →・E + T
	// 1   | +             | E → E・+ T
	// 2   | T             | E → E +・T
	// 3   | Nil           | E → E + T・
	dot          int
	dottedSymbol symbol.Symbol

	// When initial is true, the LHS of the production is the augmented start symbol and dot is 0.
	// It looks like S' →・S.
	initial bool

	// When reducible is true, the item looks like E → E + T・.
	reducible bool

	// When kernel is true, the item is kernel item.
	kernel bool

	// lookAhead stores look-ahead symbols, and they are terminal symbols.
	// The item is reducible only when the look-ahead symbols appear as the next input symbol.
	lookAhead lookAhead
}
+
+func newLR0Item(prod *production, dot int) (*lrItem, error) {
+ if prod == nil {
+ return nil, fmt.Errorf("production must be non-nil")
+ }
+
+ if dot < 0 || dot > prod.rhsLen {
+ return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen)
+ }
+
+ var id lrItemID
+ {
+ b := []byte{}
+ b = append(b, prod.id[:]...)
+ bDot := make([]byte, 8)
+ binary.LittleEndian.PutUint64(bDot, uint64(dot))
+ b = append(b, bDot...)
+ id = sha256.Sum256(b)
+ }
+
+ dottedSymbol := symbol.SymbolNil
+ if dot < prod.rhsLen {
+ dottedSymbol = prod.rhs[dot]
+ }
+
+ initial := false
+ if prod.lhs.IsStart() && dot == 0 {
+ initial = true
+ }
+
+ reducible := false
+ if dot == prod.rhsLen {
+ reducible = true
+ }
+
+ kernel := false
+ if initial || dot > 0 {
+ kernel = true
+ }
+
+ item := &lrItem{
+ id: id,
+ prod: prod.id,
+ dot: dot,
+ dottedSymbol: dottedSymbol,
+ initial: initial,
+ reducible: reducible,
+ kernel: kernel,
+ }
+
+ return item, nil
+}
+
// kernelID identifies a kernel by the SHA-256 digest of its sorted item IDs.
type kernelID [32]byte

// String renders a short hexadecimal form taken from the digest's first four bytes.
func (id kernelID) String() string {
	n := binary.LittleEndian.Uint32(id[:])
	return fmt.Sprintf("%x", n)
}
+
// kernel is the set of kernel items that defines one LR state, identified by
// a digest of the item IDs.
type kernel struct {
	id    kernelID
	items []*lrItem
}
+
+func newKernel(items []*lrItem) (*kernel, error) {
+ if len(items) == 0 {
+ return nil, fmt.Errorf("a kernel need at least one item")
+ }
+
+ // Remove duplicates from items.
+ var sortedItems []*lrItem
+ {
+ m := map[lrItemID]*lrItem{}
+ for _, item := range items {
+ if !item.kernel {
+ return nil, fmt.Errorf("not a kernel item: %v", item)
+ }
+ m[item.id] = item
+ }
+ sortedItems = []*lrItem{}
+ for _, item := range m {
+ sortedItems = append(sortedItems, item)
+ }
+ sort.Slice(sortedItems, func(i, j int) bool {
+ return sortedItems[i].id.num() < sortedItems[j].id.num()
+ })
+ }
+
+ var id kernelID
+ {
+ b := []byte{}
+ for _, item := range sortedItems {
+ b = append(b, item.id[:]...)
+ }
+ id = sha256.Sum256(b)
+ }
+
+ return &kernel{
+ id: id,
+ items: sortedItems,
+ }, nil
+}
+
// stateNum is the sequential number assigned to an LR automaton state.
type stateNum int

// stateNumInitial is the number of the automaton's initial state.
const stateNumInitial = stateNum(0)

// Int returns the state number as a plain int.
func (n stateNum) Int() int {
	return int(n)
}

// String returns the decimal representation of the state number.
func (n stateNum) String() string {
	return strconv.Itoa(n.Int())
}

// next returns the number following n.
func (n stateNum) next() stateNum {
	return n + 1
}
+
// lrState is one state of the LR automaton: its kernel plus the information
// derived from the kernel's closure.
type lrState struct {
	*kernel
	num stateNum
	// next maps each shiftable symbol to the kernel reached by shifting it.
	next map[symbol.Symbol]kernelID
	// reducible holds the productions this state can reduce by.
	reducible map[productionID]struct{}

	// emptyProdItems stores items that have an empty production like `p → ε` and is reducible.
	// Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store
	// look-ahead symbols because the kernel items don't include these items.
	//
	// For instance, we have the following productions, and A is a terminal symbol.
	//
	//   s' → s
	//   s  → A | ε
	//
	// CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't
	// include `s → ・ε`.
	//
	//   s' → ・s
	//   s  → ・A
	//   s  → ・ε
	emptyProdItems []*lrItem

	// When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form.
	// The `α` and `β` can be empty.
	//
	//   A → α・error β
	isErrorTrapper bool
}
+
// stateAndLRItem addresses a single LR item by the kernel (state) it belongs
// to and its item ID.
type stateAndLRItem struct {
	kernelID kernelID
	itemID   lrItemID
}

// propagation records that look-ahead symbols flow from the src item to each
// of the dest items during LALR(1) look-ahead propagation.
type propagation struct {
	src  *stateAndLRItem
	dest []*stateAndLRItem
}

// lalr1Automaton is an LR(0) automaton whose items have been annotated with
// look-ahead symbols, making it usable for LALR(1) table construction.
type lalr1Automaton struct {
	*lr0Automaton
}
+
// genLALR1Automaton upgrades an LR(0) automaton to LALR(1) by computing the
// look-ahead symbols of every kernel item. For each kernel item it builds an
// LALR(1) closure (see genLALR1Closure) to find spontaneously generated
// look-ahead symbols and propagation links, then runs propagateLookAhead to
// carry symbols along the links until a fixed point.
func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) {
	// Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $]
	iniState := lr0.states[lr0.initialState]
	iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{
		symbol.SymbolEOF: {},
	}

	var props []*propagation
	for _, state := range lr0.states {
		for _, kItem := range state.items {
			items, err := genLALR1Closure(kItem, prods, first)
			if err != nil {
				return nil, err
			}

			// Every kernel item propagates its own look-ahead symbols.
			kItem.lookAhead.propagation = true

			var propDests []*stateAndLRItem
			for _, item := range items {
				if item.reducible {
					p, ok := prods.findByID(item.prod)
					if !ok {
						return nil, fmt.Errorf("production not found: %v", item.prod)
					}

					if p.isEmpty() {
						// Reducible items of empty productions live in
						// state.emptyProdItems (they never appear in a
						// kernel), so merge the look-ahead symbols into that
						// stored copy.
						var reducibleItem *lrItem
						for _, it := range state.emptyProdItems {
							if it.id != item.id {
								continue
							}

							reducibleItem = it
							break
						}
						if reducibleItem == nil {
							return nil, fmt.Errorf("reducible item not found: %v", item.id)
						}
						if reducibleItem.lookAhead.symbols == nil {
							reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
						}
						for a := range item.lookAhead.symbols {
							reducibleItem.lookAhead.symbols[a] = struct{}{}
						}

						propDests = append(propDests, &stateAndLRItem{
							kernelID: state.id,
							itemID:   item.id,
						})
					}

					continue
				}

				// For non-reducible items, the target of spontaneous symbols
				// or of propagation is the successor item (dot advanced by
				// one) in the next state.
				nextKID := state.next[item.dottedSymbol]
				var nextItemID lrItemID
				{
					p, ok := prods.findByID(item.prod)
					if !ok {
						return nil, fmt.Errorf("production not found: %v", item.prod)
					}
					it, err := newLR0Item(p, item.dot+1)
					if err != nil {
						return nil, fmt.Errorf("failed to generate an item ID: %v", err)
					}
					nextItemID = it.id
				}

				if item.lookAhead.propagation {
					// Record a propagation link; symbols flow later in
					// propagateLookAhead.
					propDests = append(propDests, &stateAndLRItem{
						kernelID: nextKID,
						itemID:   nextItemID,
					})
				} else {
					// Spontaneously generated look-ahead symbols are written
					// into the successor item immediately.
					nextState := lr0.states[nextKID]
					var nextItem *lrItem
					for _, it := range nextState.items {
						if it.id != nextItemID {
							continue
						}
						nextItem = it
						break
					}
					if nextItem == nil {
						return nil, fmt.Errorf("item not found: %v", nextItemID)
					}

					if nextItem.lookAhead.symbols == nil {
						nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
					}

					for a := range item.lookAhead.symbols {
						nextItem.lookAhead.symbols[a] = struct{}{}
					}
				}
			}
			if len(propDests) == 0 {
				continue
			}

			props = append(props, &propagation{
				src: &stateAndLRItem{
					kernelID: state.id,
					itemID:   kItem.id,
				},
				dest: propDests,
			})
		}
	}

	err := propagateLookAhead(lr0, props)
	if err != nil {
		return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err)
	}

	return &lalr1Automaton{
		lr0Automaton: lr0,
	}, nil
}
+
// genLALR1Closure computes the LALR(1) closure of a single kernel item.
// Returned items carry either concrete look-ahead symbols (spontaneously
// generated from FIRST sets) or the propagation flag, meaning they receive
// the kernel item's look-ahead symbols via propagation.
func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) {
	items := []*lrItem{}
	// knownItems tracks which (item, look-ahead symbol) pairs were generated;
	// knownItemsProp tracks which items were generated with the propagation flag.
	knownItems := map[lrItemID]map[symbol.Symbol]struct{}{}
	knownItemsProp := map[lrItemID]struct{}{}
	uncheckedItems := []*lrItem{}
	items = append(items, srcItem)
	uncheckedItems = append(uncheckedItems, srcItem)
	for len(uncheckedItems) > 0 {
		nextUncheckedItems := []*lrItem{}
		for _, item := range uncheckedItems {
			// Only a dotted non-terminal contributes new closure items.
			if item.dottedSymbol.IsTerminal() {
				continue
			}

			p, ok := prods.findByID(item.prod)
			if !ok {
				return nil, fmt.Errorf("production not found: %v", item.prod)
			}

			// FIRST of the RHS part following the dotted symbol. When that
			// part can derive the empty string, the item's own look-ahead
			// symbols also belong to the new items' look-ahead sets.
			var fstSyms []symbol.Symbol
			var isFstNullable bool
			{
				fst, err := first.find(p, item.dot+1)
				if err != nil {
					return nil, err
				}

				fstSyms = make([]symbol.Symbol, len(fst.symbols))
				i := 0
				for s := range fst.symbols {
					fstSyms[i] = s
					i++
				}
				if fst.empty {
					isFstNullable = true
				}
			}

			ps, _ := prods.findByLHS(item.dottedSymbol)
			for _, prod := range ps {
				var lookAhead []symbol.Symbol
				{
					var lookAheadCount int
					if isFstNullable {
						lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols)
					} else {
						lookAheadCount = len(fstSyms)
					}

					lookAhead = make([]symbol.Symbol, lookAheadCount)
					i := 0
					for _, s := range fstSyms {
						lookAhead[i] = s
						i++
					}
					if isFstNullable {
						for a := range item.lookAhead.symbols {
							lookAhead[i] = a
							i++
						}
					}
				}

				// Each concrete look-ahead symbol yields its own item so the
				// caller can treat (item, symbol) pairs independently.
				for _, a := range lookAhead {
					newItem, err := newLR0Item(prod, 0)
					if err != nil {
						return nil, err
					}
					if items, exist := knownItems[newItem.id]; exist {
						if _, exist := items[a]; exist {
							continue
						}
					}

					newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{
						a: {},
					}

					items = append(items, newItem)
					if knownItems[newItem.id] == nil {
						knownItems[newItem.id] = map[symbol.Symbol]struct{}{}
					}
					knownItems[newItem.id][a] = struct{}{}
					nextUncheckedItems = append(nextUncheckedItems, newItem)
				}

				// When FIRST is nullable, the new item additionally inherits
				// look-ahead symbols from the source item via propagation.
				if isFstNullable {
					newItem, err := newLR0Item(prod, 0)
					if err != nil {
						return nil, err
					}
					if _, exist := knownItemsProp[newItem.id]; exist {
						continue
					}

					newItem.lookAhead.propagation = true

					items = append(items, newItem)
					knownItemsProp[newItem.id] = struct{}{}
					nextUncheckedItems = append(nextUncheckedItems, newItem)
				}
			}
		}
		uncheckedItems = nextUncheckedItems
	}

	return items, nil
}
+
// propagateLookAhead repeatedly copies look-ahead symbols from each
// propagation source item to its destination items until no set changes
// (a fixed-point iteration).
func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error {
	for {
		changed := false
		for _, prop := range props {
			srcState, ok := lr0.states[prop.src.kernelID]
			if !ok {
				return fmt.Errorf("source state not found: %v", prop.src.kernelID)
			}
			var srcItem *lrItem
			for _, item := range srcState.items {
				if item.id != prop.src.itemID {
					continue
				}
				srcItem = item
				break
			}
			if srcItem == nil {
				return fmt.Errorf("source item not found: %v", prop.src.itemID)
			}

			for _, dest := range prop.dest {
				destState, ok := lr0.states[dest.kernelID]
				if !ok {
					return fmt.Errorf("destination state not found: %v", dest.kernelID)
				}
				var destItem *lrItem
				for _, item := range destState.items {
					if item.id != dest.itemID {
						continue
					}
					destItem = item
					break
				}
				if destItem == nil {
					// The destination may be an empty-production item, which
					// is stored outside the kernel items.
					for _, item := range destState.emptyProdItems {
						if item.id != dest.itemID {
							continue
						}
						destItem = item
						break
					}
					if destItem == nil {
						return fmt.Errorf("destination item not found: %v", dest.itemID)
					}
				}

				for a := range srcItem.lookAhead.symbols {
					if _, ok := destItem.lookAhead.symbols[a]; ok {
						continue
					}

					if destItem.lookAhead.symbols == nil {
						destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
					}

					destItem.lookAhead.symbols[a] = struct{}{}
					changed = true
				}
			}
		}
		if !changed {
			break
		}
	}

	return nil
}
+
// lr0Automaton is the canonical collection of LR(0) states, keyed by kernel ID.
type lr0Automaton struct {
	initialState kernelID
	states       map[kernelID]*lrState
}
+
+func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) {
+ if !startSym.IsStart() {
+ return nil, fmt.Errorf("passed symbold is not a start symbol")
+ }
+
+ automaton := &lr0Automaton{
+ states: map[kernelID]*lrState{},
+ }
+
+ currentState := stateNumInitial
+ knownKernels := map[kernelID]struct{}{}
+ uncheckedKernels := []*kernel{}
+
+ // Generate an initial kernel.
+ {
+ prods, _ := prods.findByLHS(startSym)
+ initialItem, err := newLR0Item(prods[0], 0)
+ if err != nil {
+ return nil, err
+ }
+
+ k, err := newKernel([]*lrItem{initialItem})
+ if err != nil {
+ return nil, err
+ }
+
+ automaton.initialState = k.id
+ knownKernels[k.id] = struct{}{}
+ uncheckedKernels = append(uncheckedKernels, k)
+ }
+
+ for len(uncheckedKernels) > 0 {
+ nextUncheckedKernels := []*kernel{}
+ for _, k := range uncheckedKernels {
+ state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym)
+ if err != nil {
+ return nil, err
+ }
+ state.num = currentState
+ currentState = currentState.next()
+
+ automaton.states[state.id] = state
+
+ for _, k := range neighbours {
+ if _, known := knownKernels[k.id]; known {
+ continue
+ }
+ knownKernels[k.id] = struct{}{}
+ nextUncheckedKernels = append(nextUncheckedKernels, k)
+ }
+ }
+ uncheckedKernels = nextUncheckedKernels
+ }
+
+ return automaton, nil
+}
+
+func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) {
+ items, err := genLR0Closure(k, prods)
+ if err != nil {
+ return nil, nil, err
+ }
+ neighbours, err := genNeighbourKernels(items, prods)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ next := map[symbol.Symbol]kernelID{}
+ kernels := []*kernel{}
+ for _, n := range neighbours {
+ next[n.symbol] = n.kernel.id
+ kernels = append(kernels, n.kernel)
+ }
+
+ reducible := map[productionID]struct{}{}
+ var emptyProdItems []*lrItem
+ isErrorTrapper := false
+ for _, item := range items {
+ if item.dottedSymbol == errSym {
+ isErrorTrapper = true
+ }
+
+ if item.reducible {
+ reducible[item.prod] = struct{}{}
+
+ prod, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod)
+ }
+ if prod.isEmpty() {
+ emptyProdItems = append(emptyProdItems, item)
+ }
+ }
+ }
+
+ return &lrState{
+ kernel: k,
+ next: next,
+ reducible: reducible,
+ emptyProdItems: emptyProdItems,
+ isErrorTrapper: isErrorTrapper,
+ }, kernels, nil
+}
+
+func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) {
+ items := []*lrItem{}
+ knownItems := map[lrItemID]struct{}{}
+ uncheckedItems := []*lrItem{}
+ for _, item := range k.items {
+ items = append(items, item)
+ uncheckedItems = append(uncheckedItems, item)
+ }
+ for len(uncheckedItems) > 0 {
+ nextUncheckedItems := []*lrItem{}
+ for _, item := range uncheckedItems {
+ if item.dottedSymbol.IsTerminal() {
+ continue
+ }
+
+ ps, _ := prods.findByLHS(item.dottedSymbol)
+ for _, prod := range ps {
+ item, err := newLR0Item(prod, 0)
+ if err != nil {
+ return nil, err
+ }
+ if _, exist := knownItems[item.id]; exist {
+ continue
+ }
+ items = append(items, item)
+ knownItems[item.id] = struct{}{}
+ nextUncheckedItems = append(nextUncheckedItems, item)
+ }
+ }
+ uncheckedItems = nextUncheckedItems
+ }
+
+ return items, nil
+}
+
// neighbourKernel is the kernel reached from a state by shifting symbol.
type neighbourKernel struct {
	symbol symbol.Symbol
	kernel *kernel
}
+
// genNeighbourKernels groups the items of a closure by their dotted symbol
// and builds, for each symbol, the kernel reached by shifting it (every
// item's dot advanced by one). Symbols are visited in sorted order so the
// result is deterministic.
func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) {
	kItemMap := map[symbol.Symbol][]*lrItem{}
	for _, item := range items {
		// A dot at the end of the RHS means no outgoing transition.
		if item.dottedSymbol.IsNil() {
			continue
		}
		prod, ok := prods.findByID(item.prod)
		if !ok {
			return nil, fmt.Errorf("a production was not found: %v", item.prod)
		}
		kItem, err := newLR0Item(prod, item.dot+1)
		if err != nil {
			return nil, err
		}
		kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem)
	}

	// Sort the transition symbols to make the kernel order deterministic.
	nextSyms := []symbol.Symbol{}
	for sym := range kItemMap {
		nextSyms = append(nextSyms, sym)
	}
	sort.Slice(nextSyms, func(i, j int) bool {
		return nextSyms[i] < nextSyms[j]
	})

	kernels := []*neighbourKernel{}
	for _, sym := range nextSyms {
		k, err := newKernel(kItemMap[sym])
		if err != nil {
			return nil, err
		}
		kernels = append(kernels, &neighbourKernel{
			symbol: sym,
			kernel: k,
		})
	}

	return kernels, nil
}
+
// ActionType is the kind of an entry in the ACTION table.
type ActionType string

const (
	ActionTypeShift  = ActionType("shift")
	ActionTypeReduce = ActionType("reduce")
	ActionTypeError  = ActionType("error")
)
+
+type actionEntry int
+
+const actionEntryEmpty = actionEntry(0)
+
+func newShiftActionEntry(state stateNum) actionEntry {
+ return actionEntry(state * -1)
+}
+
+func newReduceActionEntry(prod productionNum) actionEntry {
+ return actionEntry(prod)
+}
+
+func (e actionEntry) isEmpty() bool {
+ return e == actionEntryEmpty
+}
+
+func (e actionEntry) describe() (ActionType, stateNum, productionNum) {
+ if e == actionEntryEmpty {
+ return ActionTypeError, stateNumInitial, productionNumNil
+ }
+ if e < 0 {
+ return ActionTypeShift, stateNum(e * -1), productionNumNil
+ }
+ return ActionTypeReduce, stateNumInitial, productionNum(e)
+}
+
+type GoToType string
+
+const (
+ GoToTypeRegistered = GoToType("registered")
+ GoToTypeError = GoToType("error")
+)
+
+type goToEntry uint
+
+const goToEntryEmpty = goToEntry(0)
+
+func newGoToEntry(state stateNum) goToEntry {
+ return goToEntry(state)
+}
+
+func (e goToEntry) describe() (GoToType, stateNum) {
+ if e == goToEntryEmpty {
+ return GoToTypeError, stateNumInitial
+ }
+ return GoToTypeRegistered, stateNum(e)
+}
+
// conflictResolutionMethod identifies which rule resolved a parsing-table conflict.
type conflictResolutionMethod int

// Int returns the method as a plain int (used when serializing reports).
func (m conflictResolutionMethod) Int() int {
	return int(m)
}

const (
	ResolvedByPrec      conflictResolutionMethod = 1 // precedence comparison
	ResolvedByAssoc     conflictResolutionMethod = 2 // associativity
	ResolvedByShift     conflictResolutionMethod = 3 // default preference for shift
	ResolvedByProdOrder conflictResolutionMethod = 4 // earlier-defined production wins
)
+
// conflict is a marker interface for parsing-table conflicts.
type conflict interface {
	conflict()
}

// shiftReduceConflict records a shift/reduce conflict on sym in state,
// together with how it was resolved.
type shiftReduceConflict struct {
	state      stateNum
	sym        symbol.Symbol
	nextState  stateNum
	prodNum    productionNum
	resolvedBy conflictResolutionMethod
}

func (c *shiftReduceConflict) conflict() {
}

// reduceReduceConflict records a reduce/reduce conflict between two
// productions on sym in state, together with how it was resolved.
type reduceReduceConflict struct {
	state      stateNum
	sym        symbol.Symbol
	prodNum1   productionNum
	prodNum2   productionNum
	resolvedBy conflictResolutionMethod
}

func (c *reduceReduceConflict) conflict() {
}

// Compile-time interface satisfaction checks.
var (
	_ conflict = &shiftReduceConflict{}
	_ conflict = &reduceReduceConflict{}
)
+
// ParsingTable holds the LALR(1) ACTION and GOTO tables as flat row-major
// slices: actionTable is indexed by state*terminalCount+terminal, and
// goToTable by state*nonTerminalCount+nonTerminal.
type ParsingTable struct {
	actionTable      []actionEntry
	goToTable        []goToEntry
	stateCount       int
	terminalCount    int
	nonTerminalCount int

	// errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`,
	// the state has an item having the following form. The `α` and `β` can be empty.
	//
	// A → α・error β
	errorTrapperStates []int

	InitialState stateNum
}

// getAction returns the decoded ACTION entry for (state, terminal).
func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) {
	pos := state.Int()*t.terminalCount + sym.Int()
	return t.actionTable[pos].describe()
}

// getGoTo returns the decoded GOTO entry for (state, non-terminal).
func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) {
	pos := state.Int()*t.nonTerminalCount + sym.Int()
	return t.goToTable[pos].describe()
}

// readAction reads the raw ACTION entry at (row, col).
func (t *ParsingTable) readAction(row int, col int) actionEntry {
	return t.actionTable[row*t.terminalCount+col]
}

// writeAction stores a raw ACTION entry at (row, col).
func (t *ParsingTable) writeAction(row int, col int, act actionEntry) {
	t.actionTable[row*t.terminalCount+col] = act
}

// writeGoTo registers a GOTO transition from state on non-terminal sym.
func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) {
	pos := state.Int()*t.nonTerminalCount + sym.Num().Int()
	t.goToTable[pos] = newGoToEntry(nextState)
}
+
// lrTableBuilder constructs a ParsingTable from a look-ahead-annotated LR(0)
// automaton, recording every conflict it resolves along the way.
type lrTableBuilder struct {
	automaton    *lr0Automaton
	prods        *productionSet
	termCount    int
	nonTermCount int
	symTab       *symbol.SymbolTableReader
	precAndAssoc *precAndAssoc

	// conflicts accumulates the shift/reduce and reduce/reduce conflicts
	// encountered while filling the table (used for reporting).
	conflicts []conflict
}
+
// build fills the ACTION and GOTO tables by walking every automaton state:
// transitions on terminals become shift actions, transitions on non-terminals
// become goto entries, and each reducible item contributes a reduce action
// for every look-ahead symbol. Conflicts are detected and resolved inside
// writeShiftAction/writeReduceAction.
func (b *lrTableBuilder) build() (*ParsingTable, error) {
	var ptab *ParsingTable
	{
		initialState := b.automaton.states[b.automaton.initialState]
		ptab = &ParsingTable{
			actionTable:        make([]actionEntry, len(b.automaton.states)*b.termCount),
			goToTable:          make([]goToEntry, len(b.automaton.states)*b.nonTermCount),
			stateCount:         len(b.automaton.states),
			terminalCount:      b.termCount,
			nonTerminalCount:   b.nonTermCount,
			errorTrapperStates: make([]int, len(b.automaton.states)),
			InitialState:       initialState.num,
		}
	}

	for _, state := range b.automaton.states {
		if state.isErrorTrapper {
			ptab.errorTrapperStates[state.num] = 1
		}

		for sym, kID := range state.next {
			nextState := b.automaton.states[kID]
			if sym.IsTerminal() {
				b.writeShiftAction(ptab, state.num, sym, nextState.num)
			} else {
				ptab.writeGoTo(state.num, sym, nextState.num)
			}
		}

		for prodID := range state.reducible {
			reducibleProd, ok := b.prods.findByID(prodID)
			if !ok {
				return nil, fmt.Errorf("reducible production not found: %v", prodID)
			}

			// Find the item carrying the look-ahead symbols: first among the
			// kernel items, then among the empty-production items.
			var reducibleItem *lrItem
			for _, item := range state.items {
				if item.prod != reducibleProd.id {
					continue
				}

				reducibleItem = item
				break
			}
			if reducibleItem == nil {
				for _, item := range state.emptyProdItems {
					if item.prod != reducibleProd.id {
						continue
					}

					reducibleItem = item
					break
				}
				if reducibleItem == nil {
					return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num)
				}
			}

			for a := range reducibleItem.lookAhead.symbols {
				b.writeReduceAction(ptab, state.num, a, reducibleProd.num)
			}
		}
	}

	return ptab, nil
}
+
// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred,
// we prioritize the shift action.
func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) {
	act := tab.readAction(state.Int(), sym.Num().Int())
	if !act.isEmpty() {
		ty, _, p := act.describe()
		if ty == ActionTypeReduce {
			// The cell already holds a reduce action: shift/reduce conflict.
			// Record it and let precedence/associativity pick the winner.
			act, method := b.resolveSRConflict(sym.Num(), p)
			b.conflicts = append(b.conflicts, &shiftReduceConflict{
				state:      state,
				sym:        sym,
				nextState:  nextState,
				prodNum:    p,
				resolvedBy: method,
			})
			if act == ActionTypeShift {
				tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
			}
			return
		}
	}
	tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
}
+
// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred,
// we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces
// the production with higher priority. Productions defined earlier in the grammar file have a higher priority.
func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) {
	act := tab.readAction(state.Int(), sym.Num().Int())
	if !act.isEmpty() {
		ty, s, p := act.describe()
		switch ty {
		case ActionTypeReduce:
			// The same reduce action is already registered; nothing to do.
			if p == prod {
				return
			}

			// Reduce/reduce conflict: the production with the smaller number
			// (defined earlier) wins.
			b.conflicts = append(b.conflicts, &reduceReduceConflict{
				state:      state,
				sym:        sym,
				prodNum1:   p,
				prodNum2:   prod,
				resolvedBy: ResolvedByProdOrder,
			})
			if p < prod {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p))
			} else {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
			}
		case ActionTypeShift:
			// Shift/reduce conflict: precedence/associativity decides.
			act, method := b.resolveSRConflict(sym.Num(), prod)
			b.conflicts = append(b.conflicts, &shiftReduceConflict{
				state:      state,
				sym:        sym,
				nextState:  s,
				prodNum:    prod,
				resolvedBy: method,
			})
			if act == ActionTypeReduce {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
			}
		}
		return
	}
	tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
}
+
+func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) {
+ symPrec := b.precAndAssoc.terminalPrecedence(sym)
+ prodPrec := b.precAndAssoc.productionPredence(prod)
+ if symPrec == 0 || prodPrec == 0 {
+ return ActionTypeShift, ResolvedByShift
+ }
+ if symPrec == prodPrec {
+ assoc := b.precAndAssoc.productionAssociativity(prod)
+ if assoc != assocTypeLeft {
+ return ActionTypeShift, ResolvedByAssoc
+ }
+ return ActionTypeReduce, ResolvedByAssoc
+ }
+ if symPrec < prodPrec {
+ return ActionTypeShift, ResolvedByPrec
+ }
+ return ActionTypeReduce, ResolvedByPrec
+}
+
+func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) {
+ var terms []*spec.Terminal
+ {
+ termSyms := b.symTab.TerminalSymbols()
+ terms = make([]*spec.Terminal, len(termSyms)+1)
+
+ for _, sym := range termSyms {
+ name, ok := b.symTab.ToText(sym)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym)
+ }
+
+ term := &spec.Terminal{
+ Number: sym.Num().Int(),
+ Name: name,
+ }
+
+ prec := b.precAndAssoc.terminalPrecedence(sym.Num())
+ if prec != precNil {
+ term.Precedence = prec
+ }
+
+ assoc := b.precAndAssoc.terminalAssociativity(sym.Num())
+ switch assoc {
+ case assocTypeLeft:
+ term.Associativity = "l"
+ case assocTypeRight:
+ term.Associativity = "r"
+ }
+
+ terms[sym.Num()] = term
+ }
+ }
+
+ var nonTerms []*spec.NonTerminal
+ {
+ nonTermSyms := b.symTab.NonTerminalSymbols()
+ nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1)
+ for _, sym := range nonTermSyms {
+ name, ok := b.symTab.ToText(sym)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym)
+ }
+
+ nonTerms[sym.Num()] = &spec.NonTerminal{
+ Number: sym.Num().Int(),
+ Name: name,
+ }
+ }
+ }
+
+ var prods []*spec.Production
+ {
+ ps := gram.productionSet.getAllProductions()
+ prods = make([]*spec.Production, len(ps)+1)
+ for _, p := range ps {
+ rhs := make([]int, len(p.rhs))
+ for i, e := range p.rhs {
+ if e.IsTerminal() {
+ rhs[i] = e.Num().Int()
+ } else {
+ rhs[i] = e.Num().Int() * -1
+ }
+ }
+
+ prod := &spec.Production{
+ Number: p.num.Int(),
+ LHS: p.lhs.Num().Int(),
+ RHS: rhs,
+ }
+
+ prec := b.precAndAssoc.productionPredence(p.num)
+ if prec != precNil {
+ prod.Precedence = prec
+ }
+
+ assoc := b.precAndAssoc.productionAssociativity(p.num)
+ switch assoc {
+ case assocTypeLeft:
+ prod.Associativity = "l"
+ case assocTypeRight:
+ prod.Associativity = "r"
+ }
+
+ prods[p.num.Int()] = prod
+ }
+ }
+
+ var states []*spec.State
+ {
+ srConflicts := map[stateNum][]*shiftReduceConflict{}
+ rrConflicts := map[stateNum][]*reduceReduceConflict{}
+ for _, con := range b.conflicts {
+ switch c := con.(type) {
+ case *shiftReduceConflict:
+ srConflicts[c.state] = append(srConflicts[c.state], c)
+ case *reduceReduceConflict:
+ rrConflicts[c.state] = append(rrConflicts[c.state], c)
+ }
+ }
+
+ states = make([]*spec.State, len(b.automaton.states))
+ for _, s := range b.automaton.states {
+ kernel := make([]*spec.Item, len(s.items))
+ for i, item := range s.items {
+ p, ok := b.prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod)
+ }
+
+ kernel[i] = &spec.Item{
+ Production: p.num.Int(),
+ Dot: item.dot,
+ }
+ }
+
+ sort.Slice(kernel, func(i, j int) bool {
+ if kernel[i].Production < kernel[j].Production {
+ return true
+ }
+ if kernel[i].Production > kernel[j].Production {
+ return false
+ }
+ return kernel[i].Dot < kernel[j].Dot
+ })
+
+ var shift []*spec.Transition
+ var reduce []*spec.Reduce
+ var goTo []*spec.Transition
+ {
+ TERMINALS_LOOP:
+ for _, t := range b.symTab.TerminalSymbols() {
+ act, next, prod := tab.getAction(s.num, t.Num())
+ switch act {
+ case ActionTypeShift:
+ shift = append(shift, &spec.Transition{
+ Symbol: t.Num().Int(),
+ State: next.Int(),
+ })
+ case ActionTypeReduce:
+ for _, r := range reduce {
+ if r.Production == prod.Int() {
+ r.LookAhead = append(r.LookAhead, t.Num().Int())
+ continue TERMINALS_LOOP
+ }
+ }
+ reduce = append(reduce, &spec.Reduce{
+ LookAhead: []int{t.Num().Int()},
+ Production: prod.Int(),
+ })
+ }
+ }
+
+ for _, n := range b.symTab.NonTerminalSymbols() {
+ ty, next := tab.getGoTo(s.num, n.Num())
+ if ty == GoToTypeRegistered {
+ goTo = append(goTo, &spec.Transition{
+ Symbol: n.Num().Int(),
+ State: next.Int(),
+ })
+ }
+ }
+
+ sort.Slice(shift, func(i, j int) bool {
+ return shift[i].State < shift[j].State
+ })
+ sort.Slice(reduce, func(i, j int) bool {
+ return reduce[i].Production < reduce[j].Production
+ })
+ sort.Slice(goTo, func(i, j int) bool {
+ return goTo[i].State < goTo[j].State
+ })
+ }
+
+ sr := []*spec.SRConflict{}
+ rr := []*spec.RRConflict{}
+ {
+ for _, c := range srConflicts[s.num] {
+ conflict := &spec.SRConflict{
+ Symbol: c.sym.Num().Int(),
+ State: c.nextState.Int(),
+ Production: c.prodNum.Int(),
+ ResolvedBy: c.resolvedBy.Int(),
+ }
+
+ ty, s, p := tab.getAction(s.num, c.sym.Num())
+ switch ty {
+ case ActionTypeShift:
+ n := s.Int()
+ conflict.AdoptedState = &n
+ case ActionTypeReduce:
+ n := p.Int()
+ conflict.AdoptedProduction = &n
+ }
+
+ sr = append(sr, conflict)
+ }
+
+ sort.Slice(sr, func(i, j int) bool {
+ return sr[i].Symbol < sr[j].Symbol
+ })
+
+ for _, c := range rrConflicts[s.num] {
+ conflict := &spec.RRConflict{
+ Symbol: c.sym.Num().Int(),
+ Production1: c.prodNum1.Int(),
+ Production2: c.prodNum2.Int(),
+ ResolvedBy: c.resolvedBy.Int(),
+ }
+
+ _, _, p := tab.getAction(s.num, c.sym.Num())
+ conflict.AdoptedProduction = p.Int()
+
+ rr = append(rr, conflict)
+ }
+
+ sort.Slice(rr, func(i, j int) bool {
+ return rr[i].Symbol < rr[j].Symbol
+ })
+ }
+
+ states[s.num.Int()] = &spec.State{
+ Number: s.num.Int(),
+ Kernel: kernel,
+ Shift: shift,
+ Reduce: reduce,
+ GoTo: goTo,
+ SRConflict: sr,
+ RRConflict: rr,
+ }
+ }
+ }
+
+ return &spec.Report{
+ Terminals: terms,
+ NonTerminals: nonTerms,
+ Productions: prods,
+ States: states,
+ }, nil
+}
+
+type productionID [32]byte
+
+func (id productionID) String() string {
+ return hex.EncodeToString(id[:])
+}
+
+func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID {
+ seq := lhs.Byte()
+ for _, sym := range rhs {
+ seq = append(seq, sym.Byte()...)
+ }
+ return productionID(sha256.Sum256(seq))
+}
+
+type productionNum uint16
+
+const (
+ productionNumNil = productionNum(0)
+ productionNumStart = productionNum(1)
+ productionNumMin = productionNum(2)
+)
+
+func (n productionNum) Int() int {
+ return int(n)
+}
+
+type production struct {
+ id productionID
+ num productionNum
+ lhs symbol.Symbol
+ rhs []symbol.Symbol
+ rhsLen int
+}
+
+func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) {
+ if lhs.IsNil() {
+ return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
+ }
+ for _, sym := range rhs {
+ if sym.IsNil() {
+ return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
+ }
+ }
+
+ return &production{
+ id: genProductionID(lhs, rhs),
+ lhs: lhs,
+ rhs: rhs,
+ rhsLen: len(rhs),
+ }, nil
+}
+
+func (p *production) isEmpty() bool {
+ return p.rhsLen == 0
+}
+
+type productionSet struct {
+ lhs2Prods map[symbol.Symbol][]*production
+ id2Prod map[productionID]*production
+ num productionNum
+}
+
+func newProductionSet() *productionSet {
+ return &productionSet{
+ lhs2Prods: map[symbol.Symbol][]*production{},
+ id2Prod: map[productionID]*production{},
+ num: productionNumMin,
+ }
+}
+
+func (ps *productionSet) append(prod *production) {
+ if _, ok := ps.id2Prod[prod.id]; ok {
+ return
+ }
+
+ if prod.lhs.IsStart() {
+ prod.num = productionNumStart
+ } else {
+ prod.num = ps.num
+ ps.num++
+ }
+
+ if prods, ok := ps.lhs2Prods[prod.lhs]; ok {
+ ps.lhs2Prods[prod.lhs] = append(prods, prod)
+ } else {
+ ps.lhs2Prods[prod.lhs] = []*production{prod}
+ }
+ ps.id2Prod[prod.id] = prod
+}
+
+func (ps *productionSet) findByID(id productionID) (*production, bool) {
+ prod, ok := ps.id2Prod[id]
+ return prod, ok
+}
+
+func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) {
+ if lhs.IsNil() {
+ return nil, false
+ }
+
+ prods, ok := ps.lhs2Prods[lhs]
+ return prods, ok
+}
+
+func (ps *productionSet) getAllProductions() map[productionID]*production {
+ return ps.id2Prod
+}
+
+var (
+ semErrNoGrammarName = errors.New("name is missing")
+ semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling")
+ semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol")
+ semErrUndefinedPrec = errors.New("symbol must has precedence")
+ semErrUndefinedOrdSym = errors.New("undefined ordered symbol")
+ semErrUnusedProduction = errors.New("unused production")
+ semErrUnusedTerminal = errors.New("unused terminal")
+ semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped")
+ semErrNoProduction = errors.New("a grammar needs at least one production")
+ semErrUndefinedSym = errors.New("undefined symbol")
+ semErrDuplicateProduction = errors.New("duplicate production")
+ semErrDuplicateTerminal = errors.New("duplicate terminal")
+ semErrDuplicateFragment = errors.New("duplicate fragment")
+ semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals")
+ semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol")
+ semErrDuplicateLabel = errors.New("a label must be unique in an alternative")
+ semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols")
+ semErrDirInvalidName = errors.New("invalid directive name")
+ semErrDirInvalidParam = errors.New("invalid parameter")
+ semErrDuplicateDir = errors.New("a directive must not be duplicated")
+ semErrDuplicateElem = errors.New("duplicate element")
+ semErrAmbiguousElem = errors.New("ambiguous element")
+ semErrInvalidProdDir = errors.New("invalid production directive")
+ semErrInvalidAltDir = errors.New("invalid alternative directive")
+)
diff --git a/src/urubu/grammar/first.go b/src/urubu/grammar/first.go
deleted file mode 100644
index 6443bcf..0000000
--- a/src/urubu/grammar/first.go
+++ /dev/null
@@ -1,148 +0,0 @@
-package grammar
-
-import (
- "fmt"
-
- "urubu/grammar/symbol"
-)
-
-type firstEntry struct {
- symbols map[symbol.Symbol]struct{}
- empty bool
-}
-
-func newFirstEntry() *firstEntry {
- return &firstEntry{
- symbols: map[symbol.Symbol]struct{}{},
- empty: false,
- }
-}
-
-func (e *firstEntry) add(sym symbol.Symbol) bool {
- if _, ok := e.symbols[sym]; ok {
- return false
- }
- e.symbols[sym] = struct{}{}
- return true
-}
-
-func (e *firstEntry) addEmpty() bool {
- if !e.empty {
- e.empty = true
- return true
- }
- return false
-}
-
-func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool {
- if target == nil {
- return false
- }
- changed := false
- for sym := range target.symbols {
- added := e.add(sym)
- if added {
- changed = true
- }
- }
- return changed
-}
-
-type firstSet struct {
- set map[symbol.Symbol]*firstEntry
-}
-
-func newFirstSet(prods *productionSet) *firstSet {
- fst := &firstSet{
- set: map[symbol.Symbol]*firstEntry{},
- }
- for _, prod := range prods.getAllProductions() {
- if _, ok := fst.set[prod.lhs]; ok {
- continue
- }
- fst.set[prod.lhs] = newFirstEntry()
- }
-
- return fst
-}
-
-func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) {
- entry := newFirstEntry()
- if prod.rhsLen <= head {
- entry.addEmpty()
- return entry, nil
- }
- for _, sym := range prod.rhs[head:] {
- if sym.IsTerminal() {
- entry.add(sym)
- return entry, nil
- }
-
- e := fst.findBySymbol(sym)
- if e == nil {
- return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym)
- }
- for s := range e.symbols {
- entry.add(s)
- }
- if !e.empty {
- return entry, nil
- }
- }
- entry.addEmpty()
- return entry, nil
-}
-
-func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry {
- return fst.set[sym]
-}
-
-type firstComContext struct {
- first *firstSet
-}
-
-func newFirstComContext(prods *productionSet) *firstComContext {
- return &firstComContext{
- first: newFirstSet(prods),
- }
-}
-
-func genFirstSet(prods *productionSet) (*firstSet, error) {
- cc := newFirstComContext(prods)
- for {
- more := false
- for _, prod := range prods.getAllProductions() {
- e := cc.first.findBySymbol(prod.lhs)
- changed, err := genProdFirstEntry(cc, e, prod)
- if err != nil {
- return nil, err
- }
- if changed {
- more = true
- }
- }
- if !more {
- break
- }
- }
- return cc.first, nil
-}
-
-func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) {
- if prod.isEmpty() {
- return acc.addEmpty(), nil
- }
-
- for _, sym := range prod.rhs {
- if sym.IsTerminal() {
- return acc.add(sym), nil
- }
-
- e := cc.first.findBySymbol(sym)
- changed := acc.mergeExceptEmpty(e)
- if !e.empty {
- return changed, nil
- }
- }
- return acc.addEmpty(), nil
-}
diff --git a/src/urubu/grammar/grammar.go b/src/urubu/grammar/grammar.go
deleted file mode 100644
index bfa53c6..0000000
--- a/src/urubu/grammar/grammar.go
+++ /dev/null
@@ -1,1390 +0,0 @@
-package grammar
-
-import (
- "fmt"
- "io"
- "strings"
-
- verr "urubu/error"
- "urubu/grammar/lexical"
- "urubu/grammar/symbol"
- spec "urubu/spec/grammar"
- "urubu/spec/grammar/parser"
-)
-
-type astActionEntry struct {
- position int
- expansion bool
-}
-
-type assocType string
-
-const (
- assocTypeNil = assocType("")
- assocTypeLeft = assocType("left")
- assocTypeRight = assocType("right")
-)
-
-const (
- precNil = 0
- precMin = 1
-)
-
-// precAndAssoc represents precedence and associativities of terminal symbols and productions.
-// We use the priority of the production to resolve shift/reduce conflicts.
-type precAndAssoc struct {
- // termPrec and termAssoc represent the precedence of the terminal symbols.
- termPrec map[symbol.SymbolNum]int
- termAssoc map[symbol.SymbolNum]assocType
-
- // prodPrec and prodAssoc represent the precedence and the associativities of the production.
- // These values are inherited from the right-most terminal symbols in the RHS of the productions.
- prodPrec map[productionNum]int
- prodAssoc map[productionNum]assocType
-}
-
-func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int {
- prec, ok := pa.termPrec[sym]
- if !ok {
- return precNil
- }
-
- return prec
-}
-
-func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType {
- assoc, ok := pa.termAssoc[sym]
- if !ok {
- return assocTypeNil
- }
-
- return assoc
-}
-
-func (pa *precAndAssoc) productionPredence(prod productionNum) int {
- prec, ok := pa.prodPrec[prod]
- if !ok {
- return precNil
- }
-
- return prec
-}
-
-func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType {
- assoc, ok := pa.prodAssoc[prod]
- if !ok {
- return assocTypeNil
- }
-
- return assoc
-}
-
-const reservedSymbolNameError = "error"
-
-type Grammar struct {
- name string
- lexSpec *lexical.LexSpec
- skipSymbols []symbol.Symbol
- productionSet *productionSet
- augmentedStartSymbol symbol.Symbol
- errorSymbol symbol.Symbol
- symbolTable *symbol.SymbolTableReader
- astActions map[productionID][]*astActionEntry
- precAndAssoc *precAndAssoc
-
- // recoverProductions is a set of productions having the recover directive.
- recoverProductions map[productionID]struct{}
-}
-
-type buildConfig struct {
- isReportingEnabled bool
-}
-
-type BuildOption func(config *buildConfig)
-
-func EnableReporting() BuildOption {
- return func(config *buildConfig) {
- config.isReportingEnabled = true
- }
-}
-
-type GrammarBuilder struct {
- AST *parser.RootNode
-
- errs verr.SpecErrors
-}
-
-func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
- gram, err := b.build()
- if err != nil {
- return nil, nil, err
- }
-
- return compile(gram, opts...)
-}
-
-func (b *GrammarBuilder) build() (*Grammar, error) {
- var specName string
- {
- errOccurred := false
- for _, dir := range b.AST.Directives {
- if dir.Name != "name" {
- continue
- }
-
- if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'name' takes just one ID parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
-
- errOccurred = true
- break
- }
-
- specName = dir.Parameters[0].ID
- break
- }
-
- if specName == "" && !errOccurred {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrNoGrammarName,
- })
- }
- }
-
- b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST)
- if len(b.errs) > 0 {
- return nil, b.errs
- }
-
- symTab, ss, err := b.genSymbolTable(b.AST)
- if err != nil {
- return nil, err
- }
-
- lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST)
- if err != nil {
- return nil, err
- }
-
- prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym)
- if err != nil {
- return nil, err
- }
- if prodsAndActs == nil && len(b.errs) > 0 {
- return nil, b.errs
- }
-
- pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs)
- if err != nil {
- return nil, err
- }
- if pa == nil && len(b.errs) > 0 {
- return nil, b.errs
- }
-
- syms := findUsedAndUnusedSymbols(b.AST)
- if syms == nil && len(b.errs) > 0 {
- return nil, b.errs
- }
-
- // When a terminal symbol that cannot be reached from the start symbol has the skip directive,
- // the compiler treats its terminal as a used symbol, not unused.
- {
- r := symTab.Reader()
- for _, sym := range skip {
- s, _ := r.ToText(sym)
- if _, ok := syms.unusedTerminals[s]; !ok {
- prod := syms.usedTerminals[s]
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrTermCannotBeSkipped,
- Detail: s,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- continue
- }
-
- delete(syms.unusedTerminals, s)
- }
- }
-
- for sym, prod := range syms.unusedProductions {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrUnusedProduction,
- Detail: sym,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- }
-
- for sym, prod := range syms.unusedTerminals {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrUnusedTerminal,
- Detail: sym,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- }
-
- if len(b.errs) > 0 {
- return nil, b.errs
- }
-
- return &Grammar{
- name: specName,
- lexSpec: lexSpec,
- skipSymbols: skip,
- productionSet: prodsAndActs.prods,
- augmentedStartSymbol: prodsAndActs.augStartSym,
- errorSymbol: ss.errSym,
- symbolTable: symTab.Reader(),
- astActions: prodsAndActs.astActs,
- recoverProductions: prodsAndActs.recoverProds,
- precAndAssoc: pa,
- }, nil
-}
-
-type usedAndUnusedSymbols struct {
- unusedProductions map[string]*parser.ProductionNode
- unusedTerminals map[string]*parser.ProductionNode
- usedTerminals map[string]*parser.ProductionNode
-}
-
-func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols {
- prods := map[string]*parser.ProductionNode{}
- lexProds := map[string]*parser.ProductionNode{}
- mark := map[string]bool{}
- {
- for _, p := range root.Productions {
- prods[p.LHS] = p
- mark[p.LHS] = false
- for _, alt := range p.RHS {
- for _, e := range alt.Elements {
- if e.ID == "" {
- continue
- }
- mark[e.ID] = false
- }
- }
- }
-
- for _, p := range root.LexProductions {
- lexProds[p.LHS] = p
- mark[p.LHS] = false
- }
-
- start := root.Productions[0]
- mark[start.LHS] = true
- markUsedSymbols(mark, map[string]bool{}, prods, start)
-
- // We don't have to check the error symbol because the error symbol doesn't have a production.
- delete(mark, reservedSymbolNameError)
- }
-
- usedTerms := make(map[string]*parser.ProductionNode, len(lexProds))
- unusedProds := map[string]*parser.ProductionNode{}
- unusedTerms := map[string]*parser.ProductionNode{}
- for sym, used := range mark {
- if p, ok := prods[sym]; ok {
- if used {
- continue
- }
- unusedProds[sym] = p
- continue
- }
- if p, ok := lexProds[sym]; ok {
- if used {
- usedTerms[sym] = p
- } else {
- unusedTerms[sym] = p
- }
- continue
- }
-
- // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error
- // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can
- // ignore it here.
- }
-
- return &usedAndUnusedSymbols{
- usedTerminals: usedTerms,
- unusedProductions: unusedProds,
- unusedTerminals: unusedTerms,
- }
-}
-
-func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) {
- if marked[prod.LHS] {
- return
- }
-
- for _, alt := range prod.RHS {
- for _, e := range alt.Elements {
- if e.ID == "" {
- continue
- }
-
- mark[e.ID] = true
-
- p, ok := prods[e.ID]
- if !ok {
- continue
- }
-
- // Remove a production to avoid inifinite recursion.
- marked[prod.LHS] = true
-
- markUsedSymbols(mark, marked, prods, p)
- }
- }
-}
-
-func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) {
- var ids []string
- {
- for _, prod := range root.Productions {
- ids = append(ids, prod.LHS)
- for _, alt := range prod.RHS {
- for _, elem := range alt.Elements {
- if elem.Label != nil {
- ids = append(ids, elem.Label.Name)
- }
- }
- }
- }
- for _, prod := range root.LexProductions {
- ids = append(ids, prod.LHS)
- }
- for _, dir := range root.Directives {
- dirIDs := collectUserDefinedIDsFromDirective(dir)
- if len(dirIDs) > 0 {
- ids = append(ids, dirIDs...)
- }
- }
- }
-
- duplicated := lexical.FindSpellingInconsistencies(ids)
- if len(duplicated) == 0 {
- return
- }
-
- for _, dup := range duplicated {
- var s string
- {
- var b strings.Builder
- fmt.Fprintf(&b, "%+v", dup[0])
- for _, id := range dup[1:] {
- fmt.Fprintf(&b, ", %+v", id)
- }
- s = b.String()
- }
-
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrSpellingInconsistency,
- Detail: s,
- })
- }
-}
-
-func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string {
- var ids []string
- for _, param := range dir.Parameters {
- if param.Group != nil {
- for _, d := range param.Group {
- dIDs := collectUserDefinedIDsFromDirective(d)
- if len(dIDs) > 0 {
- ids = append(ids, dIDs...)
- }
- }
- }
- if param.OrderedSymbol != "" {
- ids = append(ids, param.OrderedSymbol)
- }
- }
- return ids
-}
-
-type symbols struct {
- errSym symbol.Symbol
- augStartSym symbol.Symbol
- startSym symbol.Symbol
-}
-
-func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) {
- symTab := symbol.NewSymbolTable()
- w := symTab.Writer()
- r := symTab.Reader()
-
- // We need to register the reserved symbol before registering others.
- var errSym symbol.Symbol
- {
- sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError)
- if err != nil {
- return nil, nil, err
- }
- errSym = sym
- }
-
- for _, prod := range root.LexProductions {
- if sym, exist := r.ToSymbol(prod.LHS); exist {
- if sym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrErrSymIsReserved,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- } else {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateTerminal,
- Detail: prod.LHS,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- }
-
- continue
- }
-
- _, err := w.RegisterTerminalSymbol(prod.LHS)
- if err != nil {
- return nil, nil, err
- }
- }
-
- startProd := root.Productions[0]
- augStartText := fmt.Sprintf("%s'", startProd.LHS)
- var err error
- augStartSym, err := w.RegisterStartSymbol(augStartText)
- if err != nil {
- return nil, nil, err
- }
- if augStartSym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrErrSymIsReserved,
- Row: startProd.Pos.Row,
- Col: startProd.Pos.Col,
- })
- }
-
- startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS)
- if err != nil {
- return nil, nil, err
- }
- if startSym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrErrSymIsReserved,
- Row: startProd.Pos.Row,
- Col: startProd.Pos.Col,
- })
- }
-
- for _, prod := range root.Productions {
- sym, err := w.RegisterNonTerminalSymbol(prod.LHS)
- if err != nil {
- return nil, nil, err
- }
- if sym.IsTerminal() {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateName,
- Detail: prod.LHS,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- }
- if sym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrErrSymIsReserved,
- Row: prod.Pos.Row,
- Col: prod.Pos.Col,
- })
- }
- }
-
- return symTab, &symbols{
- errSym: errSym,
- augStartSym: augStartSym,
- startSym: startSym,
- }, nil
-}
-
-func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) {
- entries := []*lexical.LexEntry{}
- skipSyms := []symbol.Symbol{}
- for _, prod := range root.LexProductions {
- entry, skip, specErr, err := genLexEntry(prod)
- if err != nil {
- return nil, nil, err
- }
- if specErr != nil {
- b.errs = append(b.errs, specErr)
- continue
- }
- if skip {
- sym, _ := symTab.ToSymbol(prod.LHS)
- skipSyms = append(skipSyms, sym)
- }
- entries = append(entries, entry)
- }
-
- checkedFragments := map[string]struct{}{}
- for _, fragment := range root.Fragments {
- if _, exist := checkedFragments[fragment.LHS]; exist {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateFragment,
- Detail: fragment.LHS,
- Row: fragment.Pos.Row,
- Col: fragment.Pos.Col,
- })
- continue
- }
- checkedFragments[fragment.LHS] = struct{}{}
-
- entries = append(entries, &lexical.LexEntry{
- Fragment: true,
- Kind: spec.LexKindName(fragment.LHS),
- Pattern: fragment.RHS,
- })
- }
-
- return &lexical.LexSpec{
- Entries: entries,
- }, skipSyms, nil
-}
-
-func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) {
- alt := prod.RHS[0]
- elem := alt.Elements[0]
-
- var pattern string
- if elem.Literally {
- pattern = spec.EscapePattern(elem.Pattern)
- } else {
- pattern = elem.Pattern
- }
-
- var modes []spec.LexModeName
- var skip bool
- var push spec.LexModeName
- var pop bool
- dirConsumed := map[string]struct{}{}
- for _, dir := range prod.Directives {
- if _, consumed := dirConsumed[dir.Name]; consumed {
- return nil, false, &verr.SpecError{
- Cause: semErrDuplicateDir,
- Detail: dir.Name,
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- dirConsumed[dir.Name] = struct{}{}
-
- switch dir.Name {
- case "mode":
- if len(dir.Parameters) == 0 {
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'mode' directive needs an ID parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- for _, param := range dir.Parameters {
- if param.ID == "" {
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'mode' directive needs an ID parameter",
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- }, nil
- }
- modes = append(modes, spec.LexModeName(param.ID))
- }
- case "skip":
- if len(dir.Parameters) > 0 {
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'skip' directive needs no parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- skip = true
- case "push":
- if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'push' directive needs an ID parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- push = spec.LexModeName(dir.Parameters[0].ID)
- case "pop":
- if len(dir.Parameters) > 0 {
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'pop' directive needs no parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- pop = true
- default:
- return nil, false, &verr.SpecError{
- Cause: semErrDirInvalidName,
- Detail: dir.Name,
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- }, nil
- }
- }
-
- if len(alt.Directives) > 0 {
- return nil, false, &verr.SpecError{
- Cause: semErrInvalidAltDir,
- Detail: "a lexical production cannot have alternative directives",
- Row: alt.Directives[0].Pos.Row,
- Col: alt.Directives[0].Pos.Col,
- }, nil
- }
-
- return &lexical.LexEntry{
- Modes: modes,
- Kind: spec.LexKindName(prod.LHS),
- Pattern: pattern,
- Push: push,
- Pop: pop,
- }, skip, nil, nil
-}
-
-type productionsAndActions struct {
- prods *productionSet
- augStartSym symbol.Symbol
- astActs map[productionID][]*astActionEntry
- prodPrecsTerm map[productionID]symbol.Symbol
- prodPrecsOrdSym map[productionID]string
- prodPrecPoss map[productionID]*parser.Position
- recoverProds map[productionID]struct{}
-}
-
-func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) {
- if len(root.Productions) == 0 {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrNoProduction,
- })
- return nil, nil
- }
-
- prods := newProductionSet()
- astActs := map[productionID][]*astActionEntry{}
- prodPrecsTerm := map[productionID]symbol.Symbol{}
- prodPrecsOrdSym := map[productionID]string{}
- prodPrecPoss := map[productionID]*parser.Position{}
- recoverProds := map[productionID]struct{}{}
-
- p, err := newProduction(augStartSym, []symbol.Symbol{
- startSym,
- })
- if err != nil {
- return nil, err
- }
-
- prods.append(p)
-
- for _, prod := range root.Productions {
- lhsSym, ok := symTab.ToSymbol(prod.LHS)
- if !ok {
- // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here.
- return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS)
- }
-
- if len(prod.Directives) > 0 {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrInvalidProdDir,
- Detail: "a production cannot have production directives",
- Row: prod.Directives[0].Pos.Row,
- Col: prod.Directives[0].Pos.Col,
- })
- continue
- }
-
- LOOP_RHS:
- for _, alt := range prod.RHS {
- altSyms := make([]symbol.Symbol, len(alt.Elements))
- offsets := map[string]int{}
- ambiguousIDOffsets := map[string]struct{}{}
- for i, elem := range alt.Elements {
- sym, ok := symTab.ToSymbol(elem.ID)
- if !ok {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrUndefinedSym,
- Detail: elem.ID,
- Row: elem.Pos.Row,
- Col: elem.Pos.Col,
- })
- continue LOOP_RHS
- }
- altSyms[i] = sym
-
- if elem.Label != nil {
- if _, added := offsets[elem.Label.Name]; added {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateLabel,
- Detail: elem.Label.Name,
- Row: elem.Label.Pos.Row,
- Col: elem.Label.Pos.Col,
- })
- continue LOOP_RHS
- }
- if _, found := symTab.ToSymbol(elem.Label.Name); found {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrInvalidLabel,
- Detail: elem.Label.Name,
- Row: elem.Label.Pos.Row,
- Col: elem.Label.Pos.Col,
- })
- continue LOOP_RHS
- }
- offsets[elem.Label.Name] = i
- }
- // A symbol having a label can be specified by both the label and the symbol name.
- // So record the symbol's position, whether or not it has a label.
- if elem.ID != "" {
- if _, exist := offsets[elem.ID]; exist {
- // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need
- // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify
- // a label to resolve the ambiguity.
- delete(offsets, elem.ID)
- ambiguousIDOffsets[elem.ID] = struct{}{}
- } else {
- offsets[elem.ID] = i
- }
- }
- }
-
- p, err := newProduction(lhsSym, altSyms)
- if err != nil {
- return nil, err
- }
- if _, exist := prods.findByID(p.id); exist {
- // Report the line number of a duplicate alternative.
- // When the alternative is empty, we report the position of its LHS.
- var row int
- var col int
- if len(alt.Elements) > 0 {
- row = alt.Elements[0].Pos.Row
- col = alt.Elements[0].Pos.Col
- } else {
- row = prod.Pos.Row
- col = prod.Pos.Col
- }
-
- var detail string
- {
- var b strings.Builder
- fmt.Fprintf(&b, "%v →", prod.LHS)
- for _, elem := range alt.Elements {
- switch {
- case elem.ID != "":
- fmt.Fprintf(&b, " %v", elem.ID)
- case elem.Pattern != "":
- fmt.Fprintf(&b, ` "%v"`, elem.Pattern)
- }
- }
- if len(alt.Elements) == 0 {
- fmt.Fprintf(&b, " ε")
- }
-
- detail = b.String()
- }
-
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateProduction,
- Detail: detail,
- Row: row,
- Col: col,
- })
- continue LOOP_RHS
- }
- prods.append(p)
-
- dirConsumed := map[string]struct{}{}
- for _, dir := range alt.Directives {
- if _, consumed := dirConsumed[dir.Name]; consumed {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateDir,
- Detail: dir.Name,
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- }
- dirConsumed[dir.Name] = struct{}{}
-
- switch dir.Name {
- case "ast":
- if len(dir.Parameters) == 0 {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'ast' directive needs at least one parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue LOOP_RHS
- }
- astAct := make([]*astActionEntry, len(dir.Parameters))
- consumedOffsets := map[int]struct{}{}
- for i, param := range dir.Parameters {
- if param.ID == "" {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'ast' directive can take only ID parameters",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue LOOP_RHS
- }
-
- if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrAmbiguousElem,
- Detail: fmt.Sprintf("'%v' is ambiguous", param.ID),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
-
- offset, ok := offsets[param.ID]
- if !ok {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- if _, consumed := consumedOffsets[offset]; consumed {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateElem,
- Detail: param.ID,
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- consumedOffsets[offset] = struct{}{}
-
- if param.Expansion {
- elem := alt.Elements[offset]
- if elem.Pattern != "" {
- // Currently, it is a bug to reach here because it is
- // forbidden to have anything other than ID appear in
- // production rules.
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- elemSym, ok := symTab.ToSymbol(elem.ID)
- if !ok {
- // If the symbol was not found, it's a bug.
- return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID)
- }
- if elemSym.IsTerminal() {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- }
-
- astAct[i] = &astActionEntry{
- position: offset + 1,
- expansion: param.Expansion,
- }
- }
- astActs[p.id] = astAct
- case "prec":
- if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'prec' directive needs just one ID parameter or ordered symbol",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue LOOP_RHS
- }
- param := dir.Parameters[0]
- switch {
- case param.ID != "":
- sym, ok := symTab.ToSymbol(param.ID)
- if !ok {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- if sym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- }
- if !sym.IsTerminal() {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID),
- Row: param.Pos.Row,
- Col: param.Pos.Col,
- })
- continue LOOP_RHS
- }
- prodPrecsTerm[p.id] = sym
- prodPrecPoss[p.id] = &param.Pos
- case param.OrderedSymbol != "":
- prodPrecsOrdSym[p.id] = param.OrderedSymbol
- prodPrecPoss[p.id] = &param.Pos
- }
- case "recover":
- if len(dir.Parameters) > 0 {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'recover' directive needs no parameter",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue LOOP_RHS
- }
- recoverProds[p.id] = struct{}{}
- default:
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidName,
- Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name),
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue LOOP_RHS
- }
- }
- }
- }
-
- return &productionsAndActions{
- prods: prods,
- augStartSym: augStartSym,
- astActs: astActs,
- prodPrecsTerm: prodPrecsTerm,
- prodPrecsOrdSym: prodPrecsOrdSym,
- prodPrecPoss: prodPrecPoss,
- recoverProds: recoverProds,
- }, nil
-}
-
-func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) {
- termPrec := map[symbol.SymbolNum]int{}
- termAssoc := map[symbol.SymbolNum]assocType{}
- ordSymPrec := map[string]int{}
- {
- var precGroup []*parser.DirectiveNode
- for _, dir := range b.AST.Directives {
- if dir.Name == "prec" {
- if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "'prec' needs just one directive group",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue
- }
- precGroup = dir.Parameters[0].Group
- continue
- }
-
- if dir.Name != "name" && dir.Name != "prec" {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidName,
- Detail: dir.Name,
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- continue
- }
- }
-
- precN := precMin
- for _, dir := range precGroup {
- var assocTy assocType
- switch dir.Name {
- case "left":
- assocTy = assocTypeLeft
- case "right":
- assocTy = assocTypeRight
- case "assign":
- assocTy = assocTypeNil
- default:
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidName,
- Detail: dir.Name,
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- return nil, nil
- }
-
- if len(dir.Parameters) == 0 {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "associativity needs at least one symbol",
- Row: dir.Pos.Row,
- Col: dir.Pos.Col,
- })
- return nil, nil
- }
- ASSOC_PARAM_LOOP:
- for _, p := range dir.Parameters {
- switch {
- case p.ID != "":
- sym, ok := symTab.ToSymbol(p.ID)
- if !ok {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("'%v' is undefined", p.ID),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- return nil, nil
- }
- if sym == errSym {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- return nil, nil
- }
- if !sym.IsTerminal() {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- return nil, nil
- }
- if prec, alreadySet := termPrec[sym.Num()]; alreadySet {
- if prec == precN {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateAssoc,
- Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- } else if assoc := termAssoc[sym.Num()]; assoc == assocTy {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateAssoc,
- Detail: fmt.Sprintf("'%v' already has different precedence", p.ID),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- } else {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateAssoc,
- Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- }
- break ASSOC_PARAM_LOOP
- }
-
- termPrec[sym.Num()] = precN
- termAssoc[sym.Num()] = assocTy
- case p.OrderedSymbol != "":
- if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet {
- if prec == precN {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateAssoc,
- Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- } else {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDuplicateAssoc,
- Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol),
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- }
- break ASSOC_PARAM_LOOP
- }
-
- ordSymPrec[p.OrderedSymbol] = precN
- default:
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrDirInvalidParam,
- Detail: "a parameter must be an ID or an ordered symbol",
- Row: p.Pos.Row,
- Col: p.Pos.Col,
- })
- return nil, nil
- }
- }
-
- precN++
- }
- }
- if len(b.errs) > 0 {
- return nil, nil
- }
-
- prodPrec := map[productionNum]int{}
- prodAssoc := map[productionNum]assocType{}
- for _, prod := range prodsAndActs.prods.getAllProductions() {
- // A #prec directive changes only precedence, not associativity.
- if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok {
- if prec, ok := termPrec[term.Num()]; ok {
- prodPrec[prod.num] = prec
- prodAssoc[prod.num] = assocTypeNil
- } else {
- text, _ := symTab.ToText(term)
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrUndefinedPrec,
- Detail: text,
- Row: prodsAndActs.prodPrecPoss[prod.id].Row,
- Col: prodsAndActs.prodPrecPoss[prod.id].Col,
- })
- }
- } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok {
- if prec, ok := ordSymPrec[ordSym]; ok {
- prodPrec[prod.num] = prec
- prodAssoc[prod.num] = assocTypeNil
- } else {
- b.errs = append(b.errs, &verr.SpecError{
- Cause: semErrUndefinedOrdSym,
- Detail: fmt.Sprintf("$%v", ordSym),
- Row: prodsAndActs.prodPrecPoss[prod.id].Row,
- Col: prodsAndActs.prodPrecPoss[prod.id].Col,
- })
- }
- } else {
- // A production inherits precedence and associativity from the right-most terminal symbol.
- mostrightTerm := symbol.SymbolNil
- for _, sym := range prod.rhs {
- if !sym.IsTerminal() {
- continue
- }
- mostrightTerm = sym
- }
- if !mostrightTerm.IsNil() {
- prodPrec[prod.num] = termPrec[mostrightTerm.Num()]
- prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()]
- }
- }
- }
- if len(b.errs) > 0 {
- return nil, nil
- }
-
- return &precAndAssoc{
- termPrec: termPrec,
- termAssoc: termAssoc,
- prodPrec: prodPrec,
- prodAssoc: prodAssoc,
- }, nil
-}
-
-func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
- config := &buildConfig{}
- for _, opt := range opts {
- opt(config)
- }
-
- lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax)
- if err != nil {
- if len(cErrs) > 0 {
- var b strings.Builder
- writeCompileError(&b, cErrs[0])
- for _, cerr := range cErrs[1:] {
- fmt.Fprintf(&b, "\n")
- writeCompileError(&b, cerr)
- }
- return nil, nil, fmt.Errorf(b.String())
- }
- return nil, nil, err
- }
-
- kind2Term := make([]int, len(lexSpec.KindNames))
- for i, k := range lexSpec.KindNames {
- if k == spec.LexKindNameNil {
- kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int()
- continue
- }
-
- sym, ok := gram.symbolTable.ToSymbol(k.String())
- if !ok {
- return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k)
- }
- kind2Term[i] = sym.Num().Int()
- }
-
- termTexts, err := gram.symbolTable.TerminalTexts()
- if err != nil {
- return nil, nil, err
- }
-
- var termSkip []int
- {
- r := gram.symbolTable.Reader()
- // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(),
- // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts
- // because it does not contain a predefined symbol, like EOF.
- // Therefore, we use terminalTexts, although it takes more time to lookup for symbols.
- termSkip = make([]int, len(termTexts))
- for _, t := range termTexts {
- s, _ := r.ToSymbol(t)
- for _, sk := range gram.skipSymbols {
- if s != sk {
- continue
- }
- termSkip[s.Num()] = 1
- break
- }
- }
- }
-
- nonTerms, err := gram.symbolTable.NonTerminalTexts()
- if err != nil {
- return nil, nil, err
- }
-
- firstSet, err := genFirstSet(gram.productionSet)
- if err != nil {
- return nil, nil, err
- }
-
- lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol)
- if err != nil {
- return nil, nil, err
- }
-
- var tab *ParsingTable
- var report *spec.Report
- {
- lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet)
- if err != nil {
- return nil, nil, err
- }
-
- b := &lrTableBuilder{
- automaton: lalr1.lr0Automaton,
- prods: gram.productionSet,
- termCount: len(termTexts),
- nonTermCount: len(nonTerms),
- symTab: gram.symbolTable,
- precAndAssoc: gram.precAndAssoc,
- }
- tab, err = b.build()
- if err != nil {
- return nil, nil, err
- }
-
- if config.isReportingEnabled {
- report, err = b.genReport(tab, gram)
- if err != nil {
- return nil, nil, err
- }
- }
- }
-
- action := make([]int, len(tab.actionTable))
- for i, e := range tab.actionTable {
- action[i] = int(e)
- }
- goTo := make([]int, len(tab.goToTable))
- for i, e := range tab.goToTable {
- goTo[i] = int(e)
- }
-
- lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1)
- altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1)
- recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1)
- astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1)
- for _, p := range gram.productionSet.getAllProductions() {
- lhsSyms[p.num] = p.lhs.Num().Int()
- altSymCounts[p.num] = p.rhsLen
-
- if _, ok := gram.recoverProductions[p.id]; ok {
- recoverProds[p.num] = 1
- }
-
- astAct, ok := gram.astActions[p.id]
- if !ok {
- continue
- }
- astActEntry := make([]int, len(astAct))
- for i, e := range astAct {
- if e.expansion {
- astActEntry[i] = e.position * -1
- } else {
- astActEntry[i] = e.position
- }
- }
- astActEnties[p.num] = astActEntry
- }
-
- return &spec.CompiledGrammar{
- Name: gram.name,
- Lexical: lexSpec,
- Syntactic: &spec.SyntacticSpec{
- Action: action,
- GoTo: goTo,
- StateCount: tab.stateCount,
- InitialState: tab.InitialState.Int(),
- StartProduction: productionNumStart.Int(),
- LHSSymbols: lhsSyms,
- AlternativeSymbolCounts: altSymCounts,
- Terminals: termTexts,
- TerminalCount: tab.terminalCount,
- TerminalSkip: termSkip,
- KindToTerminal: kind2Term,
- NonTerminals: nonTerms,
- NonTerminalCount: tab.nonTerminalCount,
- EOFSymbol: symbol.SymbolEOF.Num().Int(),
- ErrorSymbol: gram.errorSymbol.Num().Int(),
- ErrorTrapperStates: tab.errorTrapperStates,
- RecoverProductions: recoverProds,
- },
- ASTAction: &spec.ASTAction{
- Entries: astActEnties,
- },
- }, report, nil
-}
-
-func writeCompileError(w io.Writer, cErr *lexical.CompileError) {
- if cErr.Fragment {
- fmt.Fprintf(w, "fragment ")
- }
- fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause)
- if cErr.Detail != "" {
- fmt.Fprintf(w, ": %v", cErr.Detail)
- }
-}
diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go
deleted file mode 100644
index 6c5fe42..0000000
--- a/src/urubu/grammar/item.go
+++ /dev/null
@@ -1,206 +0,0 @@
-package grammar
-
-import (
- "crypto/sha256"
- "encoding/binary"
- "fmt"
- "sort"
- "strconv"
-
- "urubu/grammar/symbol"
-)
-
-type lrItemID [32]byte
-
-func (id lrItemID) String() string {
- return fmt.Sprintf("%x", id.num())
-}
-
-func (id lrItemID) num() uint32 {
- return binary.LittleEndian.Uint32(id[:])
-}
-
-type lookAhead struct {
- symbols map[symbol.Symbol]struct{}
-
- // When propagation is true, an item propagates look-ahead symbols to other items.
- propagation bool
-}
-
-type lrItem struct {
- id lrItemID
- prod productionID
-
- // E → E + T
- //
- // Dot | Dotted Symbol | Item
- // ----+---------------+------------
- // 0 | E | E →・E + T
- // 1 | + | E → E・+ T
- // 2 | T | E → E +・T
- // 3 | Nil | E → E + T・
- dot int
- dottedSymbol symbol.Symbol
-
- // When initial is true, the LHS of the production is the augmented start symbol and dot is 0.
- // It looks like S' →・S.
- initial bool
-
- // When reducible is true, the item looks like E → E + T・.
- reducible bool
-
- // When kernel is true, the item is kernel item.
- kernel bool
-
- // lookAhead stores look-ahead symbols, and they are terminal symbols.
- // The item is reducible only when the look-ahead symbols appear as the next input symbol.
- lookAhead lookAhead
-}
-
-func newLR0Item(prod *production, dot int) (*lrItem, error) {
- if prod == nil {
- return nil, fmt.Errorf("production must be non-nil")
- }
-
- if dot < 0 || dot > prod.rhsLen {
- return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen)
- }
-
- var id lrItemID
- {
- b := []byte{}
- b = append(b, prod.id[:]...)
- bDot := make([]byte, 8)
- binary.LittleEndian.PutUint64(bDot, uint64(dot))
- b = append(b, bDot...)
- id = sha256.Sum256(b)
- }
-
- dottedSymbol := symbol.SymbolNil
- if dot < prod.rhsLen {
- dottedSymbol = prod.rhs[dot]
- }
-
- initial := false
- if prod.lhs.IsStart() && dot == 0 {
- initial = true
- }
-
- reducible := false
- if dot == prod.rhsLen {
- reducible = true
- }
-
- kernel := false
- if initial || dot > 0 {
- kernel = true
- }
-
- item := &lrItem{
- id: id,
- prod: prod.id,
- dot: dot,
- dottedSymbol: dottedSymbol,
- initial: initial,
- reducible: reducible,
- kernel: kernel,
- }
-
- return item, nil
-}
-
-type kernelID [32]byte
-
-func (id kernelID) String() string {
- return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:]))
-}
-
-type kernel struct {
- id kernelID
- items []*lrItem
-}
-
-func newKernel(items []*lrItem) (*kernel, error) {
- if len(items) == 0 {
- return nil, fmt.Errorf("a kernel need at least one item")
- }
-
- // Remove duplicates from items.
- var sortedItems []*lrItem
- {
- m := map[lrItemID]*lrItem{}
- for _, item := range items {
- if !item.kernel {
- return nil, fmt.Errorf("not a kernel item: %v", item)
- }
- m[item.id] = item
- }
- sortedItems = []*lrItem{}
- for _, item := range m {
- sortedItems = append(sortedItems, item)
- }
- sort.Slice(sortedItems, func(i, j int) bool {
- return sortedItems[i].id.num() < sortedItems[j].id.num()
- })
- }
-
- var id kernelID
- {
- b := []byte{}
- for _, item := range sortedItems {
- b = append(b, item.id[:]...)
- }
- id = sha256.Sum256(b)
- }
-
- return &kernel{
- id: id,
- items: sortedItems,
- }, nil
-}
-
-type stateNum int
-
-const stateNumInitial = stateNum(0)
-
-func (n stateNum) Int() int {
- return int(n)
-}
-
-func (n stateNum) String() string {
- return strconv.Itoa(int(n))
-}
-
-func (n stateNum) next() stateNum {
- return stateNum(n + 1)
-}
-
-type lrState struct {
- *kernel
- num stateNum
- next map[symbol.Symbol]kernelID
- reducible map[productionID]struct{}
-
- // emptyProdItems stores items that have an empty production like `p → ε` and is reducible.
- // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store
- // look-ahead symbols because the kernel items don't include these items.
- //
- // For instance, we have the following productions, and A is a terminal symbol.
- //
- // s' → s
- // s → A | ε
- //
- // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't
- // include `s → ・ε`.
- //
- // s' → ・s
- // s → ・A
- // s → ・ε
- emptyProdItems []*lrItem
-
- // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form.
- // The `α` and `β` can be empty.
- //
- // A → α・error β
- isErrorTrapper bool
-}
diff --git a/src/urubu/grammar/lalr1.go b/src/urubu/grammar/lalr1.go
deleted file mode 100644
index 8373568..0000000
--- a/src/urubu/grammar/lalr1.go
+++ /dev/null
@@ -1,318 +0,0 @@
-package grammar
-
-import (
- "fmt"
-
- "urubu/grammar/symbol"
-)
-
-type stateAndLRItem struct {
- kernelID kernelID
- itemID lrItemID
-}
-
-type propagation struct {
- src *stateAndLRItem
- dest []*stateAndLRItem
-}
-
-type lalr1Automaton struct {
- *lr0Automaton
-}
-
-func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) {
- // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $]
- iniState := lr0.states[lr0.initialState]
- iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{
- symbol.SymbolEOF: {},
- }
-
- var props []*propagation
- for _, state := range lr0.states {
- for _, kItem := range state.items {
- items, err := genLALR1Closure(kItem, prods, first)
- if err != nil {
- return nil, err
- }
-
- kItem.lookAhead.propagation = true
-
- var propDests []*stateAndLRItem
- for _, item := range items {
- if item.reducible {
- p, ok := prods.findByID(item.prod)
- if !ok {
- return nil, fmt.Errorf("production not found: %v", item.prod)
- }
-
- if p.isEmpty() {
- var reducibleItem *lrItem
- for _, it := range state.emptyProdItems {
- if it.id != item.id {
- continue
- }
-
- reducibleItem = it
- break
- }
- if reducibleItem == nil {
- return nil, fmt.Errorf("reducible item not found: %v", item.id)
- }
- if reducibleItem.lookAhead.symbols == nil {
- reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
- }
- for a := range item.lookAhead.symbols {
- reducibleItem.lookAhead.symbols[a] = struct{}{}
- }
-
- propDests = append(propDests, &stateAndLRItem{
- kernelID: state.id,
- itemID: item.id,
- })
- }
-
- continue
- }
-
- nextKID := state.next[item.dottedSymbol]
- var nextItemID lrItemID
- {
- p, ok := prods.findByID(item.prod)
- if !ok {
- return nil, fmt.Errorf("production not found: %v", item.prod)
- }
- it, err := newLR0Item(p, item.dot+1)
- if err != nil {
- return nil, fmt.Errorf("failed to generate an item ID: %v", err)
- }
- nextItemID = it.id
- }
-
- if item.lookAhead.propagation {
- propDests = append(propDests, &stateAndLRItem{
- kernelID: nextKID,
- itemID: nextItemID,
- })
- } else {
- nextState := lr0.states[nextKID]
- var nextItem *lrItem
- for _, it := range nextState.items {
- if it.id != nextItemID {
- continue
- }
- nextItem = it
- break
- }
- if nextItem == nil {
- return nil, fmt.Errorf("item not found: %v", nextItemID)
- }
-
- if nextItem.lookAhead.symbols == nil {
- nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
- }
-
- for a := range item.lookAhead.symbols {
- nextItem.lookAhead.symbols[a] = struct{}{}
- }
- }
- }
- if len(propDests) == 0 {
- continue
- }
-
- props = append(props, &propagation{
- src: &stateAndLRItem{
- kernelID: state.id,
- itemID: kItem.id,
- },
- dest: propDests,
- })
- }
- }
-
- err := propagateLookAhead(lr0, props)
- if err != nil {
- return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err)
- }
-
- return &lalr1Automaton{
- lr0Automaton: lr0,
- }, nil
-}
-
-func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) {
- items := []*lrItem{}
- knownItems := map[lrItemID]map[symbol.Symbol]struct{}{}
- knownItemsProp := map[lrItemID]struct{}{}
- uncheckedItems := []*lrItem{}
- items = append(items, srcItem)
- uncheckedItems = append(uncheckedItems, srcItem)
- for len(uncheckedItems) > 0 {
- nextUncheckedItems := []*lrItem{}
- for _, item := range uncheckedItems {
- if item.dottedSymbol.IsTerminal() {
- continue
- }
-
- p, ok := prods.findByID(item.prod)
- if !ok {
- return nil, fmt.Errorf("production not found: %v", item.prod)
- }
-
- var fstSyms []symbol.Symbol
- var isFstNullable bool
- {
- fst, err := first.find(p, item.dot+1)
- if err != nil {
- return nil, err
- }
-
- fstSyms = make([]symbol.Symbol, len(fst.symbols))
- i := 0
- for s := range fst.symbols {
- fstSyms[i] = s
- i++
- }
- if fst.empty {
- isFstNullable = true
- }
- }
-
- ps, _ := prods.findByLHS(item.dottedSymbol)
- for _, prod := range ps {
- var lookAhead []symbol.Symbol
- {
- var lookAheadCount int
- if isFstNullable {
- lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols)
- } else {
- lookAheadCount = len(fstSyms)
- }
-
- lookAhead = make([]symbol.Symbol, lookAheadCount)
- i := 0
- for _, s := range fstSyms {
- lookAhead[i] = s
- i++
- }
- if isFstNullable {
- for a := range item.lookAhead.symbols {
- lookAhead[i] = a
- i++
- }
- }
- }
-
- for _, a := range lookAhead {
- newItem, err := newLR0Item(prod, 0)
- if err != nil {
- return nil, err
- }
- if items, exist := knownItems[newItem.id]; exist {
- if _, exist := items[a]; exist {
- continue
- }
- }
-
- newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{
- a: {},
- }
-
- items = append(items, newItem)
- if knownItems[newItem.id] == nil {
- knownItems[newItem.id] = map[symbol.Symbol]struct{}{}
- }
- knownItems[newItem.id][a] = struct{}{}
- nextUncheckedItems = append(nextUncheckedItems, newItem)
- }
-
- if isFstNullable {
- newItem, err := newLR0Item(prod, 0)
- if err != nil {
- return nil, err
- }
- if _, exist := knownItemsProp[newItem.id]; exist {
- continue
- }
-
- newItem.lookAhead.propagation = true
-
- items = append(items, newItem)
- knownItemsProp[newItem.id] = struct{}{}
- nextUncheckedItems = append(nextUncheckedItems, newItem)
- }
- }
- }
- uncheckedItems = nextUncheckedItems
- }
-
- return items, nil
-}
-
-func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error {
- for {
- changed := false
- for _, prop := range props {
- srcState, ok := lr0.states[prop.src.kernelID]
- if !ok {
- return fmt.Errorf("source state not found: %v", prop.src.kernelID)
- }
- var srcItem *lrItem
- for _, item := range srcState.items {
- if item.id != prop.src.itemID {
- continue
- }
- srcItem = item
- break
- }
- if srcItem == nil {
- return fmt.Errorf("source item not found: %v", prop.src.itemID)
- }
-
- for _, dest := range prop.dest {
- destState, ok := lr0.states[dest.kernelID]
- if !ok {
- return fmt.Errorf("destination state not found: %v", dest.kernelID)
- }
- var destItem *lrItem
- for _, item := range destState.items {
- if item.id != dest.itemID {
- continue
- }
- destItem = item
- break
- }
- if destItem == nil {
- for _, item := range destState.emptyProdItems {
- if item.id != dest.itemID {
- continue
- }
- destItem = item
- break
- }
- if destItem == nil {
- return fmt.Errorf("destination item not found: %v", dest.itemID)
- }
- }
-
- for a := range srcItem.lookAhead.symbols {
- if _, ok := destItem.lookAhead.symbols[a]; ok {
- continue
- }
-
- if destItem.lookAhead.symbols == nil {
- destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
- }
-
- destItem.lookAhead.symbols[a] = struct{}{}
- changed = true
- }
- }
- }
- if !changed {
- break
- }
- }
-
- return nil
-}
diff --git a/src/urubu/grammar/lexical/compiler.go b/src/urubu/grammar/lexical.go
index 637018a..515e491 100644
--- a/src/urubu/grammar/lexical/compiler.go
+++ b/src/urubu/grammar/lexical.go
@@ -3,6 +3,8 @@ package lexical
import (
"bytes"
"fmt"
+ "sort"
+ "strings"
"urubu/compressor"
"urubu/grammar/lexical/dfa"
@@ -411,3 +413,165 @@ func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
}
return ss
}
+
+type LexEntry struct {
+ Kind spec.LexKindName
+ Pattern string
+ Modes []spec.LexModeName
+ Push spec.LexModeName
+ Pop bool
+ Fragment bool
+}
+
+type LexSpec struct {
+ Entries []*LexEntry
+}
+
+func (s *LexSpec) Validate() error {
+ if len(s.Entries) <= 0 {
+ return fmt.Errorf("the lexical specification must have at least one entry")
+ }
+ {
+ ks := map[string]struct{}{}
+ fks := map[string]struct{}{}
+ for _, e := range s.Entries {
+ // Allow duplicate names between fragments and non-fragments.
+ if e.Fragment {
+ if _, exist := fks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ fks[e.Kind.String()] = struct{}{}
+ } else {
+ if _, exist := ks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ ks[e.Kind.String()] = struct{}{}
+ }
+ }
+ }
+ {
+ kinds := []string{}
+ modes := []string{
+ spec.LexModeNameDefault.String(), // This is a predefined mode.
+ }
+ for _, e := range s.Entries {
+ if e.Fragment {
+ continue
+ }
+
+ kinds = append(kinds, e.Kind.String())
+
+ for _, m := range e.Modes {
+ modes = append(modes, m.String())
+ }
+ }
+
+ kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+ modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+ if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", ids[0])
+ for _, id := range ids[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
+ }
+ return nil
+ })
+ errs := append(kindErrs, modeErrs...)
+ if len(errs) > 0 {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v", errs[0])
+ for _, err := range errs[1:] {
+ fmt.Fprintf(&b, "\n%v", err)
+ }
+ return fmt.Errorf(b.String())
+ }
+ }
+
+ return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+ duplicated := FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return nil
+ }
+
+ var errs []error
+ for _, dup := range duplicated {
+ if hook != nil {
+ err := hook(dup)
+ if err != nil {
+ errs = append(errs, err)
+ continue
+ }
+ }
+
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+ errs = append(errs, err)
+ }
+
+ return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considere to be spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string {
+ m := map[string][]string{}
+ for _, id := range removeDuplicates(ids) {
+ c := SnakeCaseToUpperCamelCase(id)
+ m[c] = append(m[c], id)
+ }
+
+ var duplicated [][]string
+ for _, camels := range m {
+ if len(camels) == 1 {
+ continue
+ }
+ duplicated = append(duplicated, camels)
+ }
+
+ for _, dup := range duplicated {
+ sort.Slice(dup, func(i, j int) bool {
+ return dup[i] < dup[j]
+ })
+ }
+ sort.Slice(duplicated, func(i, j int) bool {
+ return duplicated[i][0] < duplicated[j][0]
+ })
+
+ return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+ m := map[string]struct{}{}
+ for _, v := range s {
+ m[v] = struct{}{}
+ }
+
+ var unique []string
+ for v := range m {
+ unique = append(unique, v)
+ }
+
+ return unique
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+ elems := strings.Split(snake, "_")
+ for i, e := range elems {
+ if len(e) == 0 {
+ continue
+ }
+ elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+ }
+
+ return strings.Join(elems, "")
+}
diff --git a/src/urubu/grammar/lexical/dfa/tree.go b/src/urubu/grammar/lexical/dfa.go
index 8a11aee..982420d 100644
--- a/src/urubu/grammar/lexical/dfa/tree.go
+++ b/src/urubu/grammar/lexical/dfa.go
@@ -1,15 +1,358 @@
package dfa
import (
+ "encoding/binary"
"fmt"
"io"
"sort"
+ "strings"
"urubu/grammar/lexical/parser"
spec "urubu/spec/grammar"
"urubu/utf8"
)
+type symbolTable struct {
+ symPos2Byte map[symbolPosition]byteRange
+ endPos2ID map[symbolPosition]spec.LexModeKindID
+}
+
+func genSymbolTable(root byteTree) *symbolTable {
+ symTab := &symbolTable{
+ symPos2Byte: map[symbolPosition]byteRange{},
+ endPos2ID: map[symbolPosition]spec.LexModeKindID{},
+ }
+ return genSymTab(symTab, root)
+}
+
+func genSymTab(symTab *symbolTable, node byteTree) *symbolTable {
+ if node == nil {
+ return symTab
+ }
+
+ switch n := node.(type) {
+ case *symbolNode:
+ symTab.symPos2Byte[n.pos] = byteRange{
+ from: n.from,
+ to: n.to,
+ }
+ case *endMarkerNode:
+ symTab.endPos2ID[n.pos] = n.id
+ default:
+ left, right := node.children()
+ genSymTab(symTab, left)
+ genSymTab(symTab, right)
+ }
+ return symTab
+}
+
+type DFA struct {
+ States []string
+ InitialState string
+ AcceptingStatesTable map[string]spec.LexModeKindID
+ TransitionTable map[string][256]string
+}
+
+func GenDFA(root byteTree, symTab *symbolTable) *DFA {
+ initialState := root.first()
+ initialStateHash := initialState.hash()
+ stateMap := map[string]*symbolPositionSet{
+ initialStateHash: initialState,
+ }
+ tranTab := map[string][256]string{}
+ {
+ follow := genFollowTable(root)
+ unmarkedStates := map[string]*symbolPositionSet{
+ initialStateHash: initialState,
+ }
+ for len(unmarkedStates) > 0 {
+ nextUnmarkedStates := map[string]*symbolPositionSet{}
+ for hash, state := range unmarkedStates {
+ tranTabOfState := [256]*symbolPositionSet{}
+ for _, pos := range state.set() {
+ if pos.isEndMark() {
+ continue
+ }
+ valRange := symTab.symPos2Byte[pos]
+ for symVal := valRange.from; symVal <= valRange.to; symVal++ {
+ if tranTabOfState[symVal] == nil {
+ tranTabOfState[symVal] = newSymbolPositionSet()
+ }
+ tranTabOfState[symVal].merge(follow[pos])
+ }
+ }
+ for _, t := range tranTabOfState {
+ if t == nil {
+ continue
+ }
+ h := t.hash()
+ if _, ok := stateMap[h]; ok {
+ continue
+ }
+ stateMap[h] = t
+ nextUnmarkedStates[h] = t
+ }
+ tabOfState := [256]string{}
+ for v, t := range tranTabOfState {
+ if t == nil {
+ continue
+ }
+ tabOfState[v] = t.hash()
+ }
+ tranTab[hash] = tabOfState
+ }
+ unmarkedStates = nextUnmarkedStates
+ }
+ }
+
+ accTab := map[string]spec.LexModeKindID{}
+ {
+ for h, s := range stateMap {
+ for _, pos := range s.set() {
+ if !pos.isEndMark() {
+ continue
+ }
+ priorID, ok := accTab[h]
+ if !ok {
+ accTab[h] = symTab.endPos2ID[pos]
+ } else {
+ id := symTab.endPos2ID[pos]
+ if id < priorID {
+ accTab[h] = id
+ }
+ }
+ }
+ }
+ }
+
+ var states []string
+ {
+ for s := range stateMap {
+ states = append(states, s)
+ }
+ sort.Slice(states, func(i, j int) bool {
+ return states[i] < states[j]
+ })
+ }
+
+ return &DFA{
+ States: states,
+ InitialState: initialStateHash,
+ AcceptingStatesTable: accTab,
+ TransitionTable: tranTab,
+ }
+}
+
+func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) {
+ stateHash2ID := map[string]spec.StateID{}
+ for i, s := range dfa.States {
+ // Since 0 represents an invalid value in a transition table,
+ // assign a number greater than or equal to 1 to states.
+ stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
+ }
+
+ acc := make([]spec.LexModeKindID, len(dfa.States)+1)
+ for _, s := range dfa.States {
+ id, ok := dfa.AcceptingStatesTable[s]
+ if !ok {
+ continue
+ }
+ acc[stateHash2ID[s]] = id
+ }
+
+ rowCount := len(dfa.States) + 1
+ colCount := 256
+ tran := make([]spec.StateID, rowCount*colCount)
+ for s, tab := range dfa.TransitionTable {
+ for v, to := range tab {
+ tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
+ }
+ }
+
+ return &spec.TransitionTable{
+ InitialStateID: stateHash2ID[dfa.InitialState],
+ AcceptingStates: acc,
+ UncompressedTransition: tran,
+ RowCount: rowCount,
+ ColCount: colCount,
+ }, nil
+}
+
+type symbolPosition uint16
+
+const (
+ symbolPositionNil symbolPosition = 0x0000
+
+ symbolPositionMin uint16 = 0x0001
+ symbolPositionMax uint16 = 0x7fff
+
+ symbolPositionMaskSymbol uint16 = 0x0000
+ symbolPositionMaskEndMark uint16 = 0x8000
+
+ symbolPositionMaskValue uint16 = 0x7fff
+)
+
+func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) {
+ if n < symbolPositionMin || n > symbolPositionMax {
+ return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark)
+ }
+ if endMark {
+ return symbolPosition(n | symbolPositionMaskEndMark), nil
+ }
+ return symbolPosition(n | symbolPositionMaskSymbol), nil
+}
+
+func (p symbolPosition) String() string {
+ if p.isEndMark() {
+ return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue)
+ }
+ return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue)
+}
+
+func (p symbolPosition) isEndMark() bool {
+ return uint16(p)&symbolPositionMaskEndMark > 1
+}
+
+func (p symbolPosition) describe() (uint16, bool) {
+ v := uint16(p) & symbolPositionMaskValue
+ if p.isEndMark() {
+ return v, true
+ }
+ return v, false
+}
+
+type symbolPositionSet struct {
+ // `s` represents a set of symbol positions.
+ // However, immediately after adding a symbol position, the elements may be duplicated.
+ // When you need an aligned set with no duplicates, you can get such value via the set function.
+ s []symbolPosition
+ sorted bool
+}
+
+func newSymbolPositionSet() *symbolPositionSet {
+ return &symbolPositionSet{
+ s: []symbolPosition{},
+ sorted: false,
+ }
+}
+
+func (s *symbolPositionSet) String() string {
+ if len(s.s) <= 0 {
+ return "{}"
+ }
+ ps := s.sortAndRemoveDuplicates()
+ var b strings.Builder
+ fmt.Fprintf(&b, "{")
+ for i, p := range ps {
+ if i <= 0 {
+ fmt.Fprintf(&b, "%v", p)
+ continue
+ }
+ fmt.Fprintf(&b, ", %v", p)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+}
+
+func (s *symbolPositionSet) set() []symbolPosition {
+ s.sortAndRemoveDuplicates()
+ return s.s
+}
+
+func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet {
+ s.s = append(s.s, pos)
+ s.sorted = false
+ return s
+}
+
+func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet {
+ s.s = append(s.s, t.s...)
+ s.sorted = false
+ return s
+}
+
+func (s *symbolPositionSet) hash() string {
+ if len(s.s) <= 0 {
+ return ""
+ }
+ sorted := s.sortAndRemoveDuplicates()
+ var buf []byte
+ for _, p := range sorted {
+ b := make([]byte, 8)
+ binary.PutUvarint(b, uint64(p))
+ buf = append(buf, b...)
+ }
+ // Convert to a string to be able to use it as a key of a map.
+ // But note this byte sequence is made from values of symbol positions,
+ // so this is not a well-formed UTF-8 sequence.
+ return string(buf)
+}
+
+func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition {
+ if s.sorted {
+ return s.s
+ }
+
+ sortSymbolPositions(s.s, 0, len(s.s)-1)
+
+ // Remove duplicates.
+ lastV := s.s[0]
+ nextIdx := 1
+ for _, v := range s.s[1:] {
+ if v == lastV {
+ continue
+ }
+ s.s[nextIdx] = v
+ nextIdx++
+ lastV = v
+ }
+ s.s = s.s[:nextIdx]
+ s.sorted = true
+
+ return s.s
+}
+
+// sortSymbolPositions sorts a slice of symbol positions using quicksort.
+func sortSymbolPositions(ps []symbolPosition, left, right int) {
+ if left >= right {
+ return
+ }
+ var pivot symbolPosition
+ {
+ // Use a median as a pivot.
+ p1 := ps[left]
+ p2 := ps[(left+right)/2]
+ p3 := ps[right]
+ if p1 > p2 {
+ p1, p2 = p2, p1
+ }
+ if p2 > p3 {
+ p2 = p3
+ if p1 > p2 {
+ p2 = p1
+ }
+ }
+ pivot = p2
+ }
+ i := left
+ j := right
+ for i <= j {
+ for ps[i] < pivot {
+ i++
+ }
+ for ps[j] > pivot {
+ j--
+ }
+ if i <= j {
+ ps[i], ps[j] = ps[j], ps[i]
+ i++
+ j--
+ }
+ }
+ sortSymbolPositions(ps, left, j)
+ sortSymbolPositions(ps, i, right)
+}
+
type byteTree interface {
fmt.Stringer
children() (byteTree, byteTree)
diff --git a/src/urubu/grammar/lexical/dfa/dfa.go b/src/urubu/grammar/lexical/dfa/dfa.go
deleted file mode 100644
index 48bd8b4..0000000
--- a/src/urubu/grammar/lexical/dfa/dfa.go
+++ /dev/null
@@ -1,173 +0,0 @@
-package dfa
-
-import (
- "sort"
-
- spec "urubu/spec/grammar"
-)
-
-type symbolTable struct {
- symPos2Byte map[symbolPosition]byteRange
- endPos2ID map[symbolPosition]spec.LexModeKindID
-}
-
-func genSymbolTable(root byteTree) *symbolTable {
- symTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byteRange{},
- endPos2ID: map[symbolPosition]spec.LexModeKindID{},
- }
- return genSymTab(symTab, root)
-}
-
-func genSymTab(symTab *symbolTable, node byteTree) *symbolTable {
- if node == nil {
- return symTab
- }
-
- switch n := node.(type) {
- case *symbolNode:
- symTab.symPos2Byte[n.pos] = byteRange{
- from: n.from,
- to: n.to,
- }
- case *endMarkerNode:
- symTab.endPos2ID[n.pos] = n.id
- default:
- left, right := node.children()
- genSymTab(symTab, left)
- genSymTab(symTab, right)
- }
- return symTab
-}
-
-type DFA struct {
- States []string
- InitialState string
- AcceptingStatesTable map[string]spec.LexModeKindID
- TransitionTable map[string][256]string
-}
-
-func GenDFA(root byteTree, symTab *symbolTable) *DFA {
- initialState := root.first()
- initialStateHash := initialState.hash()
- stateMap := map[string]*symbolPositionSet{
- initialStateHash: initialState,
- }
- tranTab := map[string][256]string{}
- {
- follow := genFollowTable(root)
- unmarkedStates := map[string]*symbolPositionSet{
- initialStateHash: initialState,
- }
- for len(unmarkedStates) > 0 {
- nextUnmarkedStates := map[string]*symbolPositionSet{}
- for hash, state := range unmarkedStates {
- tranTabOfState := [256]*symbolPositionSet{}
- for _, pos := range state.set() {
- if pos.isEndMark() {
- continue
- }
- valRange := symTab.symPos2Byte[pos]
- for symVal := valRange.from; symVal <= valRange.to; symVal++ {
- if tranTabOfState[symVal] == nil {
- tranTabOfState[symVal] = newSymbolPositionSet()
- }
- tranTabOfState[symVal].merge(follow[pos])
- }
- }
- for _, t := range tranTabOfState {
- if t == nil {
- continue
- }
- h := t.hash()
- if _, ok := stateMap[h]; ok {
- continue
- }
- stateMap[h] = t
- nextUnmarkedStates[h] = t
- }
- tabOfState := [256]string{}
- for v, t := range tranTabOfState {
- if t == nil {
- continue
- }
- tabOfState[v] = t.hash()
- }
- tranTab[hash] = tabOfState
- }
- unmarkedStates = nextUnmarkedStates
- }
- }
-
- accTab := map[string]spec.LexModeKindID{}
- {
- for h, s := range stateMap {
- for _, pos := range s.set() {
- if !pos.isEndMark() {
- continue
- }
- priorID, ok := accTab[h]
- if !ok {
- accTab[h] = symTab.endPos2ID[pos]
- } else {
- id := symTab.endPos2ID[pos]
- if id < priorID {
- accTab[h] = id
- }
- }
- }
- }
- }
-
- var states []string
- {
- for s := range stateMap {
- states = append(states, s)
- }
- sort.Slice(states, func(i, j int) bool {
- return states[i] < states[j]
- })
- }
-
- return &DFA{
- States: states,
- InitialState: initialStateHash,
- AcceptingStatesTable: accTab,
- TransitionTable: tranTab,
- }
-}
-
-func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) {
- stateHash2ID := map[string]spec.StateID{}
- for i, s := range dfa.States {
- // Since 0 represents an invalid value in a transition table,
- // assign a number greater than or equal to 1 to states.
- stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
- }
-
- acc := make([]spec.LexModeKindID, len(dfa.States)+1)
- for _, s := range dfa.States {
- id, ok := dfa.AcceptingStatesTable[s]
- if !ok {
- continue
- }
- acc[stateHash2ID[s]] = id
- }
-
- rowCount := len(dfa.States) + 1
- colCount := 256
- tran := make([]spec.StateID, rowCount*colCount)
- for s, tab := range dfa.TransitionTable {
- for v, to := range tab {
- tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
- }
- }
-
- return &spec.TransitionTable{
- InitialStateID: stateHash2ID[dfa.InitialState],
- AcceptingStates: acc,
- UncompressedTransition: tran,
- RowCount: rowCount,
- ColCount: colCount,
- }, nil
-}
diff --git a/src/urubu/grammar/lexical/dfa/symbol_position.go b/src/urubu/grammar/lexical/dfa/symbol_position.go
deleted file mode 100644
index f154251..0000000
--- a/src/urubu/grammar/lexical/dfa/symbol_position.go
+++ /dev/null
@@ -1,182 +0,0 @@
-package dfa
-
-import (
- "encoding/binary"
- "fmt"
- "strings"
-)
-
-type symbolPosition uint16
-
-const (
- symbolPositionNil symbolPosition = 0x0000
-
- symbolPositionMin uint16 = 0x0001
- symbolPositionMax uint16 = 0x7fff
-
- symbolPositionMaskSymbol uint16 = 0x0000
- symbolPositionMaskEndMark uint16 = 0x8000
-
- symbolPositionMaskValue uint16 = 0x7fff
-)
-
-func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) {
- if n < symbolPositionMin || n > symbolPositionMax {
- return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark)
- }
- if endMark {
- return symbolPosition(n | symbolPositionMaskEndMark), nil
- }
- return symbolPosition(n | symbolPositionMaskSymbol), nil
-}
-
-func (p symbolPosition) String() string {
- if p.isEndMark() {
- return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue)
- }
- return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue)
-}
-
-func (p symbolPosition) isEndMark() bool {
- return uint16(p)&symbolPositionMaskEndMark > 1
-}
-
-func (p symbolPosition) describe() (uint16, bool) {
- v := uint16(p) & symbolPositionMaskValue
- if p.isEndMark() {
- return v, true
- }
- return v, false
-}
-
-type symbolPositionSet struct {
- // `s` represents a set of symbol positions.
- // However, immediately after adding a symbol position, the elements may be duplicated.
- // When you need an aligned set with no duplicates, you can get such value via the set function.
- s []symbolPosition
- sorted bool
-}
-
-func newSymbolPositionSet() *symbolPositionSet {
- return &symbolPositionSet{
- s: []symbolPosition{},
- sorted: false,
- }
-}
-
-func (s *symbolPositionSet) String() string {
- if len(s.s) <= 0 {
- return "{}"
- }
- ps := s.sortAndRemoveDuplicates()
- var b strings.Builder
- fmt.Fprintf(&b, "{")
- for i, p := range ps {
- if i <= 0 {
- fmt.Fprintf(&b, "%v", p)
- continue
- }
- fmt.Fprintf(&b, ", %v", p)
- }
- fmt.Fprintf(&b, "}")
- return b.String()
-}
-
-func (s *symbolPositionSet) set() []symbolPosition {
- s.sortAndRemoveDuplicates()
- return s.s
-}
-
-func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet {
- s.s = append(s.s, pos)
- s.sorted = false
- return s
-}
-
-func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet {
- s.s = append(s.s, t.s...)
- s.sorted = false
- return s
-}
-
-func (s *symbolPositionSet) hash() string {
- if len(s.s) <= 0 {
- return ""
- }
- sorted := s.sortAndRemoveDuplicates()
- var buf []byte
- for _, p := range sorted {
- b := make([]byte, 8)
- binary.PutUvarint(b, uint64(p))
- buf = append(buf, b...)
- }
- // Convert to a string to be able to use it as a key of a map.
- // But note this byte sequence is made from values of symbol positions,
- // so this is not a well-formed UTF-8 sequence.
- return string(buf)
-}
-
-func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition {
- if s.sorted {
- return s.s
- }
-
- sortSymbolPositions(s.s, 0, len(s.s)-1)
-
- // Remove duplicates.
- lastV := s.s[0]
- nextIdx := 1
- for _, v := range s.s[1:] {
- if v == lastV {
- continue
- }
- s.s[nextIdx] = v
- nextIdx++
- lastV = v
- }
- s.s = s.s[:nextIdx]
- s.sorted = true
-
- return s.s
-}
-
-// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort.
-func sortSymbolPositions(ps []symbolPosition, left, right int) {
- if left >= right {
- return
- }
- var pivot symbolPosition
- {
- // Use a median as a pivot.
- p1 := ps[left]
- p2 := ps[(left+right)/2]
- p3 := ps[right]
- if p1 > p2 {
- p1, p2 = p2, p1
- }
- if p2 > p3 {
- p2 = p3
- if p1 > p2 {
- p2 = p1
- }
- }
- pivot = p2
- }
- i := left
- j := right
- for i <= j {
- for ps[i] < pivot {
- i++
- }
- for ps[j] > pivot {
- j--
- }
- if i <= j {
- ps[i], ps[j] = ps[j], ps[i]
- i++
- j--
- }
- }
- sortSymbolPositions(ps, left, j)
- sortSymbolPositions(ps, i, right)
-}
diff --git a/src/urubu/grammar/lexical/entry.go b/src/urubu/grammar/lexical/entry.go
deleted file mode 100644
index 44af8ea..0000000
--- a/src/urubu/grammar/lexical/entry.go
+++ /dev/null
@@ -1,171 +0,0 @@
-package lexical
-
-import (
- "fmt"
- "sort"
- "strings"
-
- spec "urubu/spec/grammar"
-)
-
-type LexEntry struct {
- Kind spec.LexKindName
- Pattern string
- Modes []spec.LexModeName
- Push spec.LexModeName
- Pop bool
- Fragment bool
-}
-
-type LexSpec struct {
- Entries []*LexEntry
-}
-
-func (s *LexSpec) Validate() error {
- if len(s.Entries) <= 0 {
- return fmt.Errorf("the lexical specification must have at least one entry")
- }
- {
- ks := map[string]struct{}{}
- fks := map[string]struct{}{}
- for _, e := range s.Entries {
- // Allow duplicate names between fragments and non-fragments.
- if e.Fragment {
- if _, exist := fks[e.Kind.String()]; exist {
- return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
- }
- fks[e.Kind.String()] = struct{}{}
- } else {
- if _, exist := ks[e.Kind.String()]; exist {
- return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
- }
- ks[e.Kind.String()] = struct{}{}
- }
- }
- }
- {
- kinds := []string{}
- modes := []string{
- spec.LexModeNameDefault.String(), // This is a predefined mode.
- }
- for _, e := range s.Entries {
- if e.Fragment {
- continue
- }
-
- kinds = append(kinds, e.Kind.String())
-
- for _, m := range e.Modes {
- modes = append(modes, m.String())
- }
- }
-
- kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
- modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
- if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
- var b strings.Builder
- fmt.Fprintf(&b, "%+v", ids[0])
- for _, id := range ids[1:] {
- fmt.Fprintf(&b, ", %+v", id)
- }
- return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
- }
- return nil
- })
- errs := append(kindErrs, modeErrs...)
- if len(errs) > 0 {
- var b strings.Builder
- fmt.Fprintf(&b, "%v", errs[0])
- for _, err := range errs[1:] {
- fmt.Fprintf(&b, "\n%v", err)
- }
- return fmt.Errorf(b.String())
- }
- }
-
- return nil
-}
-
-func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
- duplicated := FindSpellingInconsistencies(ids)
- if len(duplicated) == 0 {
- return nil
- }
-
- var errs []error
- for _, dup := range duplicated {
- if hook != nil {
- err := hook(dup)
- if err != nil {
- errs = append(errs, err)
- continue
- }
- }
-
- var b strings.Builder
- fmt.Fprintf(&b, "%+v", dup[0])
- for _, id := range dup[1:] {
- fmt.Fprintf(&b, ", %+v", id)
- }
- err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
- errs = append(errs, err)
- }
-
- return errs
-}
-
-// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
-// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
-// in UpperCamelCase. Thus they are considere to be spelling inconsistency.
-func FindSpellingInconsistencies(ids []string) [][]string {
- m := map[string][]string{}
- for _, id := range removeDuplicates(ids) {
- c := SnakeCaseToUpperCamelCase(id)
- m[c] = append(m[c], id)
- }
-
- var duplicated [][]string
- for _, camels := range m {
- if len(camels) == 1 {
- continue
- }
- duplicated = append(duplicated, camels)
- }
-
- for _, dup := range duplicated {
- sort.Slice(dup, func(i, j int) bool {
- return dup[i] < dup[j]
- })
- }
- sort.Slice(duplicated, func(i, j int) bool {
- return duplicated[i][0] < duplicated[j][0]
- })
-
- return duplicated
-}
-
-func removeDuplicates(s []string) []string {
- m := map[string]struct{}{}
- for _, v := range s {
- m[v] = struct{}{}
- }
-
- var unique []string
- for v := range m {
- unique = append(unique, v)
- }
-
- return unique
-}
-
-func SnakeCaseToUpperCamelCase(snake string) string {
- elems := strings.Split(snake, "_")
- for i, e := range elems {
- if len(e) == 0 {
- continue
- }
- elems[i] = strings.ToUpper(string(e[0])) + e[1:]
- }
-
- return strings.Join(elems, "")
-}
diff --git a/src/urubu/grammar/lexical/parser.go b/src/urubu/grammar/lexical/parser.go
new file mode 100644
index 0000000..748e8fe
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser.go
@@ -0,0 +1,1668 @@
+package parser
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io"
+ "sort"
+ "strconv"
+ "strings"
+
+ spec "urubu/spec/grammar"
+ "urubu/ucd"
+)
+
+var (
+ ParseErr = fmt.Errorf("parse error")
+
+ // lexical errors
+ synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
+ synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
+ synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
+ synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
+ SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
+
+ // syntax errors
+ synErrUnexpectedToken = fmt.Errorf("unexpected token")
+ synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
+ synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
+ synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
+ synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
+ synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
+ synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
+ synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
+ synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
+ synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
+ synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
+ synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
+ synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
+ synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
+ synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
+ synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
+ synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
+ synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
+ synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
+ synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
+)
+
+type incompleteFragment struct {
+ kind spec.LexKindName
+ root *rootNode
+}
+
+func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
+ if len(fragments) == 0 {
+ return nil
+ }
+
+ completeFragments := map[spec.LexKindName]CPTree{}
+ incompleteFragments := []*incompleteFragment{}
+ for kind, tree := range fragments {
+ root, ok := tree.(*rootNode)
+ if !ok {
+ return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
+ }
+ if root.incomplete() {
+ incompleteFragments = append(incompleteFragments, &incompleteFragment{
+ kind: kind,
+ root: root,
+ })
+ } else {
+ completeFragments[kind] = root
+ }
+ }
+ for len(incompleteFragments) > 0 {
+ lastIncompCount := len(incompleteFragments)
+ remainingFragments := []*incompleteFragment{}
+ for _, e := range incompleteFragments {
+ complete, err := ApplyFragments(e.root, completeFragments)
+ if err != nil {
+ return err
+ }
+ if !complete {
+ remainingFragments = append(remainingFragments, e)
+ } else {
+ completeFragments[e.kind] = e.root
+ }
+ }
+ incompleteFragments = remainingFragments
+ if len(incompleteFragments) == lastIncompCount {
+ return ParseErr
+ }
+ }
+
+ return nil
+}
+
+func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
+ root, ok := t.(*rootNode)
+ if !ok {
+ return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
+ }
+
+ for name, frag := range fragments {
+ err := root.applyFragment(name, frag)
+ if err != nil {
+ return false, err
+ }
+ }
+
+ return !root.incomplete(), nil
+}
+
+type tokenKind string
+
+const (
+ tokenKindChar tokenKind = "char"
+ tokenKindAnyChar tokenKind = "."
+ tokenKindRepeat tokenKind = "*"
+ tokenKindRepeatOneOrMore tokenKind = "+"
+ tokenKindOption tokenKind = "?"
+ tokenKindAlt tokenKind = "|"
+ tokenKindGroupOpen tokenKind = "("
+ tokenKindGroupClose tokenKind = ")"
+ tokenKindBExpOpen tokenKind = "["
+ tokenKindInverseBExpOpen tokenKind = "[^"
+ tokenKindBExpClose tokenKind = "]"
+ tokenKindCharRange tokenKind = "-"
+ tokenKindCodePointLeader tokenKind = "\\u"
+ tokenKindCharPropLeader tokenKind = "\\p"
+ tokenKindFragmentLeader tokenKind = "\\f"
+ tokenKindLBrace tokenKind = "{"
+ tokenKindRBrace tokenKind = "}"
+ tokenKindEqual tokenKind = "="
+ tokenKindCodePoint tokenKind = "code point"
+ tokenKindCharPropSymbol tokenKind = "character property symbol"
+ tokenKindFragmentSymbol tokenKind = "fragment symbol"
+ tokenKindEOF tokenKind = "eof"
+)
+
+type token struct {
+ kind tokenKind
+ char rune
+ propSymbol string
+ codePoint string
+ fragmentSymbol string
+}
+
+const nullChar = '\u0000'
+
+func newToken(kind tokenKind, char rune) *token {
+ return &token{
+ kind: kind,
+ char: char,
+ }
+}
+
+func newCodePointToken(codePoint string) *token {
+ return &token{
+ kind: tokenKindCodePoint,
+ codePoint: codePoint,
+ }
+}
+
+func newCharPropSymbolToken(propSymbol string) *token {
+ return &token{
+ kind: tokenKindCharPropSymbol,
+ propSymbol: propSymbol,
+ }
+}
+
+func newFragmentSymbolToken(fragmentSymbol string) *token {
+ return &token{
+ kind: tokenKindFragmentSymbol,
+ fragmentSymbol: fragmentSymbol,
+ }
+}
+
+type lexerMode string
+
+const (
+ lexerModeDefault lexerMode = "default"
+ lexerModeBExp lexerMode = "bracket expression"
+ lexerModeCPExp lexerMode = "code point expression"
+ lexerModeCharPropExp lexerMode = "character property expression"
+ lexerModeFragmentExp lexerMode = "fragment expression"
+)
+
+type lexerModeStack struct {
+ stack []lexerMode
+}
+
+func newLexerModeStack() *lexerModeStack {
+ return &lexerModeStack{
+ stack: []lexerMode{
+ lexerModeDefault,
+ },
+ }
+}
+
+func (s *lexerModeStack) top() lexerMode {
+ return s.stack[len(s.stack)-1]
+}
+
+func (s *lexerModeStack) push(m lexerMode) {
+ s.stack = append(s.stack, m)
+}
+
+func (s *lexerModeStack) pop() {
+ s.stack = s.stack[:len(s.stack)-1]
+}
+
+type rangeState string
+
+// [a-z]
+// ^^^^
+// |||`-- ready
+// ||`-- expect range terminator
+// |`-- read range initiator
+// `-- ready
+const (
+ rangeStateReady rangeState = "ready"
+ rangeStateReadRangeInitiator rangeState = "read range initiator"
+ rangeStateExpectRangeTerminator rangeState = "expect range terminator"
+)
+
+type lexer struct {
+ src *bufio.Reader
+ peekChar2 rune
+ peekEOF2 bool
+ peekChar1 rune
+ peekEOF1 bool
+ lastChar rune
+ reachedEOF bool
+ prevChar1 rune
+ prevEOF1 bool
+ prevChar2 rune
+ pervEOF2 bool
+ modeStack *lexerModeStack
+ rangeState rangeState
+
+ errCause error
+ errDetail string
+}
+
+func newLexer(src io.Reader) *lexer {
+ return &lexer{
+ src: bufio.NewReader(src),
+ peekChar2: nullChar,
+ peekEOF2: false,
+ peekChar1: nullChar,
+ peekEOF1: false,
+ lastChar: nullChar,
+ reachedEOF: false,
+ prevChar1: nullChar,
+ prevEOF1: false,
+ prevChar2: nullChar,
+ pervEOF2: false,
+ modeStack: newLexerModeStack(),
+ rangeState: rangeStateReady,
+ }
+}
+
+func (l *lexer) error() (string, error) {
+ return l.errDetail, l.errCause
+}
+
+func (l *lexer) next() (*token, error) {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return newToken(tokenKindEOF, nullChar), nil
+ }
+
+ switch l.modeStack.top() {
+ case lexerModeBExp:
+ tok, err := l.nextInBExp(c)
+ if err != nil {
+ return nil, err
+ }
+ if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
+ switch l.rangeState {
+ case rangeStateReady:
+ l.rangeState = rangeStateReadRangeInitiator
+ case rangeStateExpectRangeTerminator:
+ l.rangeState = rangeStateReady
+ }
+ }
+ switch tok.kind {
+ case tokenKindBExpClose:
+ l.modeStack.pop()
+ case tokenKindCharRange:
+ l.rangeState = rangeStateExpectRangeTerminator
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ }
+ return tok, nil
+ case lexerModeCPExp:
+ tok, err := l.nextInCodePoint(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeCharPropExp:
+ tok, err := l.nextInCharProp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeFragmentExp:
+ tok, err := l.nextInFragment(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ default:
+ tok, err := l.nextInDefault(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindInverseBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ case tokenKindFragmentLeader:
+ l.modeStack.push(lexerModeFragmentExp)
+ }
+ return tok, nil
+ }
+}
+
+func (l *lexer) nextInDefault(c rune) (*token, error) {
+ switch c {
+ case '*':
+ return newToken(tokenKindRepeat, nullChar), nil
+ case '+':
+ return newToken(tokenKindRepeatOneOrMore, nullChar), nil
+ case '?':
+ return newToken(tokenKindOption, nullChar), nil
+ case '.':
+ return newToken(tokenKindAnyChar, nullChar), nil
+ case '|':
+ return newToken(tokenKindAlt, nullChar), nil
+ case '(':
+ return newToken(tokenKindGroupOpen, nullChar), nil
+ case ')':
+ return newToken(tokenKindGroupClose, nullChar), nil
+ case '[':
+ c1, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ if c1 != '^' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ c2, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ if c2 != ']' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ l.errCause = synErrIncompletedEscSeq
+ return nil, ParseErr
+ }
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
+ if c == 'f' {
+ return newToken(tokenKindFragmentLeader, nullChar), nil
+ }
+ if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
+ return newToken(tokenKindChar, c), nil
+ }
+ l.errCause = synErrInvalidEscSeq
+ l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
+ return nil, ParseErr
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
// nextInBExp tokenizes the character c while the lexer is inside a bracket
// expression ([...] or [^...]). '-' is a range operator only immediately
// after a range initiator; otherwise it is an ordinary character.
func (l *lexer) nextInBExp(c rune) (*token, error) {
	switch c {
	case '-':
		// '-' acts as a range operator only right after a potential
		// range-start character (tracked via rangeState).
		if l.rangeState != rangeStateReadRangeInitiator {
			return newToken(tokenKindChar, c), nil
		}
		// Peek one character: a trailing "-]" means the '-' is literal.
		c1, eof, err := l.read()
		if err != nil {
			return nil, err
		}
		if eof {
			err := l.restore()
			if err != nil {
				return nil, err
			}
			return newToken(tokenKindChar, c), nil
		}
		if c1 != ']' {
			err := l.restore()
			if err != nil {
				return nil, err
			}
			return newToken(tokenKindCharRange, nullChar), nil
		}
		err = l.restore()
		if err != nil {
			return nil, err
		}
		return newToken(tokenKindChar, c), nil
	case ']':
		return newToken(tokenKindBExpClose, nullChar), nil
	case '\\':
		// Escape sequences permitted inside a bracket expression.
		c, eof, err := l.read()
		if err != nil {
			return nil, err
		}
		if eof {
			l.errCause = synErrIncompletedEscSeq
			return nil, ParseErr
		}
		if c == 'u' {
			return newToken(tokenKindCodePointLeader, nullChar), nil
		}
		if c == 'p' {
			return newToken(tokenKindCharPropLeader, nullChar), nil
		}
		if c == '\\' || c == '^' || c == '-' || c == ']' {
			return newToken(tokenKindChar, c), nil
		}
		l.errCause = synErrInvalidEscSeq
		l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
		return nil, ParseErr
	default:
		return newToken(tokenKindChar, c), nil
	}
}
+
// nextInCodePoint tokenizes characters inside a code point expression
// (\u{...}). Between the braces it accepts only hex digits, collecting at
// most six; the finished sequence must be exactly 4 or 6 digits long.
func (l *lexer) nextInCodePoint(c rune) (*token, error) {
	switch c {
	case '{':
		return newToken(tokenKindLBrace, nullChar), nil
	case '}':
		return newToken(tokenKindRBrace, nullChar), nil
	default:
		if !isHexDigit(c) {
			l.errCause = synErrInvalidCodePoint
			return nil, ParseErr
		}
		var b strings.Builder
		fmt.Fprint(&b, string(c))
		n := 1
		for {
			c, eof, err := l.read()
			if err != nil {
				return nil, err
			}
			if eof {
				err := l.restore()
				if err != nil {
					return nil, err
				}
				break
			}
			if c == '}' {
				// Push the brace back; the next call emits RBrace.
				err := l.restore()
				if err != nil {
					return nil, err
				}
				break
			}
			// n >= 6 rejects a seventh digit.
			if !isHexDigit(c) || n >= 6 {
				l.errCause = synErrInvalidCodePoint
				return nil, ParseErr
			}
			fmt.Fprint(&b, string(c))
			n++
		}
		cp := b.String()
		cpLen := len(cp)
		if !(cpLen == 4 || cpLen == 6) {
			l.errCause = synErrInvalidCodePoint
			return nil, ParseErr
		}
		return newCodePointToken(b.String()), nil
	}
}
+
// isHexDigit reports whether c is a hexadecimal digit (0-9, A-F, or a-f).
//
// The previous implementation accepted every ASCII letter (A-Z and a-z).
// A non-hex letter such as 'G' then slipped through the lexer, and
// strconv.ParseInt(..., 16, 64) failed later, turning what should be a
// syntax error (synErrInvalidCodePoint) into a panic in parseCodePoint.
func isHexDigit(c rune) bool {
	return c >= '0' && c <= '9' || c >= 'A' && c <= 'F' || c >= 'a' && c <= 'f'
}
+
// nextInCharProp tokenizes characters inside a character property expression
// (\p{...}). Outside the punctuation it accumulates a property symbol until
// '}' or '=', then trims surrounding whitespace.
func (l *lexer) nextInCharProp(c rune) (*token, error) {
	switch c {
	case '{':
		return newToken(tokenKindLBrace, nullChar), nil
	case '}':
		return newToken(tokenKindRBrace, nullChar), nil
	case '=':
		return newToken(tokenKindEqual, nullChar), nil
	default:
		var b strings.Builder
		fmt.Fprint(&b, string(c))
		n := 1
		for {
			c, eof, err := l.read()
			if err != nil {
				return nil, err
			}
			if eof {
				err := l.restore()
				if err != nil {
					return nil, err
				}
				break
			}
			if c == '}' || c == '=' {
				// Push the terminator back for the next call.
				err := l.restore()
				if err != nil {
					return nil, err
				}
				break
			}
			fmt.Fprint(&b, string(c))
			n++
		}
		sym := strings.TrimSpace(b.String())
		if len(sym) == 0 {
			l.errCause = synErrCharPropInvalidSymbol
			return nil, ParseErr
		}
		return newCharPropSymbolToken(sym), nil
	}
}
+
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+ switch c {
+ case '{':
+ return newToken(tokenKindLBrace, nullChar), nil
+ case '}':
+ return newToken(tokenKindRBrace, nullChar), nil
+ default:
+ var b strings.Builder
+ fmt.Fprint(&b, string(c))
+ n := 1
+ for {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if c == '}' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ fmt.Fprint(&b, string(c))
+ n++
+ }
+ sym := strings.TrimSpace(b.String())
+ if len(sym) == 0 {
+ l.errCause = SynErrFragmentInvalidSymbol
+ return nil, ParseErr
+ }
+ return newFragmentSymbolToken(sym), nil
+ }
+}
+
// read returns the next character of the source along with an EOF flag.
// It maintains a two-slot pushback queue (peekChar1/peekChar2), refilled by
// restore(), and a two-slot history (prevChar1/prevChar2) so that up to two
// reads can be undone. Once EOF has been reached, further reads keep
// returning the same (lastChar, true) pair.
//
// NOTE(review): the field name pervEOF2 (declared elsewhere in this file)
// looks like a typo for prevEOF2; it is used consistently here and in
// restore(), so behavior is unaffected.
func (l *lexer) read() (rune, bool, error) {
	if l.reachedEOF {
		return l.lastChar, l.reachedEOF, nil
	}
	if l.peekChar1 != nullChar || l.peekEOF1 {
		// Serve a character pushed back by restore(): shift the
		// history and the peek queue one slot each.
		l.prevChar2 = l.prevChar1
		l.pervEOF2 = l.prevEOF1
		l.prevChar1 = l.lastChar
		l.prevEOF1 = l.reachedEOF
		l.lastChar = l.peekChar1
		l.reachedEOF = l.peekEOF1
		l.peekChar1 = l.peekChar2
		l.peekEOF1 = l.peekEOF2
		l.peekChar2 = nullChar
		l.peekEOF2 = false
		return l.lastChar, l.reachedEOF, nil
	}
	c, _, err := l.src.ReadRune()
	if err != nil {
		if err == io.EOF {
			// Record EOF as a readable "character" so it can be
			// restored like any other.
			l.prevChar2 = l.prevChar1
			l.pervEOF2 = l.prevEOF1
			l.prevChar1 = l.lastChar
			l.prevEOF1 = l.reachedEOF
			l.lastChar = nullChar
			l.reachedEOF = true
			return l.lastChar, l.reachedEOF, nil
		}
		return nullChar, false, err
	}
	l.prevChar2 = l.prevChar1
	l.pervEOF2 = l.prevEOF1
	l.prevChar1 = l.lastChar
	l.prevEOF1 = l.reachedEOF
	l.lastChar = c
	l.reachedEOF = false
	return l.lastChar, l.reachedEOF, nil
}
+
// restore pushes the most recently read character (or the EOF marker) back
// so the next read() returns it again. At most two consecutive restores are
// supported, matching the two-slot peek and history buffers in read().
func (l *lexer) restore() error {
	if l.lastChar == nullChar && !l.reachedEOF {
		return fmt.Errorf("failed to call restore() because the last character is null")
	}
	l.peekChar2 = l.peekChar1
	l.peekEOF2 = l.peekEOF1
	l.peekChar1 = l.lastChar
	l.peekEOF1 = l.reachedEOF
	l.lastChar = l.prevChar1
	l.reachedEOF = l.prevEOF1
	l.prevChar1 = l.prevChar2
	l.prevEOF1 = l.pervEOF2
	l.prevChar2 = nullChar
	l.pervEOF2 = false
	return nil
}
+
// PatternEntry pairs a lexical pattern (its source bytes) with the
// mode-local kind ID it produces when matched.
type PatternEntry struct {
	ID      spec.LexModeKindID
	Pattern []byte
}
+
// parser builds a CPTree for a single lexical pattern via recursive descent
// over the tokens produced by lex.
type parser struct {
	kind spec.LexKindName // lexical kind this pattern belongs to
	lex  *lexer
	// peekedTok holds a token pushed back by a failed consume; lastTok is
	// the token accepted by the most recent successful consume.
	peekedTok *token
	lastTok   *token

	// If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
	// appear in property expressions.
	//
	// The contributory properties are not exposed, and users cannot use those properties because the parser
	// follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
	//
	// isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
	// interpret derived properties internally because the derived properties consist of other properties that
	// may contain the contributory properties.
	//
	// [UAX #44 5.13 Property APIs] says:
	// > The following subtypes of Unicode character properties should generally not be exposed in APIs,
	// > except in limited circumstances. They may not be useful, particularly in public API collections,
	// > and may instead prove misleading to the users of such API collections.
	// > * Contributory properties are not recommended for public APIs.
	// > ...
	// https://unicode.org/reports/tr44/#Property_APIs
	isContributoryPropertyExposed bool

	// errCause and errDetail record why the last parse failed; reported
	// through Error().
	errCause  error
	errDetail string
}
+
// NewParser creates a parser that reads the pattern for the lexical kind
// `kind` from src. Contributory Unicode properties are hidden by default;
// see the isContributoryPropertyExposed field documentation.
func NewParser(kind spec.LexKindName, src io.Reader) *parser {
	return &parser{
		kind:                          kind,
		lex:                           newLexer(src),
		isContributoryPropertyExposed: false,
	}
}
+
// exposeContributoryProperty lets this parser accept contributory Unicode
// properties; only used for parsers spawned internally by parseCharProp.
func (p *parser) exposeContributoryProperty() {
	p.isContributoryPropertyExposed = true
}

// Error returns the detail message and cause of the last parse failure.
func (p *parser) Error() (string, error) {
	return p.errDetail, p.errCause
}
+
// Parse compiles the whole pattern into a CPTree whose root records the
// lexical kind and any fragment references found in the tree.
//
// Syntax errors travel upward as panic(ParseErr) (see raiseParseError); the
// deferred recover converts any error-valued panic into the returned error
// and re-raises everything else.
func (p *parser) Parse() (root CPTree, retErr error) {
	defer func() {
		err := recover()
		if err != nil {
			var ok bool
			retErr, ok = err.(error)
			if !ok {
				panic(err)
			}
			return
		}
	}()

	return newRootNode(p.kind, p.parseRegexp()), nil
}
+
+func (p *parser) parseRegexp() CPTree {
+ alt := p.parseAlt()
+ if alt == nil {
+ if p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupNoInitiator, "")
+ }
+ p.raiseParseError(synErrNullPattern, "")
+ }
+ if p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupNoInitiator, "")
+ }
+ p.expect(tokenKindEOF)
+ return alt
+}
+
+func (p *parser) parseAlt() CPTree {
+ left := p.parseConcat()
+ if left == nil {
+ if p.consume(tokenKindAlt) {
+ p.raiseParseError(synErrAltLackOfOperand, "")
+ }
+ return nil
+ }
+ for {
+ if !p.consume(tokenKindAlt) {
+ break
+ }
+ right := p.parseConcat()
+ if right == nil {
+ p.raiseParseError(synErrAltLackOfOperand, "")
+ }
+ left = newAltNode(left, right)
+ }
+ return left
+}
+
+func (p *parser) parseConcat() CPTree {
+ left := p.parseRepeat()
+ for {
+ right := p.parseRepeat()
+ if right == nil {
+ break
+ }
+ left = newConcatNode(left, right)
+ }
+ return left
+}
+
+func (p *parser) parseRepeat() CPTree {
+ group := p.parseGroup()
+ if group == nil {
+ if p.consume(tokenKindRepeat) {
+ p.raiseParseError(synErrRepNoTarget, "* needs an operand")
+ }
+ if p.consume(tokenKindRepeatOneOrMore) {
+ p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
+ }
+ if p.consume(tokenKindOption) {
+ p.raiseParseError(synErrRepNoTarget, "? needs an operand")
+ }
+ return nil
+ }
+ if p.consume(tokenKindRepeat) {
+ return newRepeatNode(group)
+ }
+ if p.consume(tokenKindRepeatOneOrMore) {
+ return newRepeatOneOrMoreNode(group)
+ }
+ if p.consume(tokenKindOption) {
+ return newOptionNode(group)
+ }
+ return group
+}
+
+func (p *parser) parseGroup() CPTree {
+ if p.consume(tokenKindGroupOpen) {
+ alt := p.parseAlt()
+ if alt == nil {
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrGroupUnclosed, "")
+ }
+ p.raiseParseError(synErrGroupNoElem, "")
+ }
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrGroupUnclosed, "")
+ }
+ if !p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupInvalidForm, "")
+ }
+ return alt
+ }
+ return p.parseSingleChar()
+}
+
// parseSingleChar parses one character-level expression: '.', a bracket
// expression (plain or inverted), a \u/\p/\f expression, or an ordinary
// character. Returns nil if none of these is present.
func (p *parser) parseSingleChar() CPTree {
	if p.consume(tokenKindAnyChar) {
		return genAnyCharAST()
	}
	if p.consume(tokenKindBExpOpen) {
		// [...]: union of its elements.
		left := p.parseBExpElem()
		if left == nil {
			if p.consume(tokenKindEOF) {
				p.raiseParseError(synErrBExpUnclosed, "")
			}
			p.raiseParseError(synErrBExpNoElem, "")
		}
		for {
			right := p.parseBExpElem()
			if right == nil {
				break
			}
			left = newAltNode(left, right)
		}
		if p.consume(tokenKindEOF) {
			p.raiseParseError(synErrBExpUnclosed, "")
		}
		p.expect(tokenKindBExpClose)
		return left
	}
	if p.consume(tokenKindInverseBExpOpen) {
		// [^...]: start from "any char" and subtract each element.
		elem := p.parseBExpElem()
		if elem == nil {
			if p.consume(tokenKindEOF) {
				p.raiseParseError(synErrBExpUnclosed, "")
			}
			p.raiseParseError(synErrBExpNoElem, "")
		}
		inverse := exclude(elem, genAnyCharAST())
		if inverse == nil {
			p.raiseParseError(synErrUnmatchablePattern, "")
		}
		for {
			elem := p.parseBExpElem()
			if elem == nil {
				break
			}
			inverse = exclude(elem, inverse)
			if inverse == nil {
				p.raiseParseError(synErrUnmatchablePattern, "")
			}
		}
		if p.consume(tokenKindEOF) {
			p.raiseParseError(synErrBExpUnclosed, "")
		}
		p.expect(tokenKindBExpClose)
		return inverse
	}
	if p.consume(tokenKindCodePointLeader) {
		return p.parseCodePoint()
	}
	if p.consume(tokenKindCharPropLeader) {
		return p.parseCharProp()
	}
	if p.consume(tokenKindFragmentLeader) {
		return p.parseFragment()
	}
	c := p.parseNormalChar()
	if c == nil {
		// A ']' with no matching '[' is only detectable here.
		if p.consume(tokenKindBExpClose) {
			p.raiseParseError(synErrBExpInvalidForm, "")
		}
		return nil
	}
	return c
}
+
// parseBExpElem parses one element of a bracket expression: a single
// character (\u, or plain) or a range such as a-z. Property expressions are
// allowed as standalone elements but not as range endpoints.
func (p *parser) parseBExpElem() CPTree {
	var left CPTree
	switch {
	case p.consume(tokenKindCodePointLeader):
		left = p.parseCodePoint()
	case p.consume(tokenKindCharPropLeader):
		left = p.parseCharProp()
		if p.consume(tokenKindCharRange) {
			p.raiseParseError(synErrRangePropIsUnavailable, "")
		}
	default:
		left = p.parseNormalChar()
	}
	if left == nil {
		return nil
	}
	if !p.consume(tokenKindCharRange) {
		return left
	}
	var right CPTree
	switch {
	case p.consume(tokenKindCodePointLeader):
		right = p.parseCodePoint()
	case p.consume(tokenKindCharPropLeader):
		p.raiseParseError(synErrRangePropIsUnavailable, "")
	default:
		right = p.parseNormalChar()
	}
	if right == nil {
		p.raiseParseError(synErrRangeInvalidForm, "")
	}
	// The range spans from the low end of left to the high end of right.
	from, _, _ := left.Range()
	_, to, _ := right.Range()
	if !isValidOrder(from, to) {
		p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
	}
	return newRangeSymbolNode(from, to)
}
+
// parseCodePoint parses the {HHHH} part of a \u expression and returns a
// symbol node for that single code point. The value must lie in
// U+0000..U+10FFFF.
func (p *parser) parseCodePoint() CPTree {
	if !p.consume(tokenKindLBrace) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}
	if !p.consume(tokenKindCodePoint) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}

	// The lexer only emits hex digits here, so a decode failure is a
	// programming error, not a user error.
	n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
	if err != nil {
		panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
	}
	if n < 0x0000 || n > 0x10FFFF {
		p.raiseParseError(synErrCPExpOutOfRange, "")
	}

	sym := newSymbolNode(rune(n))

	if !p.consume(tokenKindRBrace) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}

	return sym
}
+
// parseCharProp parses the {name=value} or {value} part of a \p expression
// and expands it into a tree of code point ranges. Properties that
// normalize to another pattern are parsed recursively by a nested parser
// with contributory properties exposed.
func (p *parser) parseCharProp() CPTree {
	if !p.consume(tokenKindLBrace) {
		p.raiseParseError(synErrCharPropExpInvalidForm, "")
	}
	var sym1, sym2 string
	if !p.consume(tokenKindCharPropSymbol) {
		p.raiseParseError(synErrCharPropExpInvalidForm, "")
	}
	sym1 = p.lastTok.propSymbol
	if p.consume(tokenKindEqual) {
		if !p.consume(tokenKindCharPropSymbol) {
			p.raiseParseError(synErrCharPropExpInvalidForm, "")
		}
		sym2 = p.lastTok.propSymbol
	}

	var alt CPTree
	var propName, propVal string
	// A single symbol is a value of the default property; a pair is
	// name=value.
	if sym2 != "" {
		propName = sym1
		propVal = sym2
	} else {
		propName = ""
		propVal = sym1
	}
	if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
		p.raiseParseError(synErrCharPropUnsupported, propName)
	}
	pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
	if err != nil {
		p.raiseParseError(synErrCharPropUnsupported, err.Error())
	}
	if pat != "" {
		// The property expands to another pattern; parse it with a
		// nested parser (shadows the outer p deliberately).
		p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
		p.exposeContributoryProperty()
		ast, err := p.Parse()
		if err != nil {
			panic(err)
		}
		alt = ast
	} else {
		cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
		if err != nil {
			p.raiseParseError(synErrCharPropUnsupported, err.Error())
		}
		if inverse {
			// Subtract every returned range from "any char".
			r := cpRanges[0]
			alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
			if alt == nil {
				p.raiseParseError(synErrUnmatchablePattern, "")
			}
			for _, r := range cpRanges[1:] {
				alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
				if alt == nil {
					p.raiseParseError(synErrUnmatchablePattern, "")
				}
			}
		} else {
			for _, r := range cpRanges {
				alt = genAltNode(
					alt,
					newRangeSymbolNode(r.From, r.To),
				)
			}
		}
	}

	if !p.consume(tokenKindRBrace) {
		p.raiseParseError(synErrCharPropExpInvalidForm, "")
	}

	return alt
}
+
+func (p *parser) parseFragment() CPTree {
+ if !p.consume(tokenKindLBrace) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+ if !p.consume(tokenKindFragmentSymbol) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+ sym := p.lastTok.fragmentSymbol
+
+ if !p.consume(tokenKindRBrace) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+
+ return newFragmentNode(spec.LexKindName(sym), nil)
+}
+
+func (p *parser) parseNormalChar() CPTree {
+ if !p.consume(tokenKindChar) {
+ return nil
+ }
+ return newSymbolNode(p.lastTok.char)
+}
+
// exclude returns base with every code point covered by symbol removed.
// Both trees are decomposed into alternatives of ranges; the result is nil
// when nothing of base remains. Panics if either tree contains a node that
// is neither an alternative nor a range.
func exclude(symbol, base CPTree) CPTree {
	if left, right, ok := symbol.Alternatives(); ok {
		// Subtract each alternative of symbol in turn.
		return exclude(right, exclude(left, base))
	}

	if left, right, ok := base.Alternatives(); ok {
		// Distribute the subtraction over base's alternatives.
		return genAltNode(
			exclude(symbol, left),
			exclude(symbol, right),
		)
	}

	if bFrom, bTo, ok := base.Range(); ok {
		sFrom, sTo, ok := symbol.Range()
		if !ok {
			panic(fmt.Errorf("invalid symbol tree: %T", symbol))
		}

		switch {
		case sFrom > bFrom && sTo < bTo:
			// symbol lies strictly inside base: split base in two.
			return genAltNode(
				newRangeSymbolNode(bFrom, sFrom-1),
				newRangeSymbolNode(sTo+1, bTo),
			)
		case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
			// symbol clips base's low end.
			return newRangeSymbolNode(sTo+1, bTo)
		case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
			// symbol clips base's high end.
			return newRangeSymbolNode(bFrom, sFrom-1)
		case sFrom <= bFrom && sTo >= bTo:
			// symbol covers base entirely.
			return nil
		default:
			// No overlap.
			return base
		}
	}

	panic(fmt.Errorf("invalid base tree: %T", base))
}
+
+func genAnyCharAST() CPTree {
+ return newRangeSymbolNode(0x0, 0x10FFFF)
+}
+
// isValidOrder reports whether a range with the given endpoints is
// well-formed, i.e. the endpoints are non-decreasing.
func isValidOrder(from, to rune) bool {
	if from > to {
		return false
	}
	return true
}
+
+func genConcatNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ concat = newConcatNode(concat, c)
+ }
+ return concat
+}
+
+func genAltNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ alt = newAltNode(alt, c)
+ }
+ return alt
+}
+
// expect consumes the next token and raises a syntax error when its kind is
// not the expected one. On failure the offending token is available in
// peekedTok because consume pushed it back.
func (p *parser) expect(expected tokenKind) {
	if !p.consume(expected) {
		tok := p.peekedTok
		p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
	}
}
+
// consume takes the next token (reusing a previously pushed-back one if
// present) and reports whether it has the expected kind. On a match the
// token is recorded in lastTok; otherwise it is stored in peekedTok so a
// later call can match it against a different kind.
func (p *parser) consume(expected tokenKind) bool {
	var tok *token
	var err error
	if p.peekedTok != nil {
		tok = p.peekedTok
		p.peekedTok = nil
	} else {
		tok, err = p.lex.next()
		if err != nil {
			// Lexical ParseErr carries detail on the lexer; anything
			// else is unexpected and propagates as a raw panic.
			if err == ParseErr {
				detail, cause := p.lex.error()
				p.raiseParseError(cause, detail)
			}
			panic(err)
		}
	}
	p.lastTok = tok
	if tok.kind == expected {
		return true
	}
	p.peekedTok = tok
	p.lastTok = nil

	return false
}
+
// raiseParseError records the error cause and detail on the parser, then
// unwinds to Parse via panic(ParseErr); Parse's deferred recover turns the
// panic into a returned error.
func (p *parser) raiseParseError(err error, detail string) {
	p.errCause = err
	p.errDetail = detail
	panic(ParseErr)
}
+
// CPRange is an inclusive range of Unicode code points.
type CPRange struct {
	From rune
	To   rune
}

// CPTree is a code-point tree: the parsed representation of a lexical
// pattern. Each accessor returns false (in its last result) when the node
// is not of that shape.
type CPTree interface {
	fmt.Stringer
	Range() (rune, rune, bool)
	Optional() (CPTree, bool)
	Repeatable() (CPTree, bool)
	Concatenation() (CPTree, CPTree, bool)
	Alternatives() (CPTree, CPTree, bool)
	Describe() (spec.LexKindName, []spec.LexKindName, error)

	children() (CPTree, CPTree)
	clone() CPTree
}

// Compile-time checks that every node type satisfies CPTree.
var (
	_ CPTree = &rootNode{}
	_ CPTree = &symbolNode{}
	_ CPTree = &concatNode{}
	_ CPTree = &altNode{}
	_ CPTree = &quantifierNode{}
	_ CPTree = &fragmentNode{}
)
+
// rootNode wraps a parsed pattern with its lexical kind and an index of the
// fragment references (\f{...}) still unresolved in the tree.
type rootNode struct {
	kind      spec.LexKindName
	tree      CPTree
	fragments map[spec.LexKindName][]*fragmentNode
}
+
+func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
+ fragments := map[spec.LexKindName][]*fragmentNode{}
+ collectFragments(t, fragments)
+
+ return &rootNode{
+ kind: kind,
+ tree: t,
+ fragments: fragments,
+ }
+}
+
+func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
+ if n == nil {
+ return
+ }
+
+ if f, ok := n.(*fragmentNode); ok {
+ fragments[f.kind] = append(fragments[f.kind], f)
+ return
+ }
+
+ l, r := n.children()
+ collectFragments(l, fragments)
+ collectFragments(r, fragments)
+}
+
func (n *rootNode) String() string {
	return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
}

// The shape accessors all delegate to the wrapped tree.

func (n *rootNode) Range() (rune, rune, bool) {
	return n.tree.Range()
}

func (n *rootNode) Optional() (CPTree, bool) {
	return n.tree.Optional()
}

func (n *rootNode) Repeatable() (CPTree, bool) {
	return n.tree.Repeatable()
}

func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
	return n.tree.Concatenation()
}

func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
	return n.tree.Alternatives()
}

// Describe returns the pattern's lexical kind and the sorted kind names of
// its unresolved fragment references.
func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	var frags []spec.LexKindName
	for f := range n.fragments {
		frags = append(frags, spec.LexKindName(f))
	}
	sort.Slice(frags, func(i, j int) bool {
		return frags[i] < frags[j]
	})

	return n.kind, frags, nil
}

func (n *rootNode) children() (CPTree, CPTree) {
	return n.tree.children()
}

// clone copies only the wrapped tree; the root wrapper (kind and fragment
// index) is deliberately dropped.
func (n *rootNode) clone() CPTree {
	return n.tree.clone()
}

// incomplete reports whether any fragment references remain unresolved.
func (n *rootNode) incomplete() bool {
	return len(n.fragments) > 0
}

// applyFragment substitutes a clone of fragment's tree into every fragment
// node of the given kind. The fragment itself must be a fully resolved
// *rootNode; unknown kinds are silently ignored.
func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
	root, ok := fragment.(*rootNode)
	if !ok {
		return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
	}
	if root.incomplete() {
		return fmt.Errorf("fragment is incomplete")
	}

	fs, ok := n.fragments[kind]
	if !ok {
		return nil
	}
	for _, f := range fs {
		f.tree = root.clone()
	}
	delete(n.fragments, kind)

	return nil
}
+
// symbolNode is a leaf matching one inclusive range of code points.
type symbolNode struct {
	CPRange
}

// newSymbolNode returns a leaf matching exactly the code point cp.
func newSymbolNode(cp rune) *symbolNode {
	return &symbolNode{
		CPRange: CPRange{
			From: cp,
			To:   cp,
		},
	}
}

// newRangeSymbolNode returns a leaf matching the inclusive range from..to.
func newRangeSymbolNode(from, to rune) *symbolNode {
	return &symbolNode{
		CPRange: CPRange{
			From: from,
			To:   to,
		},
	}
}

func (n *symbolNode) String() string {
	return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
}

func (n *symbolNode) Range() (rune, rune, bool) {
	return n.From, n.To, true
}

func (n *symbolNode) Optional() (CPTree, bool) {
	return nil, false
}

func (n *symbolNode) Repeatable() (CPTree, bool) {
	return nil, false
}

func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
}

// children returns (nil, nil): a symbol node is a leaf.
func (n *symbolNode) children() (CPTree, CPTree) {
	return nil, nil
}

func (n *symbolNode) clone() CPTree {
	return newRangeSymbolNode(n.From, n.To)
}
+
// concatNode matches its left subtree followed by its right subtree.
type concatNode struct {
	left  CPTree
	right CPTree
}

func newConcatNode(left, right CPTree) *concatNode {
	return &concatNode{
		left:  left,
		right: right,
	}
}

func (n *concatNode) String() string {
	return "concat"
}

func (n *concatNode) Range() (rune, rune, bool) {
	return 0, 0, false
}

func (n *concatNode) Optional() (CPTree, bool) {
	return nil, false
}

func (n *concatNode) Repeatable() (CPTree, bool) {
	return nil, false
}

func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
	return n.left, n.right, true
}

func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
}

func (n *concatNode) children() (CPTree, CPTree) {
	return n.left, n.right
}

// clone deep-copies the node. Unlike the other node types it guards against
// a nil receiver.
func (n *concatNode) clone() CPTree {
	if n == nil {
		return nil
	}
	return newConcatNode(n.left.clone(), n.right.clone())
}
+
// altNode matches either its left subtree or its right subtree.
type altNode struct {
	left  CPTree
	right CPTree
}

func newAltNode(left, right CPTree) *altNode {
	return &altNode{
		left:  left,
		right: right,
	}
}

func (n *altNode) String() string {
	return "alt"
}

func (n *altNode) Range() (rune, rune, bool) {
	return 0, 0, false
}

func (n *altNode) Optional() (CPTree, bool) {
	return nil, false
}

func (n *altNode) Repeatable() (CPTree, bool) {
	return nil, false
}

func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
	return n.left, n.right, true
}

func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
}

func (n *altNode) children() (CPTree, CPTree) {
	return n.left, n.right
}

func (n *altNode) clone() CPTree {
	return newAltNode(n.left.clone(), n.right.clone())
}
+
// quantifierNode wraps a subtree with a '?' (optional) or '*' (repeatable)
// quantifier; exactly one of the two flags is set by the constructors.
type quantifierNode struct {
	optional   bool
	repeatable bool
	tree       CPTree
}

func (n *quantifierNode) String() string {
	switch {
	case n.repeatable:
		return "repeatable (>= 0 times)"
	case n.optional:
		return "optional (0 or 1 times)"
	default:
		return "invalid quantifier"
	}
}

// newRepeatNode wraps t as t* (zero or more).
func newRepeatNode(t CPTree) *quantifierNode {
	return &quantifierNode{
		repeatable: true,
		tree:       t,
	}
}

// newRepeatOneOrMoreNode desugars t+ into the concatenation t . t*,
// cloning t for the repeated half.
func newRepeatOneOrMoreNode(t CPTree) *concatNode {
	return newConcatNode(
		t,
		&quantifierNode{
			repeatable: true,
			tree:       t.clone(),
		})
}

// newOptionNode wraps t as t? (zero or one).
func newOptionNode(t CPTree) *quantifierNode {
	return &quantifierNode{
		optional: true,
		tree:     t,
	}
}

func (n *quantifierNode) Range() (rune, rune, bool) {
	return 0, 0, false
}

func (n *quantifierNode) Optional() (CPTree, bool) {
	return n.tree, n.optional
}

func (n *quantifierNode) Repeatable() (CPTree, bool) {
	return n.tree, n.repeatable
}

func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
	return nil, nil, false
}

func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
}

// children returns the single wrapped subtree in the left slot.
func (n *quantifierNode) children() (CPTree, CPTree) {
	return n.tree, nil
}

func (n *quantifierNode) clone() CPTree {
	if n.repeatable {
		return newRepeatNode(n.tree.clone())
	}
	return newOptionNode(n.tree.clone())
}
+
// fragmentNode is a reference to a named fragment (\f{kind}); tree is nil
// until applyFragment substitutes the fragment's pattern in.
type fragmentNode struct {
	kind spec.LexKindName
	tree CPTree
}

func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
	return &fragmentNode{
		kind: kind,
		tree: t,
	}
}

func (n *fragmentNode) String() string {
	return fmt.Sprintf("fragment: %v", n.kind)
}

// The shape accessors delegate to the substituted tree; calling them on an
// unresolved fragment (nil tree) would dereference nil.

func (n *fragmentNode) Range() (rune, rune, bool) {
	return n.tree.Range()
}

func (n *fragmentNode) Optional() (CPTree, bool) {
	return n.tree.Optional()
}

func (n *fragmentNode) Repeatable() (CPTree, bool) {
	return n.tree.Repeatable()
}

func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
	return n.tree.Concatenation()
}

func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
	return n.tree.Alternatives()
}

func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
}

func (n *fragmentNode) children() (CPTree, CPTree) {
	return n.tree.children()
}

// clone keeps the reference semantics: an unresolved fragment clones to
// another unresolved fragment of the same kind.
func (n *fragmentNode) clone() CPTree {
	if n.tree == nil {
		return newFragmentNode(n.kind, nil)
	}
	return newFragmentNode(n.kind, n.tree.clone())
}
+
+//nolint:unused
+func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
+ if t == nil {
+ return
+ }
+ fmt.Fprintf(w, "%v%v\n", ruledLine, t)
+ children := []CPTree{}
+ switch n := t.(type) {
+ case *rootNode:
+ children = append(children, n.tree)
+ case *fragmentNode:
+ children = append(children, n.tree)
+ default:
+ left, right := t.children()
+ if left != nil {
+ children = append(children, left)
+ }
+ if right != nil {
+ children = append(children, right)
+ }
+ }
+ num := len(children)
+ for i, child := range children {
+ line := "└─ "
+ if num > 1 {
+ if i == 0 {
+ line = "├─ "
+ } else if i < num-1 {
+ line = "│ "
+ }
+ }
+ prefix := "│ "
+ if i >= num-1 {
+ prefix = " "
+ }
+ printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+}
diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go
deleted file mode 100644
index be81da4..0000000
--- a/src/urubu/grammar/lexical/parser/error.go
+++ /dev/null
@@ -1,36 +0,0 @@
-package parser
-
-import "fmt"
-
-var (
- ParseErr = fmt.Errorf("parse error")
-
- // lexical errors
- synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
- synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
- synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
- synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
- SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
-
- // syntax errors
- synErrUnexpectedToken = fmt.Errorf("unexpected token")
- synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
- synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
- synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
- synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
- synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
- synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
- synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
- synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
- synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
- synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
- synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
- synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
- synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
- synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
- synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
- synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
- synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
- synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
- synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
-)
diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go
deleted file mode 100644
index 196c00b..0000000
--- a/src/urubu/grammar/lexical/parser/fragment.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package parser
-
-import (
- "fmt"
-
- spec "urubu/spec/grammar"
-)
-
-type incompleteFragment struct {
- kind spec.LexKindName
- root *rootNode
-}
-
-func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
- if len(fragments) == 0 {
- return nil
- }
-
- completeFragments := map[spec.LexKindName]CPTree{}
- incompleteFragments := []*incompleteFragment{}
- for kind, tree := range fragments {
- root, ok := tree.(*rootNode)
- if !ok {
- return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
- }
- if root.incomplete() {
- incompleteFragments = append(incompleteFragments, &incompleteFragment{
- kind: kind,
- root: root,
- })
- } else {
- completeFragments[kind] = root
- }
- }
- for len(incompleteFragments) > 0 {
- lastIncompCount := len(incompleteFragments)
- remainingFragments := []*incompleteFragment{}
- for _, e := range incompleteFragments {
- complete, err := ApplyFragments(e.root, completeFragments)
- if err != nil {
- return err
- }
- if !complete {
- remainingFragments = append(remainingFragments, e)
- } else {
- completeFragments[e.kind] = e.root
- }
- }
- incompleteFragments = remainingFragments
- if len(incompleteFragments) == lastIncompCount {
- return ParseErr
- }
- }
-
- return nil
-}
-
-func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
- root, ok := t.(*rootNode)
- if !ok {
- return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
- }
-
- for name, frag := range fragments {
- err := root.applyFragment(name, frag)
- if err != nil {
- return false, err
- }
- }
-
- return !root.incomplete(), nil
-}
diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go
deleted file mode 100644
index 3861825..0000000
--- a/src/urubu/grammar/lexical/parser/lexer.go
+++ /dev/null
@@ -1,594 +0,0 @@
-package parser
-
-import (
- "bufio"
- "fmt"
- "io"
- "strings"
-)
-
-type tokenKind string
-
-const (
- tokenKindChar tokenKind = "char"
- tokenKindAnyChar tokenKind = "."
- tokenKindRepeat tokenKind = "*"
- tokenKindRepeatOneOrMore tokenKind = "+"
- tokenKindOption tokenKind = "?"
- tokenKindAlt tokenKind = "|"
- tokenKindGroupOpen tokenKind = "("
- tokenKindGroupClose tokenKind = ")"
- tokenKindBExpOpen tokenKind = "["
- tokenKindInverseBExpOpen tokenKind = "[^"
- tokenKindBExpClose tokenKind = "]"
- tokenKindCharRange tokenKind = "-"
- tokenKindCodePointLeader tokenKind = "\\u"
- tokenKindCharPropLeader tokenKind = "\\p"
- tokenKindFragmentLeader tokenKind = "\\f"
- tokenKindLBrace tokenKind = "{"
- tokenKindRBrace tokenKind = "}"
- tokenKindEqual tokenKind = "="
- tokenKindCodePoint tokenKind = "code point"
- tokenKindCharPropSymbol tokenKind = "character property symbol"
- tokenKindFragmentSymbol tokenKind = "fragment symbol"
- tokenKindEOF tokenKind = "eof"
-)
-
-type token struct {
- kind tokenKind
- char rune
- propSymbol string
- codePoint string
- fragmentSymbol string
-}
-
-const nullChar = '\u0000'
-
-func newToken(kind tokenKind, char rune) *token {
- return &token{
- kind: kind,
- char: char,
- }
-}
-
-func newCodePointToken(codePoint string) *token {
- return &token{
- kind: tokenKindCodePoint,
- codePoint: codePoint,
- }
-}
-
-func newCharPropSymbolToken(propSymbol string) *token {
- return &token{
- kind: tokenKindCharPropSymbol,
- propSymbol: propSymbol,
- }
-}
-
-func newFragmentSymbolToken(fragmentSymbol string) *token {
- return &token{
- kind: tokenKindFragmentSymbol,
- fragmentSymbol: fragmentSymbol,
- }
-}
-
-type lexerMode string
-
-const (
- lexerModeDefault lexerMode = "default"
- lexerModeBExp lexerMode = "bracket expression"
- lexerModeCPExp lexerMode = "code point expression"
- lexerModeCharPropExp lexerMode = "character property expression"
- lexerModeFragmentExp lexerMode = "fragment expression"
-)
-
-type lexerModeStack struct {
- stack []lexerMode
-}
-
-func newLexerModeStack() *lexerModeStack {
- return &lexerModeStack{
- stack: []lexerMode{
- lexerModeDefault,
- },
- }
-}
-
-func (s *lexerModeStack) top() lexerMode {
- return s.stack[len(s.stack)-1]
-}
-
-func (s *lexerModeStack) push(m lexerMode) {
- s.stack = append(s.stack, m)
-}
-
-func (s *lexerModeStack) pop() {
- s.stack = s.stack[:len(s.stack)-1]
-}
-
-type rangeState string
-
-// [a-z]
-// ^^^^
-// |||`-- ready
-// ||`-- expect range terminator
-// |`-- read range initiator
-// `-- ready
-const (
- rangeStateReady rangeState = "ready"
- rangeStateReadRangeInitiator rangeState = "read range initiator"
- rangeStateExpectRangeTerminator rangeState = "expect range terminator"
-)
-
-type lexer struct {
- src *bufio.Reader
- peekChar2 rune
- peekEOF2 bool
- peekChar1 rune
- peekEOF1 bool
- lastChar rune
- reachedEOF bool
- prevChar1 rune
- prevEOF1 bool
- prevChar2 rune
- pervEOF2 bool
- modeStack *lexerModeStack
- rangeState rangeState
-
- errCause error
- errDetail string
-}
-
-func newLexer(src io.Reader) *lexer {
- return &lexer{
- src: bufio.NewReader(src),
- peekChar2: nullChar,
- peekEOF2: false,
- peekChar1: nullChar,
- peekEOF1: false,
- lastChar: nullChar,
- reachedEOF: false,
- prevChar1: nullChar,
- prevEOF1: false,
- prevChar2: nullChar,
- pervEOF2: false,
- modeStack: newLexerModeStack(),
- rangeState: rangeStateReady,
- }
-}
-
-func (l *lexer) error() (string, error) {
- return l.errDetail, l.errCause
-}
-
-func (l *lexer) next() (*token, error) {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return newToken(tokenKindEOF, nullChar), nil
- }
-
- switch l.modeStack.top() {
- case lexerModeBExp:
- tok, err := l.nextInBExp(c)
- if err != nil {
- return nil, err
- }
- if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
- switch l.rangeState {
- case rangeStateReady:
- l.rangeState = rangeStateReadRangeInitiator
- case rangeStateExpectRangeTerminator:
- l.rangeState = rangeStateReady
- }
- }
- switch tok.kind {
- case tokenKindBExpClose:
- l.modeStack.pop()
- case tokenKindCharRange:
- l.rangeState = rangeStateExpectRangeTerminator
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- }
- return tok, nil
- case lexerModeCPExp:
- tok, err := l.nextInCodePoint(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeCharPropExp:
- tok, err := l.nextInCharProp(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeFragmentExp:
- tok, err := l.nextInFragment(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- default:
- tok, err := l.nextInDefault(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindInverseBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- case tokenKindFragmentLeader:
- l.modeStack.push(lexerModeFragmentExp)
- }
- return tok, nil
- }
-}
-
-func (l *lexer) nextInDefault(c rune) (*token, error) {
- switch c {
- case '*':
- return newToken(tokenKindRepeat, nullChar), nil
- case '+':
- return newToken(tokenKindRepeatOneOrMore, nullChar), nil
- case '?':
- return newToken(tokenKindOption, nullChar), nil
- case '.':
- return newToken(tokenKindAnyChar, nullChar), nil
- case '|':
- return newToken(tokenKindAlt, nullChar), nil
- case '(':
- return newToken(tokenKindGroupOpen, nullChar), nil
- case ')':
- return newToken(tokenKindGroupClose, nullChar), nil
- case '[':
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- if c1 != '^' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- c2, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- if c2 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == 'f' {
- return newToken(tokenKindFragmentLeader, nullChar), nil
- }
- if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInBExp(c rune) (*token, error) {
- switch c {
- case '-':
- if l.rangeState != rangeStateReadRangeInitiator {
- return newToken(tokenKindChar, c), nil
- }
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- }
- if c1 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindCharRange, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- case ']':
- return newToken(tokenKindBExpClose, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == '\\' || c == '^' || c == '-' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInCodePoint(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- if !isHexDigit(c) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if !isHexDigit(c) || n >= 6 {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- cp := b.String()
- cpLen := len(cp)
- if !(cpLen == 4 || cpLen == 6) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- return newCodePointToken(b.String()), nil
- }
-}
-
-func isHexDigit(c rune) bool {
- if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' {
- return true
- }
- return false
-}
-
-func (l *lexer) nextInCharProp(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- case '=':
- return newToken(tokenKindEqual, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' || c == '=' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = synErrCharPropInvalidSymbol
- return nil, ParseErr
- }
- return newCharPropSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) nextInFragment(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = SynErrFragmentInvalidSymbol
- return nil, ParseErr
- }
- return newFragmentSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) read() (rune, bool, error) {
- if l.reachedEOF {
- return l.lastChar, l.reachedEOF, nil
- }
- if l.peekChar1 != nullChar || l.peekEOF1 {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = l.peekChar1
- l.reachedEOF = l.peekEOF1
- l.peekChar1 = l.peekChar2
- l.peekEOF1 = l.peekEOF2
- l.peekChar2 = nullChar
- l.peekEOF2 = false
- return l.lastChar, l.reachedEOF, nil
- }
- c, _, err := l.src.ReadRune()
- if err != nil {
- if err == io.EOF {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = nullChar
- l.reachedEOF = true
- return l.lastChar, l.reachedEOF, nil
- }
- return nullChar, false, err
- }
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = c
- l.reachedEOF = false
- return l.lastChar, l.reachedEOF, nil
-}
-
-func (l *lexer) restore() error {
- if l.lastChar == nullChar && !l.reachedEOF {
- return fmt.Errorf("failed to call restore() because the last character is null")
- }
- l.peekChar2 = l.peekChar1
- l.peekEOF2 = l.peekEOF1
- l.peekChar1 = l.lastChar
- l.peekEOF1 = l.reachedEOF
- l.lastChar = l.prevChar1
- l.reachedEOF = l.prevEOF1
- l.prevChar1 = l.prevChar2
- l.prevEOF1 = l.pervEOF2
- l.prevChar2 = nullChar
- l.pervEOF2 = false
- return nil
-}
diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go
deleted file mode 100644
index 425b553..0000000
--- a/src/urubu/grammar/lexical/parser/parser.go
+++ /dev/null
@@ -1,531 +0,0 @@
-package parser
-
-import (
- "bytes"
- "fmt"
- "io"
- "strconv"
-
- spec "urubu/spec/grammar"
- "urubu/ucd"
-)
-
-type PatternEntry struct {
- ID spec.LexModeKindID
- Pattern []byte
-}
-
-type parser struct {
- kind spec.LexKindName
- lex *lexer
- peekedTok *token
- lastTok *token
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-
- errCause error
- errDetail string
-}
-
-func NewParser(kind spec.LexKindName, src io.Reader) *parser {
- return &parser{
- kind: kind,
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) Error() (string, error) {
- return p.errDetail, p.errCause
-}
-
-func (p *parser) Parse() (root CPTree, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- panic(err)
- }
- return
- }
- }()
-
- return newRootNode(p.kind, p.parseRegexp()), nil
-}
-
-func (p *parser) parseRegexp() CPTree {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.raiseParseError(synErrNullPattern, "")
- }
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.expect(tokenKindEOF)
- return alt
-}
-
-func (p *parser) parseAlt() CPTree {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() CPTree {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() CPTree {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.raiseParseError(synErrRepNoTarget, "* needs an operand")
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
- }
- if p.consume(tokenKindOption) {
- p.raiseParseError(synErrRepNoTarget, "? needs an operand")
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() CPTree {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- p.raiseParseError(synErrGroupNoElem, "")
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- if !p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupInvalidForm, "")
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() CPTree {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- p.raiseParseError(synErrBExpInvalidForm, "")
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() CPTree {
- var left CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- left = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- left = p.parseCharProp()
- if p.consume(tokenKindCharRange) {
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- }
- default:
- left = p.parseNormalChar()
- }
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- var right CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- right = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- default:
- right = p.parseNormalChar()
- }
- if right == nil {
- p.raiseParseError(synErrRangeInvalidForm, "")
- }
- from, _, _ := left.Range()
- _, to, _ := right.Range()
- if !isValidOrder(from, to) {
- p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
- }
- return newRangeSymbolNode(from, to)
-}
-
-func (p *parser) parseCodePoint() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
- if !p.consume(tokenKindCodePoint) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
- }
- if n < 0x0000 || n > 0x10FFFF {
- p.raiseParseError(synErrCPExpOutOfRange, "")
- }
-
- sym := newSymbolNode(rune(n))
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- return sym
-}
-
-func (p *parser) parseCharProp() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt CPTree
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.raiseParseError(synErrCharPropUnsupported, propName)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if pat != "" {
- p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.Parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if inverse {
- r := cpRanges[0]
- alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for _, r := range cpRanges[1:] {
- alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- } else {
- for _, r := range cpRanges {
- alt = genAltNode(
- alt,
- newRangeSymbolNode(r.From, r.To),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- if !p.consume(tokenKindFragmentSymbol) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
-
- return newFragmentNode(spec.LexKindName(sym), nil)
-}
-
-func (p *parser) parseNormalChar() CPTree {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return newSymbolNode(p.lastTok.char)
-}
-
-func exclude(symbol, base CPTree) CPTree {
- if left, right, ok := symbol.Alternatives(); ok {
- return exclude(right, exclude(left, base))
- }
-
- if left, right, ok := base.Alternatives(); ok {
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- }
-
- if bFrom, bTo, ok := base.Range(); ok {
- sFrom, sTo, ok := symbol.Range()
- if !ok {
- panic(fmt.Errorf("invalid symbol tree: %T", symbol))
- }
-
- switch {
- case sFrom > bFrom && sTo < bTo:
- return genAltNode(
- newRangeSymbolNode(bFrom, sFrom-1),
- newRangeSymbolNode(sTo+1, bTo),
- )
- case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
- return newRangeSymbolNode(sTo+1, bTo)
- case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
- return newRangeSymbolNode(bFrom, sFrom-1)
- case sFrom <= bFrom && sTo >= bTo:
- return nil
- default:
- return base
- }
- }
-
- panic(fmt.Errorf("invalid base tree: %T", base))
-}
-
-func genAnyCharAST() CPTree {
- return newRangeSymbolNode(0x0, 0x10FFFF)
-}
-
-func isValidOrder(from, to rune) bool {
- return from <= to
-}
-
-func genConcatNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- if err == ParseErr {
- detail, cause := p.lex.error()
- p.raiseParseError(cause, detail)
- }
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
-
-func (p *parser) raiseParseError(err error, detail string) {
- p.errCause = err
- p.errDetail = detail
- panic(ParseErr)
-}
diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go
deleted file mode 100644
index df03d37..0000000
--- a/src/urubu/grammar/lexical/parser/tree.go
+++ /dev/null
@@ -1,459 +0,0 @@
-package parser
-
-import (
- "fmt"
- "io"
- "sort"
-
- spec "urubu/spec/grammar"
-)
-
-type CPRange struct {
- From rune
- To rune
-}
-
-type CPTree interface {
- fmt.Stringer
- Range() (rune, rune, bool)
- Optional() (CPTree, bool)
- Repeatable() (CPTree, bool)
- Concatenation() (CPTree, CPTree, bool)
- Alternatives() (CPTree, CPTree, bool)
- Describe() (spec.LexKindName, []spec.LexKindName, error)
-
- children() (CPTree, CPTree)
- clone() CPTree
-}
-
-var (
- _ CPTree = &rootNode{}
- _ CPTree = &symbolNode{}
- _ CPTree = &concatNode{}
- _ CPTree = &altNode{}
- _ CPTree = &quantifierNode{}
- _ CPTree = &fragmentNode{}
-)
-
-type rootNode struct {
- kind spec.LexKindName
- tree CPTree
- fragments map[spec.LexKindName][]*fragmentNode
-}
-
-func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
- fragments := map[spec.LexKindName][]*fragmentNode{}
- collectFragments(t, fragments)
-
- return &rootNode{
- kind: kind,
- tree: t,
- fragments: fragments,
- }
-}
-
-func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
- if n == nil {
- return
- }
-
- if f, ok := n.(*fragmentNode); ok {
- fragments[f.kind] = append(fragments[f.kind], f)
- return
- }
-
- l, r := n.children()
- collectFragments(l, fragments)
- collectFragments(r, fragments)
-}
-
-func (n *rootNode) String() string {
- return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
-}
-
-func (n *rootNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *rootNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *rootNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- var frags []spec.LexKindName
- for f := range n.fragments {
- frags = append(frags, spec.LexKindName(f))
- }
- sort.Slice(frags, func(i, j int) bool {
- return frags[i] < frags[j]
- })
-
- return n.kind, frags, nil
-}
-
-func (n *rootNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *rootNode) clone() CPTree {
- return n.tree.clone()
-}
-
-func (n *rootNode) incomplete() bool {
- return len(n.fragments) > 0
-}
-
-func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
- root, ok := fragment.(*rootNode)
- if !ok {
- return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
- }
- if root.incomplete() {
- return fmt.Errorf("fragment is incomplete")
- }
-
- fs, ok := n.fragments[kind]
- if !ok {
- return nil
- }
- for _, f := range fs {
- f.tree = root.clone()
- }
- delete(n.fragments, kind)
-
- return nil
-}
-
-type symbolNode struct {
- CPRange
-}
-
-func newSymbolNode(cp rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: cp,
- To: cp,
- },
- }
-}
-
-func newRangeSymbolNode(from, to rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: from,
- To: to,
- },
- }
-}
-
-func (n *symbolNode) String() string {
- return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
-}
-
-func (n *symbolNode) Range() (rune, rune, bool) {
- return n.From, n.To, true
-}
-
-func (n *symbolNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *symbolNode) children() (CPTree, CPTree) {
- return nil, nil
-}
-
-func (n *symbolNode) clone() CPTree {
- return newRangeSymbolNode(n.From, n.To)
-}
-
-type concatNode struct {
- left CPTree
- right CPTree
-}
-
-func newConcatNode(left, right CPTree) *concatNode {
- return &concatNode{
- left: left,
- right: right,
- }
-}
-
-func (n *concatNode) String() string {
- return "concat"
-}
-
-func (n *concatNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *concatNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *concatNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *concatNode) clone() CPTree {
- if n == nil {
- return nil
- }
- return newConcatNode(n.left.clone(), n.right.clone())
-}
-
-type altNode struct {
- left CPTree
- right CPTree
-}
-
-func newAltNode(left, right CPTree) *altNode {
- return &altNode{
- left: left,
- right: right,
- }
-}
-
-func (n *altNode) String() string {
- return "alt"
-}
-
-func (n *altNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *altNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *altNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *altNode) clone() CPTree {
- return newAltNode(n.left.clone(), n.right.clone())
-}
-
-type quantifierNode struct {
- optional bool
- repeatable bool
- tree CPTree
-}
-
-func (n *quantifierNode) String() string {
- switch {
- case n.repeatable:
- return "repeatable (>= 0 times)"
- case n.optional:
- return "optional (0 or 1 times)"
- default:
- return "invalid quantifier"
- }
-}
-
-func newRepeatNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- repeatable: true,
- tree: t,
- }
-}
-
-func newRepeatOneOrMoreNode(t CPTree) *concatNode {
- return newConcatNode(
- t,
- &quantifierNode{
- repeatable: true,
- tree: t.clone(),
- })
-}
-
-func newOptionNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- optional: true,
- tree: t,
- }
-}
-
-func (n *quantifierNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *quantifierNode) Optional() (CPTree, bool) {
- return n.tree, n.optional
-}
-
-func (n *quantifierNode) Repeatable() (CPTree, bool) {
- return n.tree, n.repeatable
-}
-
-func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *quantifierNode) children() (CPTree, CPTree) {
- return n.tree, nil
-}
-
-func (n *quantifierNode) clone() CPTree {
- if n.repeatable {
- return newRepeatNode(n.tree.clone())
- }
- return newOptionNode(n.tree.clone())
-}
-
-type fragmentNode struct {
- kind spec.LexKindName
- tree CPTree
-}
-
-func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
- return &fragmentNode{
- kind: kind,
- tree: t,
- }
-}
-
-func (n *fragmentNode) String() string {
- return fmt.Sprintf("fragment: %v", n.kind)
-}
-
-func (n *fragmentNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *fragmentNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *fragmentNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *fragmentNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *fragmentNode) clone() CPTree {
- if n.tree == nil {
- return newFragmentNode(n.kind, nil)
- }
- return newFragmentNode(n.kind, n.tree.clone())
-}
-
-//nolint:unused
-func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
- if t == nil {
- return
- }
- fmt.Fprintf(w, "%v%v\n", ruledLine, t)
- children := []CPTree{}
- switch n := t.(type) {
- case *rootNode:
- children = append(children, n.tree)
- case *fragmentNode:
- children = append(children, n.tree)
- default:
- left, right := t.children()
- if left != nil {
- children = append(children, left)
- }
- if right != nil {
- children = append(children, right)
- }
- }
- num := len(children)
- for i, child := range children {
- line := "└─ "
- if num > 1 {
- if i == 0 {
- line = "├─ "
- } else if i < num-1 {
- line = "│ "
- }
- }
- prefix := "│ "
- if i >= num-1 {
- prefix = " "
- }
- printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
- }
-}
diff --git a/src/urubu/grammar/lr0.go b/src/urubu/grammar/lr0.go
deleted file mode 100644
index 92a2137..0000000
--- a/src/urubu/grammar/lr0.go
+++ /dev/null
@@ -1,197 +0,0 @@
-package grammar
-
-import (
- "fmt"
- "sort"
-
- "urubu/grammar/symbol"
-)
-
-type lr0Automaton struct {
- initialState kernelID
- states map[kernelID]*lrState
-}
-
-func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) {
- if !startSym.IsStart() {
- return nil, fmt.Errorf("passed symbold is not a start symbol")
- }
-
- automaton := &lr0Automaton{
- states: map[kernelID]*lrState{},
- }
-
- currentState := stateNumInitial
- knownKernels := map[kernelID]struct{}{}
- uncheckedKernels := []*kernel{}
-
- // Generate an initial kernel.
- {
- prods, _ := prods.findByLHS(startSym)
- initialItem, err := newLR0Item(prods[0], 0)
- if err != nil {
- return nil, err
- }
-
- k, err := newKernel([]*lrItem{initialItem})
- if err != nil {
- return nil, err
- }
-
- automaton.initialState = k.id
- knownKernels[k.id] = struct{}{}
- uncheckedKernels = append(uncheckedKernels, k)
- }
-
- for len(uncheckedKernels) > 0 {
- nextUncheckedKernels := []*kernel{}
- for _, k := range uncheckedKernels {
- state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym)
- if err != nil {
- return nil, err
- }
- state.num = currentState
- currentState = currentState.next()
-
- automaton.states[state.id] = state
-
- for _, k := range neighbours {
- if _, known := knownKernels[k.id]; known {
- continue
- }
- knownKernels[k.id] = struct{}{}
- nextUncheckedKernels = append(nextUncheckedKernels, k)
- }
- }
- uncheckedKernels = nextUncheckedKernels
- }
-
- return automaton, nil
-}
-
-func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) {
- items, err := genLR0Closure(k, prods)
- if err != nil {
- return nil, nil, err
- }
- neighbours, err := genNeighbourKernels(items, prods)
- if err != nil {
- return nil, nil, err
- }
-
- next := map[symbol.Symbol]kernelID{}
- kernels := []*kernel{}
- for _, n := range neighbours {
- next[n.symbol] = n.kernel.id
- kernels = append(kernels, n.kernel)
- }
-
- reducible := map[productionID]struct{}{}
- var emptyProdItems []*lrItem
- isErrorTrapper := false
- for _, item := range items {
- if item.dottedSymbol == errSym {
- isErrorTrapper = true
- }
-
- if item.reducible {
- reducible[item.prod] = struct{}{}
-
- prod, ok := prods.findByID(item.prod)
- if !ok {
- return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod)
- }
- if prod.isEmpty() {
- emptyProdItems = append(emptyProdItems, item)
- }
- }
- }
-
- return &lrState{
- kernel: k,
- next: next,
- reducible: reducible,
- emptyProdItems: emptyProdItems,
- isErrorTrapper: isErrorTrapper,
- }, kernels, nil
-}
-
-func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) {
- items := []*lrItem{}
- knownItems := map[lrItemID]struct{}{}
- uncheckedItems := []*lrItem{}
- for _, item := range k.items {
- items = append(items, item)
- uncheckedItems = append(uncheckedItems, item)
- }
- for len(uncheckedItems) > 0 {
- nextUncheckedItems := []*lrItem{}
- for _, item := range uncheckedItems {
- if item.dottedSymbol.IsTerminal() {
- continue
- }
-
- ps, _ := prods.findByLHS(item.dottedSymbol)
- for _, prod := range ps {
- item, err := newLR0Item(prod, 0)
- if err != nil {
- return nil, err
- }
- if _, exist := knownItems[item.id]; exist {
- continue
- }
- items = append(items, item)
- knownItems[item.id] = struct{}{}
- nextUncheckedItems = append(nextUncheckedItems, item)
- }
- }
- uncheckedItems = nextUncheckedItems
- }
-
- return items, nil
-}
-
-type neighbourKernel struct {
- symbol symbol.Symbol
- kernel *kernel
-}
-
-func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) {
- kItemMap := map[symbol.Symbol][]*lrItem{}
- for _, item := range items {
- if item.dottedSymbol.IsNil() {
- continue
- }
- prod, ok := prods.findByID(item.prod)
- if !ok {
- return nil, fmt.Errorf("a production was not found: %v", item.prod)
- }
- kItem, err := newLR0Item(prod, item.dot+1)
- if err != nil {
- return nil, err
- }
- kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem)
- }
-
- nextSyms := []symbol.Symbol{}
- for sym := range kItemMap {
- nextSyms = append(nextSyms, sym)
- }
- sort.Slice(nextSyms, func(i, j int) bool {
- return nextSyms[i] < nextSyms[j]
- })
-
- kernels := []*neighbourKernel{}
- for _, sym := range nextSyms {
- k, err := newKernel(kItemMap[sym])
- if err != nil {
- return nil, err
- }
- kernels = append(kernels, &neighbourKernel{
- symbol: sym,
- kernel: k,
- })
- }
-
- return kernels, nil
-}
diff --git a/src/urubu/grammar/parsing_table.go b/src/urubu/grammar/parsing_table.go
deleted file mode 100644
index 48ea9fe..0000000
--- a/src/urubu/grammar/parsing_table.go
+++ /dev/null
@@ -1,553 +0,0 @@
-package grammar
-
-import (
- "fmt"
- "sort"
-
- "urubu/grammar/symbol"
- spec "urubu/spec/grammar"
-)
-
-type ActionType string
-
-const (
- ActionTypeShift = ActionType("shift")
- ActionTypeReduce = ActionType("reduce")
- ActionTypeError = ActionType("error")
-)
-
-type actionEntry int
-
-const actionEntryEmpty = actionEntry(0)
-
-func newShiftActionEntry(state stateNum) actionEntry {
- return actionEntry(state * -1)
-}
-
-func newReduceActionEntry(prod productionNum) actionEntry {
- return actionEntry(prod)
-}
-
-func (e actionEntry) isEmpty() bool {
- return e == actionEntryEmpty
-}
-
-func (e actionEntry) describe() (ActionType, stateNum, productionNum) {
- if e == actionEntryEmpty {
- return ActionTypeError, stateNumInitial, productionNumNil
- }
- if e < 0 {
- return ActionTypeShift, stateNum(e * -1), productionNumNil
- }
- return ActionTypeReduce, stateNumInitial, productionNum(e)
-}
-
-type GoToType string
-
-const (
- GoToTypeRegistered = GoToType("registered")
- GoToTypeError = GoToType("error")
-)
-
-type goToEntry uint
-
-const goToEntryEmpty = goToEntry(0)
-
-func newGoToEntry(state stateNum) goToEntry {
- return goToEntry(state)
-}
-
-func (e goToEntry) describe() (GoToType, stateNum) {
- if e == goToEntryEmpty {
- return GoToTypeError, stateNumInitial
- }
- return GoToTypeRegistered, stateNum(e)
-}
-
-type conflictResolutionMethod int
-
-func (m conflictResolutionMethod) Int() int {
- return int(m)
-}
-
-const (
- ResolvedByPrec conflictResolutionMethod = 1
- ResolvedByAssoc conflictResolutionMethod = 2
- ResolvedByShift conflictResolutionMethod = 3
- ResolvedByProdOrder conflictResolutionMethod = 4
-)
-
-type conflict interface {
- conflict()
-}
-
-type shiftReduceConflict struct {
- state stateNum
- sym symbol.Symbol
- nextState stateNum
- prodNum productionNum
- resolvedBy conflictResolutionMethod
-}
-
-func (c *shiftReduceConflict) conflict() {
-}
-
-type reduceReduceConflict struct {
- state stateNum
- sym symbol.Symbol
- prodNum1 productionNum
- prodNum2 productionNum
- resolvedBy conflictResolutionMethod
-}
-
-func (c *reduceReduceConflict) conflict() {
-}
-
-var (
- _ conflict = &shiftReduceConflict{}
- _ conflict = &reduceReduceConflict{}
-)
-
-type ParsingTable struct {
- actionTable []actionEntry
- goToTable []goToEntry
- stateCount int
- terminalCount int
- nonTerminalCount int
-
- // errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`,
- // the state has an item having the following form. The `α` and `β` can be empty.
- //
- // A → α・error β
- errorTrapperStates []int
-
- InitialState stateNum
-}
-
-func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) {
- pos := state.Int()*t.terminalCount + sym.Int()
- return t.actionTable[pos].describe()
-}
-
-func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) {
- pos := state.Int()*t.nonTerminalCount + sym.Int()
- return t.goToTable[pos].describe()
-}
-
-func (t *ParsingTable) readAction(row int, col int) actionEntry {
- return t.actionTable[row*t.terminalCount+col]
-}
-
-func (t *ParsingTable) writeAction(row int, col int, act actionEntry) {
- t.actionTable[row*t.terminalCount+col] = act
-}
-
-func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) {
- pos := state.Int()*t.nonTerminalCount + sym.Num().Int()
- t.goToTable[pos] = newGoToEntry(nextState)
-}
-
-type lrTableBuilder struct {
- automaton *lr0Automaton
- prods *productionSet
- termCount int
- nonTermCount int
- symTab *symbol.SymbolTableReader
- precAndAssoc *precAndAssoc
-
- conflicts []conflict
-}
-
-func (b *lrTableBuilder) build() (*ParsingTable, error) {
- var ptab *ParsingTable
- {
- initialState := b.automaton.states[b.automaton.initialState]
- ptab = &ParsingTable{
- actionTable: make([]actionEntry, len(b.automaton.states)*b.termCount),
- goToTable: make([]goToEntry, len(b.automaton.states)*b.nonTermCount),
- stateCount: len(b.automaton.states),
- terminalCount: b.termCount,
- nonTerminalCount: b.nonTermCount,
- errorTrapperStates: make([]int, len(b.automaton.states)),
- InitialState: initialState.num,
- }
- }
-
- for _, state := range b.automaton.states {
- if state.isErrorTrapper {
- ptab.errorTrapperStates[state.num] = 1
- }
-
- for sym, kID := range state.next {
- nextState := b.automaton.states[kID]
- if sym.IsTerminal() {
- b.writeShiftAction(ptab, state.num, sym, nextState.num)
- } else {
- ptab.writeGoTo(state.num, sym, nextState.num)
- }
- }
-
- for prodID := range state.reducible {
- reducibleProd, ok := b.prods.findByID(prodID)
- if !ok {
- return nil, fmt.Errorf("reducible production not found: %v", prodID)
- }
-
- var reducibleItem *lrItem
- for _, item := range state.items {
- if item.prod != reducibleProd.id {
- continue
- }
-
- reducibleItem = item
- break
- }
- if reducibleItem == nil {
- for _, item := range state.emptyProdItems {
- if item.prod != reducibleProd.id {
- continue
- }
-
- reducibleItem = item
- break
- }
- if reducibleItem == nil {
- return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num)
- }
- }
-
- for a := range reducibleItem.lookAhead.symbols {
- b.writeReduceAction(ptab, state.num, a, reducibleProd.num)
- }
- }
- }
-
- return ptab, nil
-}
-
-// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred,
-// we prioritize the shift action.
-func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) {
- act := tab.readAction(state.Int(), sym.Num().Int())
- if !act.isEmpty() {
- ty, _, p := act.describe()
- if ty == ActionTypeReduce {
- act, method := b.resolveSRConflict(sym.Num(), p)
- b.conflicts = append(b.conflicts, &shiftReduceConflict{
- state: state,
- sym: sym,
- nextState: nextState,
- prodNum: p,
- resolvedBy: method,
- })
- if act == ActionTypeShift {
- tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
- }
- return
- }
- }
- tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
-}
-
-// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred,
-// we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces
-// the production with higher priority. Productions defined earlier in the grammar file have a higher priority.
-func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) {
- act := tab.readAction(state.Int(), sym.Num().Int())
- if !act.isEmpty() {
- ty, s, p := act.describe()
- switch ty {
- case ActionTypeReduce:
- if p == prod {
- return
- }
-
- b.conflicts = append(b.conflicts, &reduceReduceConflict{
- state: state,
- sym: sym,
- prodNum1: p,
- prodNum2: prod,
- resolvedBy: ResolvedByProdOrder,
- })
- if p < prod {
- tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p))
- } else {
- tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
- }
- case ActionTypeShift:
- act, method := b.resolveSRConflict(sym.Num(), prod)
- b.conflicts = append(b.conflicts, &shiftReduceConflict{
- state: state,
- sym: sym,
- nextState: s,
- prodNum: prod,
- resolvedBy: method,
- })
- if act == ActionTypeReduce {
- tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
- }
- }
- return
- }
- tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
-}
-
-func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) {
- symPrec := b.precAndAssoc.terminalPrecedence(sym)
- prodPrec := b.precAndAssoc.productionPredence(prod)
- if symPrec == 0 || prodPrec == 0 {
- return ActionTypeShift, ResolvedByShift
- }
- if symPrec == prodPrec {
- assoc := b.precAndAssoc.productionAssociativity(prod)
- if assoc != assocTypeLeft {
- return ActionTypeShift, ResolvedByAssoc
- }
- return ActionTypeReduce, ResolvedByAssoc
- }
- if symPrec < prodPrec {
- return ActionTypeShift, ResolvedByPrec
- }
- return ActionTypeReduce, ResolvedByPrec
-}
-
-func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) {
- var terms []*spec.Terminal
- {
- termSyms := b.symTab.TerminalSymbols()
- terms = make([]*spec.Terminal, len(termSyms)+1)
-
- for _, sym := range termSyms {
- name, ok := b.symTab.ToText(sym)
- if !ok {
- return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym)
- }
-
- term := &spec.Terminal{
- Number: sym.Num().Int(),
- Name: name,
- }
-
- prec := b.precAndAssoc.terminalPrecedence(sym.Num())
- if prec != precNil {
- term.Precedence = prec
- }
-
- assoc := b.precAndAssoc.terminalAssociativity(sym.Num())
- switch assoc {
- case assocTypeLeft:
- term.Associativity = "l"
- case assocTypeRight:
- term.Associativity = "r"
- }
-
- terms[sym.Num()] = term
- }
- }
-
- var nonTerms []*spec.NonTerminal
- {
- nonTermSyms := b.symTab.NonTerminalSymbols()
- nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1)
- for _, sym := range nonTermSyms {
- name, ok := b.symTab.ToText(sym)
- if !ok {
- return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym)
- }
-
- nonTerms[sym.Num()] = &spec.NonTerminal{
- Number: sym.Num().Int(),
- Name: name,
- }
- }
- }
-
- var prods []*spec.Production
- {
- ps := gram.productionSet.getAllProductions()
- prods = make([]*spec.Production, len(ps)+1)
- for _, p := range ps {
- rhs := make([]int, len(p.rhs))
- for i, e := range p.rhs {
- if e.IsTerminal() {
- rhs[i] = e.Num().Int()
- } else {
- rhs[i] = e.Num().Int() * -1
- }
- }
-
- prod := &spec.Production{
- Number: p.num.Int(),
- LHS: p.lhs.Num().Int(),
- RHS: rhs,
- }
-
- prec := b.precAndAssoc.productionPredence(p.num)
- if prec != precNil {
- prod.Precedence = prec
- }
-
- assoc := b.precAndAssoc.productionAssociativity(p.num)
- switch assoc {
- case assocTypeLeft:
- prod.Associativity = "l"
- case assocTypeRight:
- prod.Associativity = "r"
- }
-
- prods[p.num.Int()] = prod
- }
- }
-
- var states []*spec.State
- {
- srConflicts := map[stateNum][]*shiftReduceConflict{}
- rrConflicts := map[stateNum][]*reduceReduceConflict{}
- for _, con := range b.conflicts {
- switch c := con.(type) {
- case *shiftReduceConflict:
- srConflicts[c.state] = append(srConflicts[c.state], c)
- case *reduceReduceConflict:
- rrConflicts[c.state] = append(rrConflicts[c.state], c)
- }
- }
-
- states = make([]*spec.State, len(b.automaton.states))
- for _, s := range b.automaton.states {
- kernel := make([]*spec.Item, len(s.items))
- for i, item := range s.items {
- p, ok := b.prods.findByID(item.prod)
- if !ok {
- return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod)
- }
-
- kernel[i] = &spec.Item{
- Production: p.num.Int(),
- Dot: item.dot,
- }
- }
-
- sort.Slice(kernel, func(i, j int) bool {
- if kernel[i].Production < kernel[j].Production {
- return true
- }
- if kernel[i].Production > kernel[j].Production {
- return false
- }
- return kernel[i].Dot < kernel[j].Dot
- })
-
- var shift []*spec.Transition
- var reduce []*spec.Reduce
- var goTo []*spec.Transition
- {
- TERMINALS_LOOP:
- for _, t := range b.symTab.TerminalSymbols() {
- act, next, prod := tab.getAction(s.num, t.Num())
- switch act {
- case ActionTypeShift:
- shift = append(shift, &spec.Transition{
- Symbol: t.Num().Int(),
- State: next.Int(),
- })
- case ActionTypeReduce:
- for _, r := range reduce {
- if r.Production == prod.Int() {
- r.LookAhead = append(r.LookAhead, t.Num().Int())
- continue TERMINALS_LOOP
- }
- }
- reduce = append(reduce, &spec.Reduce{
- LookAhead: []int{t.Num().Int()},
- Production: prod.Int(),
- })
- }
- }
-
- for _, n := range b.symTab.NonTerminalSymbols() {
- ty, next := tab.getGoTo(s.num, n.Num())
- if ty == GoToTypeRegistered {
- goTo = append(goTo, &spec.Transition{
- Symbol: n.Num().Int(),
- State: next.Int(),
- })
- }
- }
-
- sort.Slice(shift, func(i, j int) bool {
- return shift[i].State < shift[j].State
- })
- sort.Slice(reduce, func(i, j int) bool {
- return reduce[i].Production < reduce[j].Production
- })
- sort.Slice(goTo, func(i, j int) bool {
- return goTo[i].State < goTo[j].State
- })
- }
-
- sr := []*spec.SRConflict{}
- rr := []*spec.RRConflict{}
- {
- for _, c := range srConflicts[s.num] {
- conflict := &spec.SRConflict{
- Symbol: c.sym.Num().Int(),
- State: c.nextState.Int(),
- Production: c.prodNum.Int(),
- ResolvedBy: c.resolvedBy.Int(),
- }
-
- ty, s, p := tab.getAction(s.num, c.sym.Num())
- switch ty {
- case ActionTypeShift:
- n := s.Int()
- conflict.AdoptedState = &n
- case ActionTypeReduce:
- n := p.Int()
- conflict.AdoptedProduction = &n
- }
-
- sr = append(sr, conflict)
- }
-
- sort.Slice(sr, func(i, j int) bool {
- return sr[i].Symbol < sr[j].Symbol
- })
-
- for _, c := range rrConflicts[s.num] {
- conflict := &spec.RRConflict{
- Symbol: c.sym.Num().Int(),
- Production1: c.prodNum1.Int(),
- Production2: c.prodNum2.Int(),
- ResolvedBy: c.resolvedBy.Int(),
- }
-
- _, _, p := tab.getAction(s.num, c.sym.Num())
- conflict.AdoptedProduction = p.Int()
-
- rr = append(rr, conflict)
- }
-
- sort.Slice(rr, func(i, j int) bool {
- return rr[i].Symbol < rr[j].Symbol
- })
- }
-
- states[s.num.Int()] = &spec.State{
- Number: s.num.Int(),
- Kernel: kernel,
- Shift: shift,
- Reduce: reduce,
- GoTo: goTo,
- SRConflict: sr,
- RRConflict: rr,
- }
- }
- }
-
- return &spec.Report{
- Terminals: terms,
- NonTerminals: nonTerms,
- Productions: prods,
- States: states,
- }, nil
-}
diff --git a/src/urubu/grammar/production.go b/src/urubu/grammar/production.go
deleted file mode 100644
index 8f6c103..0000000
--- a/src/urubu/grammar/production.go
+++ /dev/null
@@ -1,117 +0,0 @@
-package grammar
-
-import (
- "crypto/sha256"
- "encoding/hex"
- "fmt"
-
- "urubu/grammar/symbol"
-)
-
-type productionID [32]byte
-
-func (id productionID) String() string {
- return hex.EncodeToString(id[:])
-}
-
-func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID {
- seq := lhs.Byte()
- for _, sym := range rhs {
- seq = append(seq, sym.Byte()...)
- }
- return productionID(sha256.Sum256(seq))
-}
-
-type productionNum uint16
-
-const (
- productionNumNil = productionNum(0)
- productionNumStart = productionNum(1)
- productionNumMin = productionNum(2)
-)
-
-func (n productionNum) Int() int {
- return int(n)
-}
-
-type production struct {
- id productionID
- num productionNum
- lhs symbol.Symbol
- rhs []symbol.Symbol
- rhsLen int
-}
-
-func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) {
- if lhs.IsNil() {
- return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
- }
- for _, sym := range rhs {
- if sym.IsNil() {
- return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
- }
- }
-
- return &production{
- id: genProductionID(lhs, rhs),
- lhs: lhs,
- rhs: rhs,
- rhsLen: len(rhs),
- }, nil
-}
-
-func (p *production) isEmpty() bool {
- return p.rhsLen == 0
-}
-
-type productionSet struct {
- lhs2Prods map[symbol.Symbol][]*production
- id2Prod map[productionID]*production
- num productionNum
-}
-
-func newProductionSet() *productionSet {
- return &productionSet{
- lhs2Prods: map[symbol.Symbol][]*production{},
- id2Prod: map[productionID]*production{},
- num: productionNumMin,
- }
-}
-
-func (ps *productionSet) append(prod *production) {
- if _, ok := ps.id2Prod[prod.id]; ok {
- return
- }
-
- if prod.lhs.IsStart() {
- prod.num = productionNumStart
- } else {
- prod.num = ps.num
- ps.num++
- }
-
- if prods, ok := ps.lhs2Prods[prod.lhs]; ok {
- ps.lhs2Prods[prod.lhs] = append(prods, prod)
- } else {
- ps.lhs2Prods[prod.lhs] = []*production{prod}
- }
- ps.id2Prod[prod.id] = prod
-}
-
-func (ps *productionSet) findByID(id productionID) (*production, bool) {
- prod, ok := ps.id2Prod[id]
- return prod, ok
-}
-
-func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) {
- if lhs.IsNil() {
- return nil, false
- }
-
- prods, ok := ps.lhs2Prods[lhs]
- return prods, ok
-}
-
-func (ps *productionSet) getAllProductions() map[productionID]*production {
- return ps.id2Prod
-}
diff --git a/src/urubu/grammar/semantic_error.go b/src/urubu/grammar/semantic_error.go
deleted file mode 100644
index 88a6b17..0000000
--- a/src/urubu/grammar/semantic_error.go
+++ /dev/null
@@ -1,30 +0,0 @@
-package grammar
-
-import "errors"
-
-var (
- semErrNoGrammarName = errors.New("name is missing")
- semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling")
- semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol")
- semErrUndefinedPrec = errors.New("symbol must has precedence")
- semErrUndefinedOrdSym = errors.New("undefined ordered symbol")
- semErrUnusedProduction = errors.New("unused production")
- semErrUnusedTerminal = errors.New("unused terminal")
- semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped")
- semErrNoProduction = errors.New("a grammar needs at least one production")
- semErrUndefinedSym = errors.New("undefined symbol")
- semErrDuplicateProduction = errors.New("duplicate production")
- semErrDuplicateTerminal = errors.New("duplicate terminal")
- semErrDuplicateFragment = errors.New("duplicate fragment")
- semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals")
- semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol")
- semErrDuplicateLabel = errors.New("a label must be unique in an alternative")
- semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols")
- semErrDirInvalidName = errors.New("invalid directive name")
- semErrDirInvalidParam = errors.New("invalid parameter")
- semErrDuplicateDir = errors.New("a directive must not be duplicated")
- semErrDuplicateElem = errors.New("duplicate element")
- semErrAmbiguousElem = errors.New("ambiguous element")
- semErrInvalidProdDir = errors.New("invalid production directive")
- semErrInvalidAltDir = errors.New("invalid alternative directive")
-)
diff --git a/src/urubu/grammar/symbol/symbol.go b/src/urubu/grammar/symbol.go
index f9e6a93..f9e6a93 100644
--- a/src/urubu/grammar/symbol/symbol.go
+++ b/src/urubu/grammar/symbol.go
diff --git a/src/urubu/spec/grammar/grammar.go b/src/urubu/spec/grammar.go
index bf1ea89..c2708d8 100644
--- a/src/urubu/spec/grammar/grammar.go
+++ b/src/urubu/spec/grammar.go
@@ -1,6 +1,79 @@
package grammar
-import "strconv"
+import (
+ "strconv"
+ "strings"
+)
+
+type Terminal struct {
+ Number int `json:"number"`
+ Name string `json:"name"`
+ Pattern string `json:"pattern"`
+ Precedence int `json:"prec"`
+ Associativity string `json:"assoc"`
+}
+
+type NonTerminal struct {
+ Number int `json:"number"`
+ Name string `json:"name"`
+}
+
+type Production struct {
+ Number int `json:"number"`
+ LHS int `json:"lhs"`
+ RHS []int `json:"rhs"`
+ Precedence int `json:"prec"`
+ Associativity string `json:"assoc"`
+}
+
+type Item struct {
+ Production int `json:"production"`
+ Dot int `json:"dot"`
+}
+
+type Transition struct {
+ Symbol int `json:"symbol"`
+ State int `json:"state"`
+}
+
+type Reduce struct {
+ LookAhead []int `json:"look_ahead"`
+ Production int `json:"production"`
+}
+
+type SRConflict struct {
+ Symbol int `json:"symbol"`
+ State int `json:"state"`
+ Production int `json:"production"`
+ AdoptedState *int `json:"adopted_state"`
+ AdoptedProduction *int `json:"adopted_production"`
+ ResolvedBy int `json:"resolved_by"`
+}
+
+type RRConflict struct {
+ Symbol int `json:"symbol"`
+ Production1 int `json:"production_1"`
+ Production2 int `json:"production_2"`
+ AdoptedProduction int `json:"adopted_production"`
+ ResolvedBy int `json:"resolved_by"`
+}
+
+type State struct {
+ Number int `json:"number"`
+ Kernel []*Item `json:"kernel"`
+ Shift []*Transition `json:"shift"`
+ Reduce []*Reduce `json:"reduce"`
+ GoTo []*Transition `json:"goto"`
+ SRConflict []*SRConflict `json:"sr_conflict"`
+ RRConflict []*RRConflict `json:"rr_conflict"`
+}
+
+type Report struct {
+ Terminals []*Terminal `json:"terminals"`
+ NonTerminals []*NonTerminal `json:"non_terminals"`
+ Productions []*Production `json:"productions"`
+ States []*State `json:"states"`
+}
type CompiledGrammar struct {
Name string `json:"name"`
@@ -158,3 +231,21 @@ type SyntacticSpec struct {
type ASTAction struct {
Entries [][]int `json:"entries"`
}
+
+var rep = strings.NewReplacer(
+ `.`, `\.`,
+ `*`, `\*`,
+ `+`, `\+`,
+ `?`, `\?`,
+ `|`, `\|`,
+ `(`, `\(`,
+ `)`, `\)`,
+ `[`, `\[`,
+ `\`, `\\`,
+)
+
+// EscapePattern escapes the special characters.
+// For example, EscapePattern(`+`) returns `\+`.
+func EscapePattern(s string) string {
+ return rep.Replace(s)
+}
diff --git a/src/urubu/spec/grammar/parser/clexspec.json b/src/urubu/spec/grammar/clexspec.json
index d0ed3d3..d0ed3d3 100644
--- a/src/urubu/spec/grammar/parser/clexspec.json
+++ b/src/urubu/spec/grammar/clexspec.json
diff --git a/src/urubu/spec/grammar/description.go b/src/urubu/spec/grammar/description.go
deleted file mode 100644
index 0d2a0b7..0000000
--- a/src/urubu/spec/grammar/description.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package grammar
-
-type Terminal struct {
- Number int `json:"number"`
- Name string `json:"name"`
- Pattern string `json:"pattern"`
- Precedence int `json:"prec"`
- Associativity string `json:"assoc"`
-}
-
-type NonTerminal struct {
- Number int `json:"number"`
- Name string `json:"name"`
-}
-
-type Production struct {
- Number int `json:"number"`
- LHS int `json:"lhs"`
- RHS []int `json:"rhs"`
- Precedence int `json:"prec"`
- Associativity string `json:"assoc"`
-}
-
-type Item struct {
- Production int `json:"production"`
- Dot int `json:"dot"`
-}
-
-type Transition struct {
- Symbol int `json:"symbol"`
- State int `json:"state"`
-}
-
-type Reduce struct {
- LookAhead []int `json:"look_ahead"`
- Production int `json:"production"`
-}
-
-type SRConflict struct {
- Symbol int `json:"symbol"`
- State int `json:"state"`
- Production int `json:"production"`
- AdoptedState *int `json:"adopted_state"`
- AdoptedProduction *int `json:"adopted_production"`
- ResolvedBy int `json:"resolved_by"`
-}
-
-type RRConflict struct {
- Symbol int `json:"symbol"`
- Production1 int `json:"production_1"`
- Production2 int `json:"production_2"`
- AdoptedProduction int `json:"adopted_production"`
- ResolvedBy int `json:"resolved_by"`
-}
-
-type State struct {
- Number int `json:"number"`
- Kernel []*Item `json:"kernel"`
- Shift []*Transition `json:"shift"`
- Reduce []*Reduce `json:"reduce"`
- GoTo []*Transition `json:"goto"`
- SRConflict []*SRConflict `json:"sr_conflict"`
- RRConflict []*RRConflict `json:"rr_conflict"`
-}
-
-type Report struct {
- Terminals []*Terminal `json:"terminals"`
- NonTerminals []*NonTerminal `json:"non_terminals"`
- Productions []*Production `json:"productions"`
- States []*State `json:"states"`
-}
diff --git a/src/urubu/spec/grammar/parser/lexspec.json b/src/urubu/spec/grammar/lexspec.json
index caf1f0e..caf1f0e 100644
--- a/src/urubu/spec/grammar/parser/lexspec.json
+++ b/src/urubu/spec/grammar/lexspec.json
diff --git a/src/urubu/spec/grammar/parser/vartan_lexer.go b/src/urubu/spec/grammar/parser.go
index 76ddfde..0e5a16b 100644
--- a/src/urubu/spec/grammar/parser/vartan_lexer.go
+++ b/src/urubu/spec/grammar/parser.go
@@ -1,11 +1,920 @@
-// Code generated by maleeni-go. DO NOT EDIT.
+//go:generate maleeni compile lexspec.json -o clexspec.json
+//go:generate maleeni-go clexspec.json --package parser
+
package parser
import (
+ _ "embed"
"fmt"
"io"
"io/ioutil"
+ "regexp"
+ "strings"
+
+ verr "urubu/error"
+ spec "urubu/spec/grammar"
+)
+
+type tokenKind string
+
+const (
+ tokenKindKWFragment = tokenKind("fragment")
+ tokenKindID = tokenKind("id")
+ tokenKindTerminalPattern = tokenKind("terminal pattern")
+ tokenKindStringLiteral = tokenKind("string")
+ tokenKindColon = tokenKind(":")
+ tokenKindOr = tokenKind("|")
+ tokenKindSemicolon = tokenKind(";")
+ tokenKindLabelMarker = tokenKind("@")
+ tokenKindDirectiveMarker = tokenKind("#")
+ tokenKindExpantion = tokenKind("...")
+ tokenKindOrderedSymbolMarker = tokenKind("$")
+ tokenKindLParen = tokenKind("(")
+ tokenKindRParen = tokenKind(")")
+ tokenKindNewline = tokenKind("newline")
+ tokenKindEOF = tokenKind("eof")
+ tokenKindInvalid = tokenKind("invalid")
+)
+
+var (
+ reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`)
+ reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`)
+)
+
+type Position struct {
+ Row int
+ Col int
+}
+
+func newPosition(row, col int) Position {
+ return Position{
+ Row: row,
+ Col: col,
+ }
+}
+
+type token struct {
+ kind tokenKind
+ text string
+ pos Position
+}
+
+func newSymbolToken(kind tokenKind, pos Position) *token {
+ return &token{
+ kind: kind,
+ pos: pos,
+ }
+}
+
+func newIDToken(text string, pos Position) *token {
+ return &token{
+ kind: tokenKindID,
+ text: text,
+ pos: pos,
+ }
+}
+
+func newTerminalPatternToken(text string, pos Position) *token {
+ return &token{
+ kind: tokenKindTerminalPattern,
+ text: text,
+ pos: pos,
+ }
+}
+
+func newStringLiteralToken(text string, pos Position) *token {
+ return &token{
+ kind: tokenKindStringLiteral,
+ text: text,
+ pos: pos,
+ }
+}
+
+func newEOFToken() *token {
+ return &token{
+ kind: tokenKindEOF,
+ }
+}
+
+func newInvalidToken(text string, pos Position) *token {
+ return &token{
+ kind: tokenKindInvalid,
+ text: text,
+ pos: pos,
+ }
+}
+
+type lexer struct {
+ d *Lexer
+ buf *token
+}
+
+func newLexer(src io.Reader) (*lexer, error) {
+ d, err := NewLexer(NewLexSpec(), src)
+ if err != nil {
+ return nil, err
+ }
+ return &lexer{
+ d: d,
+ }, nil
+}
+
+func (l *lexer) next() (*token, error) {
+ if l.buf != nil {
+ tok := l.buf
+ l.buf = nil
+ return tok, nil
+ }
+
+ var newline *token
+ for {
+ tok, err := l.lexAndSkipWSs()
+ if err != nil {
+ return nil, err
+ }
+ if tok.kind == tokenKindNewline {
+ newline = tok
+ continue
+ }
+
+ if newline != nil {
+ l.buf = tok
+ return newline, nil
+ }
+ return tok, nil
+ }
+}
+
+func (l *lexer) lexAndSkipWSs() (*token, error) {
+ var tok *Token
+ for {
+ var err error
+ tok, err = l.d.Next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.Invalid {
+ return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
+ }
+ if tok.EOF {
+ return newEOFToken(), nil
+ }
+ switch tok.KindID {
+ case KindIDWhiteSpace:
+ continue
+ case KindIDLineComment:
+ continue
+ }
+
+ break
+ }
+
+ switch tok.KindID {
+ case KindIDNewline:
+ return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDKwFragment:
+ return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDIdentifier:
+ if !reIDChar.Match(tok.Lexeme) {
+ return nil, &verr.SpecError{
+ Cause: synErrIDInvalidChar,
+ Detail: string(tok.Lexeme),
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") {
+ return nil, &verr.SpecError{
+ Cause: synErrIDInvalidUnderscorePos,
+ Detail: string(tok.Lexeme),
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ if strings.Contains(string(tok.Lexeme), "__") {
+ return nil, &verr.SpecError{
+ Cause: synErrIDConsecutiveUnderscores,
+ Detail: string(tok.Lexeme),
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ if reIDInvalidDigitsPos.Match(tok.Lexeme) {
+ return nil, &verr.SpecError{
+ Cause: synErrIDInvalidDigitsPos,
+ Detail: string(tok.Lexeme),
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDTerminalOpen:
+ var b strings.Builder
+ for {
+ tok, err := l.d.Next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.EOF {
+ return nil, &verr.SpecError{
+ Cause: synErrUnclosedTerminal,
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ switch tok.KindID {
+ case KindIDPattern:
+ // The escape sequences in a pattern string are interpreted by the lexer, except for the \".
+ // We must interpret the \" before passing them to the lexer because they are delimiters for
+ // the pattern strings.
+ fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`))
+ case KindIDEscapeSymbol:
+ return nil, &verr.SpecError{
+ Cause: synErrIncompletedEscSeq,
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ case KindIDTerminalClose:
+ pat := b.String()
+ if pat == "" {
+ return nil, &verr.SpecError{
+ Cause: synErrEmptyPattern,
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil
+ }
+ }
+ case KindIDStringLiteralOpen:
+ var b strings.Builder
+ for {
+ tok, err := l.d.Next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.EOF {
+ return nil, &verr.SpecError{
+ Cause: synErrUnclosedString,
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ switch tok.KindID {
+ case KindIDCharSeq:
+ fmt.Fprint(&b, string(tok.Lexeme))
+ case KindIDStringLiteralClose:
+ str := b.String()
+ if str == "" {
+ return nil, &verr.SpecError{
+ Cause: synErrEmptyString,
+ Row: tok.Row + 1,
+ Col: tok.Col + 1,
+ }
+ }
+ return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil
+ }
+ }
+ case KindIDColon:
+ return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDOr:
+ return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDSemicolon:
+ return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDLabelMarker:
+ return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDDirectiveMarker:
+ return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDExpansion:
+ return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDOrderedSymbolMarker:
+ return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDLParen:
+ return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil
+ case KindIDRParen:
+ return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil
+ default:
+ return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
+ }
+}
+
+type RootNode struct {
+ Directives []*DirectiveNode
+ Productions []*ProductionNode
+ LexProductions []*ProductionNode
+ Fragments []*FragmentNode
+}
+
+type ProductionNode struct {
+ Directives []*DirectiveNode
+ LHS string
+ RHS []*AlternativeNode
+ Pos Position
+}
+
+func (n *ProductionNode) isLexical() bool {
+ if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" {
+ return true
+ }
+ return false
+}
+
+type AlternativeNode struct {
+ Elements []*ElementNode
+ Directives []*DirectiveNode
+ Pos Position
+}
+
+type ElementNode struct {
+ ID string
+ Pattern string
+ Label *LabelNode
+ Literally bool
+ Pos Position
+}
+
+type LabelNode struct {
+ Name string
+ Pos Position
+}
+
+type DirectiveNode struct {
+ Name string
+ Parameters []*ParameterNode
+ Pos Position
+}
+
+type ParameterNode struct {
+ ID string
+ Pattern string
+ String string
+ OrderedSymbol string
+ Group []*DirectiveNode
+ Expansion bool
+ Pos Position
+}
+
+type FragmentNode struct {
+ LHS string
+ RHS string
+ Pos Position
+}
+
+func raiseSyntaxError(row int, synErr *SyntaxError) {
+ panic(&verr.SpecError{
+ Cause: synErr,
+ Row: row,
+ })
+}
+
+func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) {
+ panic(&verr.SpecError{
+ Cause: synErr,
+ Detail: detail,
+ Row: row,
+ })
+}
+
+func Parse(src io.Reader) (*RootNode, error) {
+ p, err := newParser(src)
+ if err != nil {
+ return nil, err
+ }
+
+ return p.parse()
+}
+
+type parser struct {
+ lex *lexer
+ peekedTok *token
+ lastTok *token
+ errs verr.SpecErrors
+
+ // A token position that the parser read at last.
+ // It is used as additional information in error messages.
+ pos Position
+}
+
+func newParser(src io.Reader) (*parser, error) {
+ lex, err := newLexer(src)
+ if err != nil {
+ return nil, err
+ }
+ return &parser{
+ lex: lex,
+ }, nil
+}
+
+func (p *parser) parse() (root *RootNode, retErr error) {
+ root = p.parseRoot()
+ if len(p.errs) > 0 {
+ return nil, p.errs
+ }
+
+ return root, nil
+}
+
+func (p *parser) parseRoot() *RootNode {
+ defer func() {
+ err := recover()
+ if err != nil {
+ specErr, ok := err.(*verr.SpecError)
+ if !ok {
+ panic(fmt.Errorf("an unexpected error occurred: %v", err))
+ }
+ p.errs = append(p.errs, specErr)
+ }
+ }()
+
+ var dirs []*DirectiveNode
+ var prods []*ProductionNode
+ var lexProds []*ProductionNode
+ var fragments []*FragmentNode
+ for {
+ dir := p.parseTopLevelDirective()
+ if dir != nil {
+ dirs = append(dirs, dir)
+ continue
+ }
+
+ fragment := p.parseFragment()
+ if fragment != nil {
+ fragments = append(fragments, fragment)
+ continue
+ }
+
+ prod := p.parseProduction()
+ if prod != nil {
+ if prod.isLexical() {
+ lexProds = append(lexProds, prod)
+ } else {
+ prods = append(prods, prod)
+ }
+ continue
+ }
+
+ if p.consume(tokenKindEOF) {
+ break
+ }
+ }
+
+ return &RootNode{
+ Directives: dirs,
+ Productions: prods,
+ LexProductions: lexProds,
+ Fragments: fragments,
+ }
+}
+
+func (p *parser) parseTopLevelDirective() *DirectiveNode {
+ defer func() {
+ err := recover()
+ if err == nil {
+ return
+ }
+
+ specErr, ok := err.(*verr.SpecError)
+ if !ok {
+ panic(err)
+ }
+
+ p.errs = append(p.errs, specErr)
+ p.skipOverTo(tokenKindSemicolon)
+ }()
+
+ dir := p.parseDirective()
+ if dir == nil {
+ return nil
+ }
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindSemicolon) {
+ raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon)
+ }
+
+ return dir
+}
+
+func (p *parser) parseFragment() *FragmentNode {
+ defer func() {
+ err := recover()
+ if err == nil {
+ return
+ }
+
+ specErr, ok := err.(*verr.SpecError)
+ if !ok {
+ panic(err)
+ }
+
+ p.errs = append(p.errs, specErr)
+ p.skipOverTo(tokenKindSemicolon)
+ }()
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindKWFragment) {
+ return nil
+ }
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindID) {
+ raiseSyntaxError(p.pos.Row, synErrNoProductionName)
+ }
+ lhs := p.lastTok.text
+ lhsPos := p.lastTok.pos
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindColon) {
+ raiseSyntaxError(p.pos.Row, synErrNoColon)
+ }
+
+ var rhs string
+ switch {
+ case p.consume(tokenKindTerminalPattern):
+ rhs = p.lastTok.text
+ case p.consume(tokenKindStringLiteral):
+ rhs = spec.EscapePattern(p.lastTok.text)
+ default:
+ raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern)
+ }
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindSemicolon) {
+ raiseSyntaxError(p.pos.Row, synErrNoSemicolon)
+ }
+
+ if !p.consume(tokenKindNewline) {
+ if !p.consume(tokenKindEOF) {
+ raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline)
+ }
+ }
+
+ return &FragmentNode{
+ LHS: lhs,
+ RHS: rhs,
+ Pos: lhsPos,
+ }
+}
+
+func (p *parser) parseProduction() *ProductionNode {
+ defer func() {
+ err := recover()
+ if err == nil {
+ return
+ }
+
+ specErr, ok := err.(*verr.SpecError)
+ if !ok {
+ panic(err)
+ }
+
+ p.errs = append(p.errs, specErr)
+ p.skipOverTo(tokenKindSemicolon)
+ }()
+
+ p.consume(tokenKindNewline)
+
+ if p.consume(tokenKindEOF) {
+ return nil
+ }
+
+ if !p.consume(tokenKindID) {
+ raiseSyntaxError(p.pos.Row, synErrNoProductionName)
+ }
+ lhs := p.lastTok.text
+ lhsPos := p.lastTok.pos
+
+ var dirs []*DirectiveNode
+ for {
+ dir := p.parseDirective()
+ if dir == nil {
+ break
+ }
+ dirs = append(dirs, dir)
+ }
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindColon) {
+ raiseSyntaxError(p.pos.Row, synErrNoColon)
+ }
+
+ alt := p.parseAlternative()
+ rhs := []*AlternativeNode{alt}
+ for {
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindOr) {
+ break
+ }
+ alt := p.parseAlternative()
+ rhs = append(rhs, alt)
+ }
+
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindSemicolon) {
+ raiseSyntaxError(p.pos.Row, synErrNoSemicolon)
+ }
+
+ if !p.consume(tokenKindNewline) {
+ if !p.consume(tokenKindEOF) {
+ raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline)
+ }
+ }
+
+ prod := &ProductionNode{
+ Directives: dirs,
+ LHS: lhs,
+ RHS: rhs,
+ Pos: lhsPos,
+ }
+
+ // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs.
+ // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate
+ // name to the pattern. Therefore, this code prohibits alternatives from containing patterns.
+ if !prod.isLexical() {
+ for _, alt := range prod.RHS {
+ for _, elem := range alt.Elements {
+ if elem.Pattern != "" {
+ raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt)
+ }
+ }
+ }
+ }
+
+ return prod
+}
+
+func (p *parser) parseAlternative() *AlternativeNode {
+ elems := []*ElementNode{}
+ for {
+ elem := p.parseElement()
+ if elem == nil {
+ break
+ }
+ elems = append(elems, elem)
+ }
+
+ // When a length of an alternative is zero, we cannot set a position.
+ var firstElemPos Position
+ if len(elems) > 0 {
+ firstElemPos = elems[0].Pos
+ }
+
+ var dirs []*DirectiveNode
+ for {
+ dir := p.parseDirective()
+ if dir == nil {
+ break
+ }
+ dirs = append(dirs, dir)
+ }
+
+ return &AlternativeNode{
+ Elements: elems,
+ Directives: dirs,
+ Pos: firstElemPos,
+ }
+}
+
+func (p *parser) parseElement() *ElementNode {
+ var elem *ElementNode
+ switch {
+ case p.consume(tokenKindID):
+ elem = &ElementNode{
+ ID: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindTerminalPattern):
+ elem = &ElementNode{
+ Pattern: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindStringLiteral):
+ elem = &ElementNode{
+ Pattern: p.lastTok.text,
+ Literally: true,
+ Pos: p.lastTok.pos,
+ }
+ default:
+ if p.consume(tokenKindLabelMarker) {
+ raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol)
+ }
+ return nil
+ }
+ if p.consume(tokenKindLabelMarker) {
+ if !p.consume(tokenKindID) {
+ raiseSyntaxError(p.pos.Row, synErrNoLabel)
+ }
+ elem.Label = &LabelNode{
+ Name: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ }
+ return elem
+}
+
+func (p *parser) parseDirective() *DirectiveNode {
+ p.consume(tokenKindNewline)
+
+ if !p.consume(tokenKindDirectiveMarker) {
+ return nil
+ }
+ dirPos := p.lastTok.pos
+
+ if !p.consume(tokenKindID) {
+ raiseSyntaxError(p.pos.Row, synErrNoDirectiveName)
+ }
+ name := p.lastTok.text
+
+ var params []*ParameterNode
+ for {
+ param := p.parseParameter()
+ if param == nil {
+ break
+ }
+ params = append(params, param)
+ }
+
+ return &DirectiveNode{
+ Name: name,
+ Parameters: params,
+ Pos: dirPos,
+ }
+}
+
+func (p *parser) parseParameter() *ParameterNode {
+ var param *ParameterNode
+ switch {
+ case p.consume(tokenKindID):
+ param = &ParameterNode{
+ ID: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindTerminalPattern):
+ param = &ParameterNode{
+ Pattern: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindStringLiteral):
+ param = &ParameterNode{
+ String: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindOrderedSymbolMarker):
+ if !p.consume(tokenKindID) {
+ raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName)
+ }
+ param = &ParameterNode{
+ OrderedSymbol: p.lastTok.text,
+ Pos: p.lastTok.pos,
+ }
+ case p.consume(tokenKindLParen):
+ pos := p.lastTok.pos
+ var g []*DirectiveNode
+ for {
+ dir := p.parseDirective()
+ if dir == nil {
+ break
+ }
+ g = append(g, dir)
+ }
+ if !p.consume(tokenKindRParen) {
+ raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup)
+ }
+ if len(g) == 0 {
+ // Set an empty slice representing an empty directive group to distinguish between the following two cases.
+ //
+ // - #prec (); // vartan allows this case.
+ // - #prec; // This case will raise an error.
+ g = []*DirectiveNode{}
+ }
+ param = &ParameterNode{
+ Group: g,
+ Pos: pos,
+ }
+ }
+ if p.consume(tokenKindExpantion) {
+ switch {
+ case param == nil:
+ raiseSyntaxError(p.pos.Row, synErrStrayExpOp)
+ case param.ID == "":
+ raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand)
+ }
+ param.Expansion = true
+ }
+ return param
+}
+
+func (p *parser) consume(expected tokenKind) bool {
+ var tok *token
+ var err error
+ if p.peekedTok != nil {
+ tok = p.peekedTok
+ p.peekedTok = nil
+ } else {
+ tok, err = p.lex.next()
+ if err != nil {
+ panic(err)
+ }
+ }
+ p.pos = tok.pos
+ if tok.kind == tokenKindInvalid {
+ raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text)
+ }
+ if tok.kind == expected {
+ p.lastTok = tok
+ return true
+ }
+ p.peekedTok = tok
+
+ return false
+}
+
+func (p *parser) skip() {
+ var tok *token
+ var err error
+ for {
+ if p.peekedTok != nil {
+ tok = p.peekedTok
+ p.peekedTok = nil
+ } else {
+ tok, err = p.lex.next()
+ if err != nil {
+ p.errs = append(p.errs, &verr.SpecError{
+ Cause: err,
+ Row: p.pos.Row,
+ })
+ continue
+ }
+ }
+
+ break
+ }
+
+ p.lastTok = tok
+ p.pos = tok.pos
+}
+
+func (p *parser) skipOverTo(kind tokenKind) {
+ for {
+ if p.consume(kind) || p.consume(tokenKindEOF) {
+ return
+ }
+ p.skip()
+ }
+}
+
+type SyntaxError struct {
+ message string
+}
+
+func newSyntaxError(message string) *SyntaxError {
+ return &SyntaxError{
+ message: message,
+ }
+}
+
+func (e *SyntaxError) Error() string {
+ return e.message
+}
+
+var (
+ // lexical errors
+ synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore")
+ synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier")
+ synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively")
+ synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier")
+ synErrUnclosedTerminal = newSyntaxError("unclosed terminal")
+ synErrUnclosedString = newSyntaxError("unclosed string")
+ synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash")
+ synErrEmptyPattern = newSyntaxError("a pattern must include at least one character")
+ synErrEmptyString = newSyntaxError("a string must include at least one character")
+
+ // syntax errors
+ synErrInvalidToken = newSyntaxError("invalid token")
+ synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;")
+ synErrNoProductionName = newSyntaxError("a production name is missing")
+ synErrNoColon = newSyntaxError("the colon must precede alternatives")
+ synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative")
+ synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol")
+ synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after the label marker @")
+ synErrNoDirectiveName = newSyntaxError("a directive needs a name")
+ synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing")
+ synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )")
+ synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal")
+ synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier")
+ synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier")
+ synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline")
+ synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element")
)
+// Code generated by maleeni-go. DO NOT EDIT.
type ModeID int
diff --git a/src/urubu/spec/grammar/parser/lexer.go b/src/urubu/spec/grammar/parser/lexer.go
deleted file mode 100644
index bd8a24f..0000000
--- a/src/urubu/spec/grammar/parser/lexer.go
+++ /dev/null
@@ -1,297 +0,0 @@
-//go:generate maleeni compile lexspec.json -o clexspec.json
-//go:generate maleeni-go clexspec.json --package parser
-
-package parser
-
-import (
- _ "embed"
- "fmt"
- "io"
- "regexp"
- "strings"
-
- verr "urubu/error"
-)
-
-type tokenKind string
-
-const (
- tokenKindKWFragment = tokenKind("fragment")
- tokenKindID = tokenKind("id")
- tokenKindTerminalPattern = tokenKind("terminal pattern")
- tokenKindStringLiteral = tokenKind("string")
- tokenKindColon = tokenKind(":")
- tokenKindOr = tokenKind("|")
- tokenKindSemicolon = tokenKind(";")
- tokenKindLabelMarker = tokenKind("@")
- tokenKindDirectiveMarker = tokenKind("#")
- tokenKindExpantion = tokenKind("...")
- tokenKindOrderedSymbolMarker = tokenKind("$")
- tokenKindLParen = tokenKind("(")
- tokenKindRParen = tokenKind(")")
- tokenKindNewline = tokenKind("newline")
- tokenKindEOF = tokenKind("eof")
- tokenKindInvalid = tokenKind("invalid")
-)
-
-var (
- reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`)
- reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`)
-)
-
-type Position struct {
- Row int
- Col int
-}
-
-func newPosition(row, col int) Position {
- return Position{
- Row: row,
- Col: col,
- }
-}
-
-type token struct {
- kind tokenKind
- text string
- pos Position
-}
-
-func newSymbolToken(kind tokenKind, pos Position) *token {
- return &token{
- kind: kind,
- pos: pos,
- }
-}
-
-func newIDToken(text string, pos Position) *token {
- return &token{
- kind: tokenKindID,
- text: text,
- pos: pos,
- }
-}
-
-func newTerminalPatternToken(text string, pos Position) *token {
- return &token{
- kind: tokenKindTerminalPattern,
- text: text,
- pos: pos,
- }
-}
-
-func newStringLiteralToken(text string, pos Position) *token {
- return &token{
- kind: tokenKindStringLiteral,
- text: text,
- pos: pos,
- }
-}
-
-func newEOFToken() *token {
- return &token{
- kind: tokenKindEOF,
- }
-}
-
-func newInvalidToken(text string, pos Position) *token {
- return &token{
- kind: tokenKindInvalid,
- text: text,
- pos: pos,
- }
-}
-
-type lexer struct {
- d *Lexer
- buf *token
-}
-
-func newLexer(src io.Reader) (*lexer, error) {
- d, err := NewLexer(NewLexSpec(), src)
- if err != nil {
- return nil, err
- }
- return &lexer{
- d: d,
- }, nil
-}
-
-func (l *lexer) next() (*token, error) {
- if l.buf != nil {
- tok := l.buf
- l.buf = nil
- return tok, nil
- }
-
- var newline *token
- for {
- tok, err := l.lexAndSkipWSs()
- if err != nil {
- return nil, err
- }
- if tok.kind == tokenKindNewline {
- newline = tok
- continue
- }
-
- if newline != nil {
- l.buf = tok
- return newline, nil
- }
- return tok, nil
- }
-}
-
-func (l *lexer) lexAndSkipWSs() (*token, error) {
- var tok *Token
- for {
- var err error
- tok, err = l.d.Next()
- if err != nil {
- return nil, err
- }
- if tok.Invalid {
- return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
- }
- if tok.EOF {
- return newEOFToken(), nil
- }
- switch tok.KindID {
- case KindIDWhiteSpace:
- continue
- case KindIDLineComment:
- continue
- }
-
- break
- }
-
- switch tok.KindID {
- case KindIDNewline:
- return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDKwFragment:
- return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDIdentifier:
- if !reIDChar.Match(tok.Lexeme) {
- return nil, &verr.SpecError{
- Cause: synErrIDInvalidChar,
- Detail: string(tok.Lexeme),
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") {
- return nil, &verr.SpecError{
- Cause: synErrIDInvalidUnderscorePos,
- Detail: string(tok.Lexeme),
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- if strings.Contains(string(tok.Lexeme), "__") {
- return nil, &verr.SpecError{
- Cause: synErrIDConsecutiveUnderscores,
- Detail: string(tok.Lexeme),
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- if reIDInvalidDigitsPos.Match(tok.Lexeme) {
- return nil, &verr.SpecError{
- Cause: synErrIDInvalidDigitsPos,
- Detail: string(tok.Lexeme),
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDTerminalOpen:
- var b strings.Builder
- for {
- tok, err := l.d.Next()
- if err != nil {
- return nil, err
- }
- if tok.EOF {
- return nil, &verr.SpecError{
- Cause: synErrUnclosedTerminal,
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- switch tok.KindID {
- case KindIDPattern:
- // The escape sequences in a pattern string are interpreted by the lexer, except for the \".
- // We must interpret the \" before passing them to the lexer because they are delimiters for
- // the pattern strings.
- fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`))
- case KindIDEscapeSymbol:
- return nil, &verr.SpecError{
- Cause: synErrIncompletedEscSeq,
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- case KindIDTerminalClose:
- pat := b.String()
- if pat == "" {
- return nil, &verr.SpecError{
- Cause: synErrEmptyPattern,
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil
- }
- }
- case KindIDStringLiteralOpen:
- var b strings.Builder
- for {
- tok, err := l.d.Next()
- if err != nil {
- return nil, err
- }
- if tok.EOF {
- return nil, &verr.SpecError{
- Cause: synErrUnclosedString,
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- switch tok.KindID {
- case KindIDCharSeq:
- fmt.Fprint(&b, string(tok.Lexeme))
- case KindIDStringLiteralClose:
- str := b.String()
- if str == "" {
- return nil, &verr.SpecError{
- Cause: synErrEmptyString,
- Row: tok.Row + 1,
- Col: tok.Col + 1,
- }
- }
- return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil
- }
- }
- case KindIDColon:
- return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDOr:
- return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDSemicolon:
- return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDLabelMarker:
- return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDDirectiveMarker:
- return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDExpansion:
- return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDOrderedSymbolMarker:
- return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDLParen:
- return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil
- case KindIDRParen:
- return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil
- default:
- return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
- }
-}
diff --git a/src/urubu/spec/grammar/parser/parser.go b/src/urubu/spec/grammar/parser/parser.go
deleted file mode 100644
index b604074..0000000
--- a/src/urubu/spec/grammar/parser/parser.go
+++ /dev/null
@@ -1,582 +0,0 @@
-package parser
-
-import (
- "fmt"
- "io"
-
- verr "urubu/error"
- spec "urubu/spec/grammar"
-)
-
-type RootNode struct {
- Directives []*DirectiveNode
- Productions []*ProductionNode
- LexProductions []*ProductionNode
- Fragments []*FragmentNode
-}
-
-type ProductionNode struct {
- Directives []*DirectiveNode
- LHS string
- RHS []*AlternativeNode
- Pos Position
-}
-
-func (n *ProductionNode) isLexical() bool {
- if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" {
- return true
- }
- return false
-}
-
-type AlternativeNode struct {
- Elements []*ElementNode
- Directives []*DirectiveNode
- Pos Position
-}
-
-type ElementNode struct {
- ID string
- Pattern string
- Label *LabelNode
- Literally bool
- Pos Position
-}
-
-type LabelNode struct {
- Name string
- Pos Position
-}
-
-type DirectiveNode struct {
- Name string
- Parameters []*ParameterNode
- Pos Position
-}
-
-type ParameterNode struct {
- ID string
- Pattern string
- String string
- OrderedSymbol string
- Group []*DirectiveNode
- Expansion bool
- Pos Position
-}
-
-type FragmentNode struct {
- LHS string
- RHS string
- Pos Position
-}
-
-func raiseSyntaxError(row int, synErr *SyntaxError) {
- panic(&verr.SpecError{
- Cause: synErr,
- Row: row,
- })
-}
-
-func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) {
- panic(&verr.SpecError{
- Cause: synErr,
- Detail: detail,
- Row: row,
- })
-}
-
-func Parse(src io.Reader) (*RootNode, error) {
- p, err := newParser(src)
- if err != nil {
- return nil, err
- }
-
- return p.parse()
-}
-
-type parser struct {
- lex *lexer
- peekedTok *token
- lastTok *token
- errs verr.SpecErrors
-
- // A token position that the parser read at last.
- // It is used as additional information in error messages.
- pos Position
-}
-
-func newParser(src io.Reader) (*parser, error) {
- lex, err := newLexer(src)
- if err != nil {
- return nil, err
- }
- return &parser{
- lex: lex,
- }, nil
-}
-
-func (p *parser) parse() (root *RootNode, retErr error) {
- root = p.parseRoot()
- if len(p.errs) > 0 {
- return nil, p.errs
- }
-
- return root, nil
-}
-
-func (p *parser) parseRoot() *RootNode {
- defer func() {
- err := recover()
- if err != nil {
- specErr, ok := err.(*verr.SpecError)
- if !ok {
- panic(fmt.Errorf("an unexpected error occurred: %v", err))
- }
- p.errs = append(p.errs, specErr)
- }
- }()
-
- var dirs []*DirectiveNode
- var prods []*ProductionNode
- var lexProds []*ProductionNode
- var fragments []*FragmentNode
- for {
- dir := p.parseTopLevelDirective()
- if dir != nil {
- dirs = append(dirs, dir)
- continue
- }
-
- fragment := p.parseFragment()
- if fragment != nil {
- fragments = append(fragments, fragment)
- continue
- }
-
- prod := p.parseProduction()
- if prod != nil {
- if prod.isLexical() {
- lexProds = append(lexProds, prod)
- } else {
- prods = append(prods, prod)
- }
- continue
- }
-
- if p.consume(tokenKindEOF) {
- break
- }
- }
-
- return &RootNode{
- Directives: dirs,
- Productions: prods,
- LexProductions: lexProds,
- Fragments: fragments,
- }
-}
-
-func (p *parser) parseTopLevelDirective() *DirectiveNode {
- defer func() {
- err := recover()
- if err == nil {
- return
- }
-
- specErr, ok := err.(*verr.SpecError)
- if !ok {
- panic(err)
- }
-
- p.errs = append(p.errs, specErr)
- p.skipOverTo(tokenKindSemicolon)
- }()
-
- dir := p.parseDirective()
- if dir == nil {
- return nil
- }
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindSemicolon) {
- raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon)
- }
-
- return dir
-}
-
-func (p *parser) parseFragment() *FragmentNode {
- defer func() {
- err := recover()
- if err == nil {
- return
- }
-
- specErr, ok := err.(*verr.SpecError)
- if !ok {
- panic(err)
- }
-
- p.errs = append(p.errs, specErr)
- p.skipOverTo(tokenKindSemicolon)
- }()
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindKWFragment) {
- return nil
- }
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindID) {
- raiseSyntaxError(p.pos.Row, synErrNoProductionName)
- }
- lhs := p.lastTok.text
- lhsPos := p.lastTok.pos
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindColon) {
- raiseSyntaxError(p.pos.Row, synErrNoColon)
- }
-
- var rhs string
- switch {
- case p.consume(tokenKindTerminalPattern):
- rhs = p.lastTok.text
- case p.consume(tokenKindStringLiteral):
- rhs = spec.EscapePattern(p.lastTok.text)
- default:
- raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern)
- }
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindSemicolon) {
- raiseSyntaxError(p.pos.Row, synErrNoSemicolon)
- }
-
- if !p.consume(tokenKindNewline) {
- if !p.consume(tokenKindEOF) {
- raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline)
- }
- }
-
- return &FragmentNode{
- LHS: lhs,
- RHS: rhs,
- Pos: lhsPos,
- }
-}
-
-func (p *parser) parseProduction() *ProductionNode {
- defer func() {
- err := recover()
- if err == nil {
- return
- }
-
- specErr, ok := err.(*verr.SpecError)
- if !ok {
- panic(err)
- }
-
- p.errs = append(p.errs, specErr)
- p.skipOverTo(tokenKindSemicolon)
- }()
-
- p.consume(tokenKindNewline)
-
- if p.consume(tokenKindEOF) {
- return nil
- }
-
- if !p.consume(tokenKindID) {
- raiseSyntaxError(p.pos.Row, synErrNoProductionName)
- }
- lhs := p.lastTok.text
- lhsPos := p.lastTok.pos
-
- var dirs []*DirectiveNode
- for {
- dir := p.parseDirective()
- if dir == nil {
- break
- }
- dirs = append(dirs, dir)
- }
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindColon) {
- raiseSyntaxError(p.pos.Row, synErrNoColon)
- }
-
- alt := p.parseAlternative()
- rhs := []*AlternativeNode{alt}
- for {
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindOr) {
- break
- }
- alt := p.parseAlternative()
- rhs = append(rhs, alt)
- }
-
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindSemicolon) {
- raiseSyntaxError(p.pos.Row, synErrNoSemicolon)
- }
-
- if !p.consume(tokenKindNewline) {
- if !p.consume(tokenKindEOF) {
- raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline)
- }
- }
-
- prod := &ProductionNode{
- Directives: dirs,
- LHS: lhs,
- RHS: rhs,
- Pos: lhsPos,
- }
-
- // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs.
- // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate
- // name to the pattern. Therefore, this code prohibits alternatives from containing patterns.
- if !prod.isLexical() {
- for _, alt := range prod.RHS {
- for _, elem := range alt.Elements {
- if elem.Pattern != "" {
- raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt)
- }
- }
- }
- }
-
- return prod
-}
-
-func (p *parser) parseAlternative() *AlternativeNode {
- elems := []*ElementNode{}
- for {
- elem := p.parseElement()
- if elem == nil {
- break
- }
- elems = append(elems, elem)
- }
-
- // When a length of an alternative is zero, we cannot set a position.
- var firstElemPos Position
- if len(elems) > 0 {
- firstElemPos = elems[0].Pos
- }
-
- var dirs []*DirectiveNode
- for {
- dir := p.parseDirective()
- if dir == nil {
- break
- }
- dirs = append(dirs, dir)
- }
-
- return &AlternativeNode{
- Elements: elems,
- Directives: dirs,
- Pos: firstElemPos,
- }
-}
-
-func (p *parser) parseElement() *ElementNode {
- var elem *ElementNode
- switch {
- case p.consume(tokenKindID):
- elem = &ElementNode{
- ID: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindTerminalPattern):
- elem = &ElementNode{
- Pattern: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindStringLiteral):
- elem = &ElementNode{
- Pattern: p.lastTok.text,
- Literally: true,
- Pos: p.lastTok.pos,
- }
- default:
- if p.consume(tokenKindLabelMarker) {
- raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol)
- }
- return nil
- }
- if p.consume(tokenKindLabelMarker) {
- if !p.consume(tokenKindID) {
- raiseSyntaxError(p.pos.Row, synErrNoLabel)
- }
- elem.Label = &LabelNode{
- Name: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- }
- return elem
-}
-
-func (p *parser) parseDirective() *DirectiveNode {
- p.consume(tokenKindNewline)
-
- if !p.consume(tokenKindDirectiveMarker) {
- return nil
- }
- dirPos := p.lastTok.pos
-
- if !p.consume(tokenKindID) {
- raiseSyntaxError(p.pos.Row, synErrNoDirectiveName)
- }
- name := p.lastTok.text
-
- var params []*ParameterNode
- for {
- param := p.parseParameter()
- if param == nil {
- break
- }
- params = append(params, param)
- }
-
- return &DirectiveNode{
- Name: name,
- Parameters: params,
- Pos: dirPos,
- }
-}
-
-func (p *parser) parseParameter() *ParameterNode {
- var param *ParameterNode
- switch {
- case p.consume(tokenKindID):
- param = &ParameterNode{
- ID: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindTerminalPattern):
- param = &ParameterNode{
- Pattern: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindStringLiteral):
- param = &ParameterNode{
- String: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindOrderedSymbolMarker):
- if !p.consume(tokenKindID) {
- raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName)
- }
- param = &ParameterNode{
- OrderedSymbol: p.lastTok.text,
- Pos: p.lastTok.pos,
- }
- case p.consume(tokenKindLParen):
- pos := p.lastTok.pos
- var g []*DirectiveNode
- for {
- dir := p.parseDirective()
- if dir == nil {
- break
- }
- g = append(g, dir)
- }
- if !p.consume(tokenKindRParen) {
- raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup)
- }
- if len(g) == 0 {
- // Set an empty slice representing an empty directive group to distinguish between the following two cases.
- //
- // - #prec (); // vartan allows this case.
- // - #prec; // This case will raise an error.
- g = []*DirectiveNode{}
- }
- param = &ParameterNode{
- Group: g,
- Pos: pos,
- }
- }
- if p.consume(tokenKindExpantion) {
- switch {
- case param == nil:
- raiseSyntaxError(p.pos.Row, synErrStrayExpOp)
- case param.ID == "":
- raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand)
- }
- param.Expansion = true
- }
- return param
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- panic(err)
- }
- }
- p.pos = tok.pos
- if tok.kind == tokenKindInvalid {
- raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text)
- }
- if tok.kind == expected {
- p.lastTok = tok
- return true
- }
- p.peekedTok = tok
-
- return false
-}
-
-func (p *parser) skip() {
- var tok *token
- var err error
- for {
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- p.errs = append(p.errs, &verr.SpecError{
- Cause: err,
- Row: p.pos.Row,
- })
- continue
- }
- }
-
- break
- }
-
- p.lastTok = tok
- p.pos = tok.pos
-}
-
-func (p *parser) skipOverTo(kind tokenKind) {
- for {
- if p.consume(kind) || p.consume(tokenKindEOF) {
- return
- }
- p.skip()
- }
-}
diff --git a/src/urubu/spec/grammar/parser/syntax_error.go b/src/urubu/spec/grammar/parser/syntax_error.go
deleted file mode 100644
index 719fb94..0000000
--- a/src/urubu/spec/grammar/parser/syntax_error.go
+++ /dev/null
@@ -1,45 +0,0 @@
-package parser
-
-type SyntaxError struct {
- message string
-}
-
-func newSyntaxError(message string) *SyntaxError {
- return &SyntaxError{
- message: message,
- }
-}
-
-func (e *SyntaxError) Error() string {
- return e.message
-}
-
-var (
- // lexical errors
- synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore")
- synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier")
- synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively")
- synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier")
- synErrUnclosedTerminal = newSyntaxError("unclosed terminal")
- synErrUnclosedString = newSyntaxError("unclosed string")
- synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash")
- synErrEmptyPattern = newSyntaxError("a pattern must include at least one character")
- synErrEmptyString = newSyntaxError("a string must include at least one character")
-
- // syntax errors
- synErrInvalidToken = newSyntaxError("invalid token")
- synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;")
- synErrNoProductionName = newSyntaxError("a production name is missing")
- synErrNoColon = newSyntaxError("the colon must precede alternatives")
- synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative")
- synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol")
- synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after the label marker @")
- synErrNoDirectiveName = newSyntaxError("a directive needs a name")
- synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing")
- synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )")
- synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal")
- synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier")
- synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier")
- synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline")
- synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element")
-)
diff --git a/src/urubu/spec/grammar/util.go b/src/urubu/spec/grammar/util.go
deleted file mode 100644
index bf3f233..0000000
--- a/src/urubu/spec/grammar/util.go
+++ /dev/null
@@ -1,21 +0,0 @@
-package grammar
-
-import "strings"
-
-var rep = strings.NewReplacer(
- `.`, `\.`,
- `*`, `\*`,
- `+`, `\+`,
- `?`, `\?`,
- `|`, `\|`,
- `(`, `\(`,
- `)`, `\)`,
- `[`, `\[`,
- `\`, `\\`,
-)
-
-// EscapePattern escapes the special characters.
-// For example, EscapePattern(`+`) returns `\+`.
-func EscapePattern(s string) string {
- return rep.Replace(s)
-}
diff --git a/src/urubu/spec/test/tree_lexer.go b/src/urubu/spec/test.go
index 8bb1c87..4985e14 100644
--- a/src/urubu/spec/test/tree_lexer.go
+++ b/src/urubu/spec/test.go
@@ -1,11 +1,342 @@
-// Code generated by vartan-go. DO NOT EDIT.
+//go:generate vartan compile tree.vartan -o tree.json
+//go:generate vartan-go tree.json --package test
+
package test
import (
+ "bufio"
+ "bytes"
+ "encoding/json"
+ "errors"
"fmt"
"io"
+ "regexp"
+ "strconv"
+ "strings"
+ "unicode/utf8"
)
+type TreeDiff struct {
+ ExpectedPath string
+ ActualPath string
+ Message string
+}
+
+func newTreeDiff(expected, actual *Tree, message string) *TreeDiff {
+ return &TreeDiff{
+ ExpectedPath: expected.path(),
+ ActualPath: actual.path(),
+ Message: message,
+ }
+}
+
+type Tree struct {
+ Parent *Tree
+ Offset int
+ Kind string
+ Children []*Tree
+ Lexeme string
+}
+
+func NewNonTerminalTree(kind string, children ...*Tree) *Tree {
+ return &Tree{
+ Kind: kind,
+ Children: children,
+ }
+}
+
+func NewTerminalNode(kind string, lexeme string) *Tree {
+ return &Tree{
+ Kind: kind,
+ Lexeme: lexeme,
+ }
+}
+
+func (t *Tree) Fill() *Tree {
+ for i, c := range t.Children {
+ c.Parent = t
+ c.Offset = i
+ c.Fill()
+ }
+ return t
+}
+
+func (t *Tree) path() string {
+ if t.Parent == nil {
+ return t.Kind
+ }
+ return fmt.Sprintf("%v.[%v]%v", t.Parent.path(), t.Offset, t.Kind)
+}
+
+func (t *Tree) Format() []byte {
+ var b bytes.Buffer
+ t.format(&b, 0)
+ return b.Bytes()
+}
+
+func (t *Tree) format(buf *bytes.Buffer, depth int) {
+ for i := 0; i < depth; i++ {
+ buf.WriteString(" ")
+ }
+ buf.WriteString("(")
+ buf.WriteString(t.Kind)
+ if len(t.Children) > 0 {
+ buf.WriteString("\n")
+ for i, c := range t.Children {
+ c.format(buf, depth+1)
+ if i < len(t.Children)-1 {
+ buf.WriteString("\n")
+ }
+ }
+ }
+ buf.WriteString(")")
+}
+
+func DiffTree(expected, actual *Tree) []*TreeDiff {
+ if expected == nil && actual == nil {
+ return nil
+ }
+ if actual.Kind != expected.Kind {
+ msg := fmt.Sprintf("unexpected kind: expected '%v' but got '%v'", expected.Kind, actual.Kind)
+ return []*TreeDiff{
+ newTreeDiff(expected, actual, msg),
+ }
+ }
+ if expected.Lexeme != actual.Lexeme {
+ msg := fmt.Sprintf("unexpected lexeme: expected '%v' but got '%v'", expected.Lexeme, actual.Lexeme)
+ return []*TreeDiff{
+ newTreeDiff(expected, actual, msg),
+ }
+ }
+ if len(actual.Children) != len(expected.Children) {
+ msg := fmt.Sprintf("unexpected node count: expected %v but got %v", len(expected.Children), len(actual.Children))
+ return []*TreeDiff{
+ newTreeDiff(expected, actual, msg),
+ }
+ }
+ var diffs []*TreeDiff
+ for i, exp := range expected.Children {
+ if ds := DiffTree(exp, actual.Children[i]); len(ds) > 0 {
+ diffs = append(diffs, ds...)
+ }
+ }
+ return diffs
+}
+
+type TestCase struct {
+ Description string
+ Source []byte
+ Output *Tree
+}
+
+func ParseTestCase(r io.Reader) (*TestCase, error) {
+ parts, err := splitIntoParts(r)
+ if err != nil {
+ return nil, err
+ }
+ if len(parts) != 3 {
+ return nil, fmt.Errorf("too many or too few part delimiters: a test case consists of just tree parts: %v parts found", len(parts))
+ }
+
+ tp := &treeParser{
+ lineOffset: parts[0].lineCount + parts[1].lineCount + 2,
+ }
+ tree, err := tp.parseTree(bytes.NewReader(parts[2].buf))
+ if err != nil {
+ return nil, err
+ }
+
+ return &TestCase{
+ Description: string(parts[0].buf),
+ Source: parts[1].buf,
+ Output: tree,
+ }, nil
+}
+
+type testCasePart struct {
+ buf []byte
+ lineCount int
+}
+
+func splitIntoParts(r io.Reader) ([]*testCasePart, error) {
+ var bufs []*testCasePart
+ s := bufio.NewScanner(r)
+ for {
+ buf, lineCount, err := readPart(s)
+ if err != nil {
+ return nil, err
+ }
+ if buf == nil {
+ break
+ }
+ bufs = append(bufs, &testCasePart{
+ buf: buf,
+ lineCount: lineCount,
+ })
+ }
+ if err := s.Err(); err != nil {
+ return nil, err
+ }
+ return bufs, nil
+}
+
+var reDelim = regexp.MustCompile(`^\s*---+\s*$`)
+
+func readPart(s *bufio.Scanner) ([]byte, int, error) {
+ if !s.Scan() {
+ return nil, 0, s.Err()
+ }
+ buf := &bytes.Buffer{}
+ line := s.Bytes()
+ if reDelim.Match(line) {
+ // Return an empty slice because (*bytes.Buffer).Bytes() returns nil if we have never written data.
+ return []byte{}, 0, nil
+ }
+ _, err := buf.Write(line)
+ if err != nil {
+ return nil, 0, err
+ }
+ lineCount := 1
+ for s.Scan() {
+ line := s.Bytes()
+ if reDelim.Match(line) {
+ return buf.Bytes(), lineCount, nil
+ }
+ _, err := buf.Write([]byte("\n"))
+ if err != nil {
+ return nil, 0, err
+ }
+ _, err = buf.Write(line)
+ if err != nil {
+ return nil, 0, err
+ }
+ lineCount++
+ }
+ if err := s.Err(); err != nil {
+ return nil, 0, err
+ }
+ return buf.Bytes(), lineCount, nil
+}
+
+type treeParser struct {
+ lineOffset int
+}
+
+func (tp *treeParser) parseTree(src io.Reader) (*Tree, error) {
+ toks, err := NewTokenStream(src)
+ if err != nil {
+ return nil, err
+ }
+ gram := NewGrammar()
+ tb := NewDefaultSyntaxTreeBuilder()
+ p, err := NewParser(toks, gram, SemanticAction(NewASTActionSet(gram, tb)))
+ if err != nil {
+ return nil, err
+ }
+ err = p.Parse()
+ if err != nil {
+ return nil, err
+ }
+ synErrs := p.SyntaxErrors()
+ if len(synErrs) > 0 {
+ var b strings.Builder
+ b.Write(formatSyntaxError(synErrs[0], gram, tp.lineOffset))
+ for _, synErr := range synErrs[1:] {
+ b.WriteRune('\n')
+ b.Write(formatSyntaxError(synErr, gram, tp.lineOffset))
+ }
+ return nil, errors.New(b.String())
+ }
+ t, err := tp.genTree(tb.Tree())
+ if err != nil {
+ return nil, err
+ }
+ return t.Fill(), nil
+}
+
+func formatSyntaxError(synErr *SyntaxError, gram Grammar, lineOffset int) []byte {
+ var b bytes.Buffer
+
+ b.WriteString(fmt.Sprintf("%v:%v: %v: ", lineOffset+synErr.Row+1, synErr.Col+1, synErr.Message))
+
+ tok := synErr.Token
+ switch {
+ case tok.EOF():
+ b.WriteString("<eof>")
+ case tok.Invalid():
+ b.WriteString(fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme())))
+ default:
+ if term := gram.Terminal(tok.TerminalID()); term != "" {
+ b.WriteString(fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), term))
+ } else {
+ b.WriteString(fmt.Sprintf("'%v'", string(tok.Lexeme())))
+ }
+ }
+ b.WriteString(fmt.Sprintf(": expected: %v", synErr.ExpectedTerminals[0]))
+ for _, t := range synErr.ExpectedTerminals[1:] {
+ b.WriteString(fmt.Sprintf(", %v", t))
+ }
+
+ return b.Bytes()
+}
+
+func (tp *treeParser) genTree(node *Node) (*Tree, error) {
+ // A node labeled 'error' cannot have children. It always must be (error).
+ if sym := node.Children[0]; sym.Text == "error" {
+ if len(node.Children) > 1 {
+ return nil, fmt.Errorf("%v:%v: error node cannot take children", tp.lineOffset+sym.Row+1, sym.Col+1)
+ }
+ return NewTerminalNode(sym.Text, ""), nil
+ }
+
+ if len(node.Children) == 2 && node.Children[1].KindName == "string" {
+ var text string
+ str := node.Children[1].Children[0]
+ switch str.KindName {
+ case "raw_string":
+ text = str.Children[0].Text
+ case "interpreted_string":
+ var b strings.Builder
+ for _, c := range str.Children {
+ switch c.KindName {
+ case "escaped_seq":
+ b.WriteString(strings.TrimPrefix(`\`, c.Text))
+ case "escape_char":
+ return nil, fmt.Errorf("%v:%v: incomplete escape sequence", tp.lineOffset+c.Row+1, c.Col+1)
+ case "codepoint_expr":
+ cp := c.Children[0]
+ n, err := strconv.ParseInt(cp.Text, 16, 64)
+ if err != nil {
+ return nil, fmt.Errorf("%v:%v: %v", tp.lineOffset+cp.Row+1, cp.Col+1, err)
+ }
+ if !utf8.ValidRune(rune(n)) {
+ return nil, fmt.Errorf("%v:%v: invalid code point: %v", tp.lineOffset+cp.Row+1, cp.Col+1, cp.Text)
+ }
+ b.WriteRune(rune(n))
+ default:
+ b.WriteString(c.Text)
+ }
+ }
+ text = b.String()
+ }
+ return NewTerminalNode(node.Children[0].Text, text), nil
+ }
+
+ var children []*Tree
+ if len(node.Children) > 1 {
+ children = make([]*Tree, len(node.Children)-1)
+ for i, c := range node.Children[1:] {
+ var err error
+ children[i], err = tp.genTree(c)
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+ return NewNonTerminalTree(node.Children[0].Text, children...), nil
+}
+// Code generated by vartan-go. DO NOT EDIT.
+
type ModeID int
func (id ModeID) Int() int {
@@ -1022,3 +1353,1072 @@ func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, strin
id := s.kindIDs[mode][modeKind]
return id, s.kindNames[id]
}
+// Code generated by vartan-go. DO NOT EDIT.
+
+type Grammar interface {
+ // InitialState returns the initial state of a parser.
+ InitialState() int
+
+ // StartProduction returns the start production of grammar.
+ StartProduction() int
+
+ // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair.
+ Action(state int, terminal int) int
+
+ // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair.
+ GoTo(state int, lhs int) int
+
+ // ErrorTrapperState returns true when a state can shift the error symbol.
+ ErrorTrapperState(state int) bool
+
+ // LHS returns a LHS symbol of a production.
+ LHS(prod int) int
+
+	// AlternativeSymbolCount returns the symbol count of a production.
+	AlternativeSymbolCount(prod int) int
+
+ // RecoverProduction returns true when a production has the recover directive.
+ RecoverProduction(prod int) bool
+
+	// NonTerminal returns a string representation of a non-terminal symbol.
+	NonTerminal(nonTerminal int) string
+
+ // TerminalCount returns a terminal symbol count of grammar.
+ TerminalCount() int
+
+ // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis.
+ SkipTerminal(terminal int) bool
+
+ // EOF returns the EOF symbol.
+ EOF() int
+
+ // Error returns the error symbol.
+ Error() int
+
+	// Terminal returns a string representation of a terminal symbol.
+	Terminal(terminal int) string
+
+ // ASTAction returns an AST action entries.
+ ASTAction(prod int) []int
+}
+
+type VToken interface {
+ // TerminalID returns a terminal ID.
+ TerminalID() int
+
+ // Lexeme returns a lexeme.
+ Lexeme() []byte
+
+ // EOF returns true when a token represents EOF.
+ EOF() bool
+
+ // Invalid returns true when a token is invalid.
+ Invalid() bool
+
+ // Position returns (row, column) pair.
+ Position() (int, int)
+}
+
+type TokenStream interface {
+ Next() (VToken, error)
+}
+
+type SyntaxError struct {
+ Row int
+ Col int
+ Message string
+ Token VToken
+ ExpectedTerminals []string
+}
+
+type ParserOption func(p *Parser) error
+
+// DisableLAC disables LAC (lookahead correction). LAC is enabled by default.
+func DisableLAC() ParserOption {
+ return func(p *Parser) error {
+ p.disableLAC = true
+ return nil
+ }
+}
+
+func SemanticAction(semAct SemanticActionSet) ParserOption {
+ return func(p *Parser) error {
+ p.semAct = semAct
+ return nil
+ }
+}
+
+type Parser struct {
+ toks TokenStream
+ gram Grammar
+ stateStack *stateStack
+ semAct SemanticActionSet
+ disableLAC bool
+ onError bool
+ shiftCount int
+ synErrs []*SyntaxError
+}
+
+func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) {
+ p := &Parser{
+ toks: toks,
+ gram: gram,
+ stateStack: &stateStack{},
+ }
+
+ for _, opt := range opts {
+ err := opt(p)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return p, nil
+}
+
+func (p *Parser) Parse() error {
+ p.stateStack.push(p.gram.InitialState())
+ tok, err := p.nextToken()
+ if err != nil {
+ return err
+ }
+
+ACTION_LOOP:
+ for {
+ act := p.lookupAction(tok)
+
+ switch {
+ case act < 0: // Shift
+ nextState := act * -1
+
+ recovered := false
+ if p.onError {
+ p.shiftCount++
+
+ // When the parser performs shift three times, the parser recovers from the error state.
+ if p.shiftCount >= 3 {
+ p.onError = false
+ p.shiftCount = 0
+ recovered = true
+ }
+ }
+
+ p.shift(nextState)
+
+ if p.semAct != nil {
+ p.semAct.Shift(tok, recovered)
+ }
+
+ tok, err = p.nextToken()
+ if err != nil {
+ return err
+ }
+ case act > 0: // Reduce
+ prodNum := act
+
+ recovered := false
+ if p.onError && p.gram.RecoverProduction(prodNum) {
+ p.onError = false
+ p.shiftCount = 0
+ recovered = true
+ }
+
+ accepted := p.reduce(prodNum)
+ if accepted {
+ if p.semAct != nil {
+ p.semAct.Accept()
+ }
+
+ return nil
+ }
+
+ if p.semAct != nil {
+ p.semAct.Reduce(prodNum, recovered)
+ }
+ default: // Error
+ if p.onError {
+ tok, err = p.nextToken()
+ if err != nil {
+ return err
+ }
+ if tok.EOF() {
+ if p.semAct != nil {
+ p.semAct.MissError(tok)
+ }
+
+ return nil
+ }
+
+ continue ACTION_LOOP
+ }
+
+ row, col := tok.Position()
+ p.synErrs = append(p.synErrs, &SyntaxError{
+ Row: row,
+ Col: col,
+ Message: "unexpected token",
+ Token: tok,
+ ExpectedTerminals: p.searchLookahead(p.stateStack.top()),
+ })
+
+ count, ok := p.trapError()
+ if !ok {
+ if p.semAct != nil {
+ p.semAct.MissError(tok)
+ }
+
+ return nil
+ }
+
+ p.onError = true
+ p.shiftCount = 0
+
+ act, err := p.lookupActionOnError()
+ if err != nil {
+ return err
+ }
+
+ p.shift(act * -1)
+
+ if p.semAct != nil {
+ p.semAct.TrapAndShiftError(tok, count)
+ }
+ }
+ }
+}
+
+// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid,
+// this method returns `true`.
+func (p *Parser) validateLookahead(term int) bool {
+ p.stateStack.enableExploratoryMode()
+ defer p.stateStack.disableExploratoryMode()
+
+ for {
+ act := p.gram.Action(p.stateStack.topExploratorily(), term)
+
+ switch {
+ case act < 0: // Shift
+ return true
+ case act > 0: // Reduce
+ prodNum := act
+
+ lhs := p.gram.LHS(prodNum)
+ if lhs == p.gram.LHS(p.gram.StartProduction()) {
+ return true
+ }
+ n := p.gram.AlternativeSymbolCount(prodNum)
+ p.stateStack.popExploratorily(n)
+ state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs)
+ p.stateStack.pushExploratorily(state)
+ default: // Error
+ return false
+ }
+ }
+}
+
+func (p *Parser) nextToken() (VToken, error) {
+ for {
+ // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0,
+ // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect
+ // a syntax error because the parser cannot find an entry corresponding to the invalid token.
+ tok, err := p.toks.Next()
+ if err != nil {
+ return nil, err
+ }
+
+ if p.gram.SkipTerminal(tok.TerminalID()) {
+ continue
+ }
+
+ return tok, nil
+ }
+}
+
+func (p *Parser) tokenToTerminal(tok VToken) int {
+ if tok.EOF() {
+ return p.gram.EOF()
+ }
+
+ return tok.TerminalID()
+}
+
+func (p *Parser) lookupAction(tok VToken) int {
+ if !p.disableLAC {
+ term := p.tokenToTerminal(tok)
+ if !p.validateLookahead(term) {
+ return 0
+ }
+ }
+
+ return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok))
+}
+
+func (p *Parser) lookupActionOnError() (int, error) {
+ act := p.gram.Action(p.stateStack.top(), p.gram.Error())
+ if act >= 0 {
+ return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error()))
+ }
+
+ return act, nil
+}
+
+func (p *Parser) shift(nextState int) {
+ p.stateStack.push(nextState)
+}
+
+func (p *Parser) reduce(prodNum int) bool {
+ lhs := p.gram.LHS(prodNum)
+ if lhs == p.gram.LHS(p.gram.StartProduction()) {
+ return true
+ }
+ n := p.gram.AlternativeSymbolCount(prodNum)
+ p.stateStack.pop(n)
+ nextState := p.gram.GoTo(p.stateStack.top(), lhs)
+ p.stateStack.push(nextState)
+ return false
+}
+
+func (p *Parser) trapError() (int, bool) {
+ count := 0
+ for {
+ if p.gram.ErrorTrapperState(p.stateStack.top()) {
+ return count, true
+ }
+
+ if p.stateStack.top() != p.gram.InitialState() {
+ p.stateStack.pop(1)
+ count++
+ } else {
+ return 0, false
+ }
+ }
+}
+
+func (p *Parser) SyntaxErrors() []*SyntaxError {
+ return p.synErrs
+}
+
+func (p *Parser) searchLookahead(state int) []string {
+ kinds := []string{}
+ termCount := p.gram.TerminalCount()
+ for term := 0; term < termCount; term++ {
+ if p.disableLAC {
+ if p.gram.Action(p.stateStack.top(), term) == 0 {
+ continue
+ }
+ } else {
+ if !p.validateLookahead(term) {
+ continue
+ }
+ }
+
+ // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol
+ // intentionally.
+ if term == p.gram.Error() {
+ continue
+ }
+
+ kinds = append(kinds, p.gram.Terminal(term))
+ }
+
+ return kinds
+}
+
+type stateStack struct {
+ items []int
+ itemsExp []int
+}
+
+func (s *stateStack) enableExploratoryMode() {
+ s.itemsExp = make([]int, len(s.items))
+ copy(s.itemsExp, s.items)
+}
+
+func (s *stateStack) disableExploratoryMode() {
+ s.itemsExp = nil
+}
+
+func (s *stateStack) top() int {
+ return s.items[len(s.items)-1]
+}
+
+func (s *stateStack) topExploratorily() int {
+ return s.itemsExp[len(s.itemsExp)-1]
+}
+
+func (s *stateStack) push(state int) {
+ s.items = append(s.items, state)
+}
+
+func (s *stateStack) pushExploratorily(state int) {
+ s.itemsExp = append(s.itemsExp, state)
+}
+
+func (s *stateStack) pop(n int) {
+ s.items = s.items[:len(s.items)-n]
+}
+
+func (s *stateStack) popExploratorily(n int) {
+ s.itemsExp = s.itemsExp[:len(s.itemsExp)-n]
+}
+
+type grammarImpl struct {
+ recoverProductions []int
+ action []int
+ goTo []int
+ alternativeSymbolCounts []int
+ errorTrapperStates []int
+ nonTerminals []string
+ lhsSymbols []int
+ terminals []string
+ terminalSkip []int
+ astActions [][]int
+}
+
+func NewGrammar() *grammarImpl {
+ return &grammarImpl{
+ recoverProductions: []int{
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ },
+ action: []int{
+ 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -3, 0, 0, 0, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0,
+ -2, 7, 0, -11, 0, 0, -12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4,
+ 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -14, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 12, 12, 0, 0, -17, 12, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -22, 0, 16, 16, 0, 0, 0, 0, 0, -23,
+ -24, -25, -26, -27, -28, -29, 16, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -30, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -23, -24, -25, -26, -27, -28, -29, 15,
+ 0, 18, 0, 0, 18, 18, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 0,
+ 25, 0, 0, 25, 25, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -33, 0, 19, 0,
+ 0, 19, 19, 0, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, -34, 0, 0, 0, 0, 0, 0, 20, 0, 0, 20,
+ 20, 0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 20, 20, 20, 0, 21, 0, 0, 21, 21,
+ 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 21, 21, 0, 22, 0, 0, 22, 22, 0,
+ 0, 0, 0, 0, 22, 22, 22, 22, 22, 22, 22, 22, 0, 23, 0, 0, 23, 23, 0, 0,
+ 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23, 0, 24, 0, 0, 24, 24, 0, 0, 0,
+ 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 0, 10, 0, 0, 10, 10, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 13, 13, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 17, 17, 0, 0, 0, 0, 0, 17,
+ 17, 17, 17, 17, 17, 17, 17, 0, 14, 0, 0, 14, 14, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, -35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -36,
+ 0, 0, 0, 0, 0, 26, 0, 0, 26, 26, 0, 0, 0, 0, 0, 26, 26, 26, 26, 26,
+ 26, 26, 26,
+ },
+ goTo: []int{
+ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 0, 10, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 19, 20, 21, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 21,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ },
+ alternativeSymbolCounts: []int{
+ 0, 1, 4, 4, 3, 2, 1, 0, 1, 1, 3, 1, 0, 3, 3, 1, 0, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 4,
+ },
+ errorTrapperStates: []int{
+ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ nonTerminals: []string{
+ "",
+ "tree'",
+ "tree",
+ "tree_list",
+ "string",
+ "raw_string",
+ "opt_raw_string_body",
+ "interpreted_string",
+ "opt_interpreted_string_body",
+ "interpreted_string_body",
+ "interpreted_string_elem",
+ "codepoint_expr",
+ },
+ lhsSymbols: []int{
+ 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 10, 10, 10, 10, 10, 11,
+ },
+ terminals: []string{
+ "",
+ "<eof>",
+ "error",
+ "ws",
+ "l_paren",
+ "r_paren",
+ "identifier",
+ "raw_string_open",
+ "raw_string_body",
+ "raw_string_close",
+ "interpreted_string_open",
+ "interpreted_seq",
+ "codepoint_prefix",
+ "l_brace",
+ "r_brace",
+ "hex_digits",
+ "escaped_seq",
+ "escape_char",
+ "interpreted_string_close",
+ },
+ terminalSkip: []int{
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ astActions: [][]int{
+ nil,
+ nil,
+ {
+ 2, -3,
+ },
+ {
+ 2, 3,
+ },
+ {
+ 2,
+ },
+ {
+ -1, 2,
+ },
+ nil,
+ nil,
+ nil,
+ nil,
+ {
+ -2,
+ },
+ nil,
+ nil,
+ {
+ -2,
+ },
+ {
+ 2,
+ },
+ {
+ -1,
+ },
+ nil,
+ {
+ -1, -2,
+ },
+ {
+ -1,
+ },
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ nil,
+ {
+ 3,
+ },
+ },
+ }
+}
+
+func (g *grammarImpl) InitialState() int {
+ return 0
+}
+
+func (g *grammarImpl) StartProduction() int {
+ return 1
+}
+
+func (g *grammarImpl) RecoverProduction(prod int) bool {
+ return g.recoverProductions[prod] != 0
+}
+
+func (g *grammarImpl) Action(state int, terminal int) int {
+ return g.action[state*19+terminal]
+}
+
+func (g *grammarImpl) GoTo(state int, lhs int) int {
+ return g.goTo[state*12+lhs]
+}
+
+func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
+ return g.alternativeSymbolCounts[prod]
+}
+
+func (g *grammarImpl) TerminalCount() int {
+ return 19
+}
+
+func (g *grammarImpl) SkipTerminal(terminal int) bool {
+ return g.terminalSkip[terminal] == 1
+}
+
+func (g *grammarImpl) ErrorTrapperState(state int) bool {
+ return g.errorTrapperStates[state] != 0
+}
+
+func (g *grammarImpl) NonTerminal(nonTerminal int) string {
+ return g.nonTerminals[nonTerminal]
+}
+
+func (g *grammarImpl) LHS(prod int) int {
+ return g.lhsSymbols[prod]
+}
+
+func (g *grammarImpl) EOF() int {
+ return 1
+}
+
+func (g *grammarImpl) Error() int {
+ return 2
+}
+
+func (g *grammarImpl) Terminal(terminal int) string {
+ return g.terminals[terminal]
+}
+
+func (g *grammarImpl) ASTAction(prod int) []int {
+ return g.astActions[prod]
+}
+
+type vToken struct {
+ terminalID int
+ tok *Token
+}
+
+func (t *vToken) TerminalID() int {
+ return t.terminalID
+}
+
+func (t *vToken) Lexeme() []byte {
+ return t.tok.Lexeme
+}
+
+func (t *vToken) EOF() bool {
+ return t.tok.EOF
+}
+
+func (t *vToken) Invalid() bool {
+ return t.tok.Invalid
+}
+
+func (t *vToken) Position() (int, int) {
+ return t.tok.Row, t.tok.Col
+}
+
+var kindToTerminal = []int{
+ 0, 3, 4, 5, 6, 7, 10, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18,
+}
+
+type tokenStream struct {
+ lex *Lexer
+ kindToTerminal []int
+}
+
+func NewTokenStream(src io.Reader) (*tokenStream, error) {
+ lex, err := NewLexer(NewLexSpec(), src)
+ if err != nil {
+ return nil, err
+ }
+
+ return &tokenStream{
+ lex: lex,
+ }, nil
+}
+
+func (t *tokenStream) Next() (VToken, error) {
+ tok, err := t.lex.Next()
+ if err != nil {
+ return nil, err
+ }
+ return &vToken{
+ terminalID: kindToTerminal[tok.KindID],
+ tok: tok,
+ }, nil
+}
+// Code generated by vartan-go. DO NOT EDIT.
+
+// SemanticActionSet is a set of semantic actions a parser calls.
+type SemanticActionSet interface {
+ // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol.
+ // When the parser recovered from an error state by shifting the token, `recovered` is true.
+ Shift(tok VToken, recovered bool)
+
+ // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production.
+ // When the parser recovered from an error state by reducing the production, `recovered` is true.
+ Reduce(prodNum int, recovered bool)
+
+ // Accept runs when the parser accepts an input.
+ Accept()
+
+ // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack.
+ // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards
+ // from the state stack.
+ // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token
+ // corresponding to the error symbol doesn't exist.
+ TrapAndShiftError(cause VToken, popped int)
+
+ // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error.
+ MissError(cause VToken)
+}
+
+var _ SemanticActionSet = &SyntaxTreeActionSet{}
+
+// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface.
+type SyntaxTreeNode interface {
+ // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast`
+ // directive with `...` operator.
+ ChildCount() int
+
+ // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast`
+ // directive with `...` operator.
+ ExpandChildren() []SyntaxTreeNode
+}
+
+var _ SyntaxTreeNode = &Node{}
+
+// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types.
+// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface.
+type SyntaxTreeBuilder interface {
+ Shift(kindName string, text string, row, col int) SyntaxTreeNode
+ ShiftError(kindName string) SyntaxTreeNode
+ Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode
+ Accept(f SyntaxTreeNode)
+}
+
+var _ SyntaxTreeBuilder = &DefaulSyntaxTreeBuilder{}
+
+// DefaulSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder.
+type DefaulSyntaxTreeBuilder struct {
+ tree *Node
+}
+
+// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder.
+func NewDefaultSyntaxTreeBuilder() *DefaulSyntaxTreeBuilder {
+ return &DefaulSyntaxTreeBuilder{}
+}
+
+// Shift is a implementation of SyntaxTreeBuilder.Shift.
+func (b *DefaulSyntaxTreeBuilder) Shift(kindName string, text string, row, col int) SyntaxTreeNode {
+ return &Node{
+ Type: NodeTypeTerminal,
+ KindName: kindName,
+ Text: text,
+ Row: row,
+ Col: col,
+ }
+}
+
+// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError.
+func (b *DefaulSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode {
+ return &Node{
+ Type: NodeTypeError,
+ KindName: kindName,
+ }
+}
+
+// Reduce is a implementation of SyntaxTreeBuilder.Reduce.
+func (b *DefaulSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode {
+ cNodes := make([]*Node, len(children))
+ for i, c := range children {
+ cNodes[i] = c.(*Node)
+ }
+ return &Node{
+ Type: NodeTypeNonTerminal,
+ KindName: kindName,
+ Children: cNodes,
+ }
+}
+
+// Accept is a implementation of SyntaxTreeBuilder.Accept.
+func (b *DefaulSyntaxTreeBuilder) Accept(f SyntaxTreeNode) {
+ b.tree = f.(*Node)
+}
+
+// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil.
+func (b *DefaulSyntaxTreeBuilder) Tree() *Node {
+ return b.tree
+}
+
+// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree.
+type SyntaxTreeActionSet struct {
+ gram Grammar
+ builder SyntaxTreeBuilder
+ semStack *semanticStack
+ disableASTAction bool
+}
+
+// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree).
+// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them.
+func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
+ return &SyntaxTreeActionSet{
+ gram: gram,
+ builder: builder,
+ semStack: newSemanticStack(),
+ }
+}
+
+// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree).
+// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them.
+func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
+ return &SyntaxTreeActionSet{
+ gram: gram,
+ builder: builder,
+ semStack: newSemanticStack(),
+ disableASTAction: true,
+ }
+}
+
+// Shift is a implementation of SemanticActionSet.Shift method.
+func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) {
+ term := a.tokenToTerminal(tok)
+ row, col := tok.Position()
+ a.semStack.push(a.builder.Shift(a.gram.Terminal(term), string(tok.Lexeme()), row, col))
+}
+
+// Reduce is a implementation of SemanticActionSet.Reduce method.
+func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
+ lhs := a.gram.LHS(prodNum)
+
+ // When an alternative is empty, `n` will be 0, and `handle` will be empty slice.
+ n := a.gram.AlternativeSymbolCount(prodNum)
+ handle := a.semStack.pop(n)
+
+ var astAct []int
+ if !a.disableASTAction {
+ astAct = a.gram.ASTAction(prodNum)
+ }
+ var children []SyntaxTreeNode
+ if astAct != nil {
+ // Count the number of children in advance to avoid frequent growth in a slice for children.
+ {
+ l := 0
+ for _, e := range astAct {
+ if e > 0 {
+ l++
+ } else {
+ offset := e*-1 - 1
+ l += handle[offset].ChildCount()
+ }
+ }
+
+ children = make([]SyntaxTreeNode, l)
+ }
+
+ p := 0
+ for _, e := range astAct {
+ if e > 0 {
+ offset := e - 1
+ children[p] = handle[offset]
+ p++
+ } else {
+ offset := e*-1 - 1
+ for _, c := range handle[offset].ExpandChildren() {
+ children[p] = c
+ p++
+ }
+ }
+ }
+ } else {
+ // If an alternative has no AST action, a driver generates
+ // a node with the same structure as a CST.
+ children = handle
+ }
+
+ a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children))
+}
+
+// Accept is a implementation of SemanticActionSet.Accept method.
+func (a *SyntaxTreeActionSet) Accept() {
+ top := a.semStack.pop(1)
+ a.builder.Accept(top[0])
+}
+
+// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method.
+func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) {
+ a.semStack.pop(popped)
+ a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error())))
+}
+
+// MissError is a implementation of SemanticActionSet.MissError method.
+func (a *SyntaxTreeActionSet) MissError(cause VToken) {
+}
+
+func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int {
+ if tok.EOF() {
+ return a.gram.EOF()
+ }
+
+ return tok.TerminalID()
+}
+
+type semanticStack struct {
+ frames []SyntaxTreeNode
+}
+
+func newSemanticStack() *semanticStack {
+ return &semanticStack{
+ frames: make([]SyntaxTreeNode, 0, 100),
+ }
+}
+
+func (s *semanticStack) push(f SyntaxTreeNode) {
+ s.frames = append(s.frames, f)
+}
+
+func (s *semanticStack) pop(n int) []SyntaxTreeNode {
+ fs := s.frames[len(s.frames)-n:]
+ s.frames = s.frames[:len(s.frames)-n]
+
+ return fs
+}
+
+type NodeType int
+
+const (
+ NodeTypeError = 0
+ NodeTypeTerminal = 1
+ NodeTypeNonTerminal = 2
+)
+
+// Node is a implementation of SyntaxTreeNode interface.
+type Node struct {
+ Type NodeType
+ KindName string
+ Text string
+ Row int
+ Col int
+ Children []*Node
+}
+
+func (n *Node) MarshalJSON() ([]byte, error) {
+ switch n.Type {
+ case NodeTypeError:
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ })
+ case NodeTypeTerminal:
+ if n.KindName == "" {
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ Text string `json:"text"`
+ Row int `json:"row"`
+ Col int `json:"col"`
+ }{
+ Type: n.Type,
+ Text: n.Text,
+ Row: n.Row,
+ Col: n.Col,
+ })
+ }
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ Text string `json:"text"`
+ Row int `json:"row"`
+ Col int `json:"col"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ Text: n.Text,
+ Row: n.Row,
+ Col: n.Col,
+ })
+ case NodeTypeNonTerminal:
+ return json.Marshal(struct {
+ Type NodeType `json:"type"`
+ KindName string `json:"kind_name"`
+ Children []*Node `json:"children"`
+ }{
+ Type: n.Type,
+ KindName: n.KindName,
+ Children: n.Children,
+ })
+ default:
+ return nil, fmt.Errorf("invalid node type: %v", n.Type)
+ }
+}
+
+// ChildCount is a implementation of SyntaxTreeNode.ChildCount.
+func (n *Node) ChildCount() int {
+ return len(n.Children)
+}
+
+// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren.
+func (n *Node) ExpandChildren() []SyntaxTreeNode {
+ fs := make([]SyntaxTreeNode, len(n.Children))
+ for i, n := range n.Children {
+ fs[i] = n
+ }
+ return fs
+}
+
+// PrintTree prints a syntax tree whose root is `node`.
+func PrintTree(w io.Writer, node *Node) {
+ printTree(w, node, "", "")
+}
+
+func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) {
+ if node == nil {
+ return
+ }
+
+ switch node.Type {
+ case NodeTypeError:
+ fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
+ case NodeTypeTerminal:
+ fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text))
+ case NodeTypeNonTerminal:
+ fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
+
+ num := len(node.Children)
+ for i, child := range node.Children {
+ var line string
+ if num > 1 && i < num-1 {
+ line = "├─ "
+ } else {
+ line = "└─ "
+ }
+
+ var prefix string
+ if i >= num-1 {
+ prefix = " "
+ } else {
+ prefix = "│ "
+ }
+
+ printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+ }
+}
diff --git a/src/urubu/spec/test/parser.go b/src/urubu/spec/test/parser.go
deleted file mode 100644
index b7265d7..0000000
--- a/src/urubu/spec/test/parser.go
+++ /dev/null
@@ -1,336 +0,0 @@
-//go:generate vartan compile tree.vartan -o tree.json
-//go:generate vartan-go tree.json --package test
-
-package test
-
-import (
- "bufio"
- "bytes"
- "errors"
- "fmt"
- "io"
- "regexp"
- "strconv"
- "strings"
- "unicode/utf8"
-)
-
-type TreeDiff struct {
- ExpectedPath string
- ActualPath string
- Message string
-}
-
-func newTreeDiff(expected, actual *Tree, message string) *TreeDiff {
- return &TreeDiff{
- ExpectedPath: expected.path(),
- ActualPath: actual.path(),
- Message: message,
- }
-}
-
-type Tree struct {
- Parent *Tree
- Offset int
- Kind string
- Children []*Tree
- Lexeme string
-}
-
-func NewNonTerminalTree(kind string, children ...*Tree) *Tree {
- return &Tree{
- Kind: kind,
- Children: children,
- }
-}
-
-func NewTerminalNode(kind string, lexeme string) *Tree {
- return &Tree{
- Kind: kind,
- Lexeme: lexeme,
- }
-}
-
-func (t *Tree) Fill() *Tree {
- for i, c := range t.Children {
- c.Parent = t
- c.Offset = i
- c.Fill()
- }
- return t
-}
-
-func (t *Tree) path() string {
- if t.Parent == nil {
- return t.Kind
- }
- return fmt.Sprintf("%v.[%v]%v", t.Parent.path(), t.Offset, t.Kind)
-}
-
-func (t *Tree) Format() []byte {
- var b bytes.Buffer
- t.format(&b, 0)
- return b.Bytes()
-}
-
-func (t *Tree) format(buf *bytes.Buffer, depth int) {
- for i := 0; i < depth; i++ {
- buf.WriteString(" ")
- }
- buf.WriteString("(")
- buf.WriteString(t.Kind)
- if len(t.Children) > 0 {
- buf.WriteString("\n")
- for i, c := range t.Children {
- c.format(buf, depth+1)
- if i < len(t.Children)-1 {
- buf.WriteString("\n")
- }
- }
- }
- buf.WriteString(")")
-}
-
-func DiffTree(expected, actual *Tree) []*TreeDiff {
- if expected == nil && actual == nil {
- return nil
- }
- if actual.Kind != expected.Kind {
- msg := fmt.Sprintf("unexpected kind: expected '%v' but got '%v'", expected.Kind, actual.Kind)
- return []*TreeDiff{
- newTreeDiff(expected, actual, msg),
- }
- }
- if expected.Lexeme != actual.Lexeme {
- msg := fmt.Sprintf("unexpected lexeme: expected '%v' but got '%v'", expected.Lexeme, actual.Lexeme)
- return []*TreeDiff{
- newTreeDiff(expected, actual, msg),
- }
- }
- if len(actual.Children) != len(expected.Children) {
- msg := fmt.Sprintf("unexpected node count: expected %v but got %v", len(expected.Children), len(actual.Children))
- return []*TreeDiff{
- newTreeDiff(expected, actual, msg),
- }
- }
- var diffs []*TreeDiff
- for i, exp := range expected.Children {
- if ds := DiffTree(exp, actual.Children[i]); len(ds) > 0 {
- diffs = append(diffs, ds...)
- }
- }
- return diffs
-}
-
-type TestCase struct {
- Description string
- Source []byte
- Output *Tree
-}
-
-func ParseTestCase(r io.Reader) (*TestCase, error) {
- parts, err := splitIntoParts(r)
- if err != nil {
- return nil, err
- }
- if len(parts) != 3 {
- return nil, fmt.Errorf("too many or too few part delimiters: a test case consists of just tree parts: %v parts found", len(parts))
- }
-
- tp := &treeParser{
- lineOffset: parts[0].lineCount + parts[1].lineCount + 2,
- }
- tree, err := tp.parseTree(bytes.NewReader(parts[2].buf))
- if err != nil {
- return nil, err
- }
-
- return &TestCase{
- Description: string(parts[0].buf),
- Source: parts[1].buf,
- Output: tree,
- }, nil
-}
-
-type testCasePart struct {
- buf []byte
- lineCount int
-}
-
-func splitIntoParts(r io.Reader) ([]*testCasePart, error) {
- var bufs []*testCasePart
- s := bufio.NewScanner(r)
- for {
- buf, lineCount, err := readPart(s)
- if err != nil {
- return nil, err
- }
- if buf == nil {
- break
- }
- bufs = append(bufs, &testCasePart{
- buf: buf,
- lineCount: lineCount,
- })
- }
- if err := s.Err(); err != nil {
- return nil, err
- }
- return bufs, nil
-}
-
-var reDelim = regexp.MustCompile(`^\s*---+\s*$`)
-
-func readPart(s *bufio.Scanner) ([]byte, int, error) {
- if !s.Scan() {
- return nil, 0, s.Err()
- }
- buf := &bytes.Buffer{}
- line := s.Bytes()
- if reDelim.Match(line) {
- // Return an empty slice because (*bytes.Buffer).Bytes() returns nil if we have never written data.
- return []byte{}, 0, nil
- }
- _, err := buf.Write(line)
- if err != nil {
- return nil, 0, err
- }
- lineCount := 1
- for s.Scan() {
- line := s.Bytes()
- if reDelim.Match(line) {
- return buf.Bytes(), lineCount, nil
- }
- _, err := buf.Write([]byte("\n"))
- if err != nil {
- return nil, 0, err
- }
- _, err = buf.Write(line)
- if err != nil {
- return nil, 0, err
- }
- lineCount++
- }
- if err := s.Err(); err != nil {
- return nil, 0, err
- }
- return buf.Bytes(), lineCount, nil
-}
-
-type treeParser struct {
- lineOffset int
-}
-
-func (tp *treeParser) parseTree(src io.Reader) (*Tree, error) {
- toks, err := NewTokenStream(src)
- if err != nil {
- return nil, err
- }
- gram := NewGrammar()
- tb := NewDefaultSyntaxTreeBuilder()
- p, err := NewParser(toks, gram, SemanticAction(NewASTActionSet(gram, tb)))
- if err != nil {
- return nil, err
- }
- err = p.Parse()
- if err != nil {
- return nil, err
- }
- synErrs := p.SyntaxErrors()
- if len(synErrs) > 0 {
- var b strings.Builder
- b.Write(formatSyntaxError(synErrs[0], gram, tp.lineOffset))
- for _, synErr := range synErrs[1:] {
- b.WriteRune('\n')
- b.Write(formatSyntaxError(synErr, gram, tp.lineOffset))
- }
- return nil, errors.New(b.String())
- }
- t, err := tp.genTree(tb.Tree())
- if err != nil {
- return nil, err
- }
- return t.Fill(), nil
-}
-
-func formatSyntaxError(synErr *SyntaxError, gram Grammar, lineOffset int) []byte {
- var b bytes.Buffer
-
- b.WriteString(fmt.Sprintf("%v:%v: %v: ", lineOffset+synErr.Row+1, synErr.Col+1, synErr.Message))
-
- tok := synErr.Token
- switch {
- case tok.EOF():
- b.WriteString("<eof>")
- case tok.Invalid():
- b.WriteString(fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme())))
- default:
- if term := gram.Terminal(tok.TerminalID()); term != "" {
- b.WriteString(fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), term))
- } else {
- b.WriteString(fmt.Sprintf("'%v'", string(tok.Lexeme())))
- }
- }
- b.WriteString(fmt.Sprintf(": expected: %v", synErr.ExpectedTerminals[0]))
- for _, t := range synErr.ExpectedTerminals[1:] {
- b.WriteString(fmt.Sprintf(", %v", t))
- }
-
- return b.Bytes()
-}
-
-func (tp *treeParser) genTree(node *Node) (*Tree, error) {
- // A node labeled 'error' cannot have children. It always must be (error).
- if sym := node.Children[0]; sym.Text == "error" {
- if len(node.Children) > 1 {
- return nil, fmt.Errorf("%v:%v: error node cannot take children", tp.lineOffset+sym.Row+1, sym.Col+1)
- }
- return NewTerminalNode(sym.Text, ""), nil
- }
-
- if len(node.Children) == 2 && node.Children[1].KindName == "string" {
- var text string
- str := node.Children[1].Children[0]
- switch str.KindName {
- case "raw_string":
- text = str.Children[0].Text
- case "interpreted_string":
- var b strings.Builder
- for _, c := range str.Children {
- switch c.KindName {
- case "escaped_seq":
- b.WriteString(strings.TrimPrefix(`\`, c.Text))
- case "escape_char":
- return nil, fmt.Errorf("%v:%v: incomplete escape sequence", tp.lineOffset+c.Row+1, c.Col+1)
- case "codepoint_expr":
- cp := c.Children[0]
- n, err := strconv.ParseInt(cp.Text, 16, 64)
- if err != nil {
- return nil, fmt.Errorf("%v:%v: %v", tp.lineOffset+cp.Row+1, cp.Col+1, err)
- }
- if !utf8.ValidRune(rune(n)) {
- return nil, fmt.Errorf("%v:%v: invalid code point: %v", tp.lineOffset+cp.Row+1, cp.Col+1, cp.Text)
- }
- b.WriteRune(rune(n))
- default:
- b.WriteString(c.Text)
- }
- }
- text = b.String()
- }
- return NewTerminalNode(node.Children[0].Text, text), nil
- }
-
- var children []*Tree
- if len(node.Children) > 1 {
- children = make([]*Tree, len(node.Children)-1)
- for i, c := range node.Children[1:] {
- var err error
- children[i], err = tp.genTree(c)
- if err != nil {
- return nil, err
- }
- }
- }
- return NewNonTerminalTree(node.Children[0].Text, children...), nil
-}
diff --git a/src/urubu/spec/test/tree_parser.go b/src/urubu/spec/test/tree_parser.go
deleted file mode 100644
index 528d259..0000000
--- a/src/urubu/spec/test/tree_parser.go
+++ /dev/null
@@ -1,716 +0,0 @@
-// Code generated by vartan-go. DO NOT EDIT.
-package test
-
-import (
- "fmt"
- "io"
-)
-
-type Grammar interface {
- // InitialState returns the initial state of a parser.
- InitialState() int
-
- // StartProduction returns the start production of grammar.
- StartProduction() int
-
- // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair.
- Action(state int, terminal int) int
-
- // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair.
- GoTo(state int, lhs int) int
-
- // ErrorTrapperState returns true when a state can shift the error symbol.
- ErrorTrapperState(state int) bool
-
- // LHS returns a LHS symbol of a production.
- LHS(prod int) int
-
- // AlternativeSymbolCount returns a symbol count of p production.
- AlternativeSymbolCount(prod int) int
-
- // RecoverProduction returns true when a production has the recover directive.
- RecoverProduction(prod int) bool
-
- // NonTerminal retuns a string representaion of a non-terminal symbol.
- NonTerminal(nonTerminal int) string
-
- // TerminalCount returns a terminal symbol count of grammar.
- TerminalCount() int
-
- // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis.
- SkipTerminal(terminal int) bool
-
- // EOF returns the EOF symbol.
- EOF() int
-
- // Error returns the error symbol.
- Error() int
-
- // Terminal retuns a string representaion of a terminal symbol.
- Terminal(terminal int) string
-
- // ASTAction returns an AST action entries.
- ASTAction(prod int) []int
-}
-
-type VToken interface {
- // TerminalID returns a terminal ID.
- TerminalID() int
-
- // Lexeme returns a lexeme.
- Lexeme() []byte
-
- // EOF returns true when a token represents EOF.
- EOF() bool
-
- // Invalid returns true when a token is invalid.
- Invalid() bool
-
- // Position returns (row, column) pair.
- Position() (int, int)
-}
-
-type TokenStream interface {
- Next() (VToken, error)
-}
-
-type SyntaxError struct {
- Row int
- Col int
- Message string
- Token VToken
- ExpectedTerminals []string
-}
-
-type ParserOption func(p *Parser) error
-
-// DisableLAC disables LAC (lookahead correction). LAC is enabled by default.
-func DisableLAC() ParserOption {
- return func(p *Parser) error {
- p.disableLAC = true
- return nil
- }
-}
-
-func SemanticAction(semAct SemanticActionSet) ParserOption {
- return func(p *Parser) error {
- p.semAct = semAct
- return nil
- }
-}
-
-type Parser struct {
- toks TokenStream
- gram Grammar
- stateStack *stateStack
- semAct SemanticActionSet
- disableLAC bool
- onError bool
- shiftCount int
- synErrs []*SyntaxError
-}
-
-func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) {
- p := &Parser{
- toks: toks,
- gram: gram,
- stateStack: &stateStack{},
- }
-
- for _, opt := range opts {
- err := opt(p)
- if err != nil {
- return nil, err
- }
- }
-
- return p, nil
-}
-
-func (p *Parser) Parse() error {
- p.stateStack.push(p.gram.InitialState())
- tok, err := p.nextToken()
- if err != nil {
- return err
- }
-
-ACTION_LOOP:
- for {
- act := p.lookupAction(tok)
-
- switch {
- case act < 0: // Shift
- nextState := act * -1
-
- recovered := false
- if p.onError {
- p.shiftCount++
-
- // When the parser performs shift three times, the parser recovers from the error state.
- if p.shiftCount >= 3 {
- p.onError = false
- p.shiftCount = 0
- recovered = true
- }
- }
-
- p.shift(nextState)
-
- if p.semAct != nil {
- p.semAct.Shift(tok, recovered)
- }
-
- tok, err = p.nextToken()
- if err != nil {
- return err
- }
- case act > 0: // Reduce
- prodNum := act
-
- recovered := false
- if p.onError && p.gram.RecoverProduction(prodNum) {
- p.onError = false
- p.shiftCount = 0
- recovered = true
- }
-
- accepted := p.reduce(prodNum)
- if accepted {
- if p.semAct != nil {
- p.semAct.Accept()
- }
-
- return nil
- }
-
- if p.semAct != nil {
- p.semAct.Reduce(prodNum, recovered)
- }
- default: // Error
- if p.onError {
- tok, err = p.nextToken()
- if err != nil {
- return err
- }
- if tok.EOF() {
- if p.semAct != nil {
- p.semAct.MissError(tok)
- }
-
- return nil
- }
-
- continue ACTION_LOOP
- }
-
- row, col := tok.Position()
- p.synErrs = append(p.synErrs, &SyntaxError{
- Row: row,
- Col: col,
- Message: "unexpected token",
- Token: tok,
- ExpectedTerminals: p.searchLookahead(p.stateStack.top()),
- })
-
- count, ok := p.trapError()
- if !ok {
- if p.semAct != nil {
- p.semAct.MissError(tok)
- }
-
- return nil
- }
-
- p.onError = true
- p.shiftCount = 0
-
- act, err := p.lookupActionOnError()
- if err != nil {
- return err
- }
-
- p.shift(act * -1)
-
- if p.semAct != nil {
- p.semAct.TrapAndShiftError(tok, count)
- }
- }
- }
-}
-
-// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid,
-// this method returns `true`.
-func (p *Parser) validateLookahead(term int) bool {
- p.stateStack.enableExploratoryMode()
- defer p.stateStack.disableExploratoryMode()
-
- for {
- act := p.gram.Action(p.stateStack.topExploratorily(), term)
-
- switch {
- case act < 0: // Shift
- return true
- case act > 0: // Reduce
- prodNum := act
-
- lhs := p.gram.LHS(prodNum)
- if lhs == p.gram.LHS(p.gram.StartProduction()) {
- return true
- }
- n := p.gram.AlternativeSymbolCount(prodNum)
- p.stateStack.popExploratorily(n)
- state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs)
- p.stateStack.pushExploratorily(state)
- default: // Error
- return false
- }
- }
-}
-
-func (p *Parser) nextToken() (VToken, error) {
- for {
- // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0,
- // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect
- // a syntax error because the parser cannot find an entry corresponding to the invalid token.
- tok, err := p.toks.Next()
- if err != nil {
- return nil, err
- }
-
- if p.gram.SkipTerminal(tok.TerminalID()) {
- continue
- }
-
- return tok, nil
- }
-}
-
-func (p *Parser) tokenToTerminal(tok VToken) int {
- if tok.EOF() {
- return p.gram.EOF()
- }
-
- return tok.TerminalID()
-}
-
-func (p *Parser) lookupAction(tok VToken) int {
- if !p.disableLAC {
- term := p.tokenToTerminal(tok)
- if !p.validateLookahead(term) {
- return 0
- }
- }
-
- return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok))
-}
-
-func (p *Parser) lookupActionOnError() (int, error) {
- act := p.gram.Action(p.stateStack.top(), p.gram.Error())
- if act >= 0 {
- return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error()))
- }
-
- return act, nil
-}
-
-func (p *Parser) shift(nextState int) {
- p.stateStack.push(nextState)
-}
-
-func (p *Parser) reduce(prodNum int) bool {
- lhs := p.gram.LHS(prodNum)
- if lhs == p.gram.LHS(p.gram.StartProduction()) {
- return true
- }
- n := p.gram.AlternativeSymbolCount(prodNum)
- p.stateStack.pop(n)
- nextState := p.gram.GoTo(p.stateStack.top(), lhs)
- p.stateStack.push(nextState)
- return false
-}
-
-func (p *Parser) trapError() (int, bool) {
- count := 0
- for {
- if p.gram.ErrorTrapperState(p.stateStack.top()) {
- return count, true
- }
-
- if p.stateStack.top() != p.gram.InitialState() {
- p.stateStack.pop(1)
- count++
- } else {
- return 0, false
- }
- }
-}
-
-func (p *Parser) SyntaxErrors() []*SyntaxError {
- return p.synErrs
-}
-
-func (p *Parser) searchLookahead(state int) []string {
- kinds := []string{}
- termCount := p.gram.TerminalCount()
- for term := 0; term < termCount; term++ {
- if p.disableLAC {
- if p.gram.Action(p.stateStack.top(), term) == 0 {
- continue
- }
- } else {
- if !p.validateLookahead(term) {
- continue
- }
- }
-
- // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol
- // intentionally.
- if term == p.gram.Error() {
- continue
- }
-
- kinds = append(kinds, p.gram.Terminal(term))
- }
-
- return kinds
-}
-
-type stateStack struct {
- items []int
- itemsExp []int
-}
-
-func (s *stateStack) enableExploratoryMode() {
- s.itemsExp = make([]int, len(s.items))
- copy(s.itemsExp, s.items)
-}
-
-func (s *stateStack) disableExploratoryMode() {
- s.itemsExp = nil
-}
-
-func (s *stateStack) top() int {
- return s.items[len(s.items)-1]
-}
-
-func (s *stateStack) topExploratorily() int {
- return s.itemsExp[len(s.itemsExp)-1]
-}
-
-func (s *stateStack) push(state int) {
- s.items = append(s.items, state)
-}
-
-func (s *stateStack) pushExploratorily(state int) {
- s.itemsExp = append(s.itemsExp, state)
-}
-
-func (s *stateStack) pop(n int) {
- s.items = s.items[:len(s.items)-n]
-}
-
-func (s *stateStack) popExploratorily(n int) {
- s.itemsExp = s.itemsExp[:len(s.itemsExp)-n]
-}
-
-type grammarImpl struct {
- recoverProductions []int
- action []int
- goTo []int
- alternativeSymbolCounts []int
- errorTrapperStates []int
- nonTerminals []string
- lhsSymbols []int
- terminals []string
- terminalSkip []int
- astActions [][]int
-}
-
-func NewGrammar() *grammarImpl {
- return &grammarImpl{
- recoverProductions: []int{
- 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- },
- action: []int{
- 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -3, 0, 0, 0, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0,
- -2, 7, 0, -11, 0, 0, -12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4,
- 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -14, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 12, 12, 0, 0, -17, 12, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -22, 0, 16, 16, 0, 0, 0, 0, 0, -23,
- -24, -25, -26, -27, -28, -29, 16, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -30, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- -31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -23, -24, -25, -26, -27, -28, -29, 15,
- 0, 18, 0, 0, 18, 18, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 0,
- 25, 0, 0, 25, 25, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -33, 0, 19, 0,
- 0, 19, 19, 0, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, -34, 0, 0, 0, 0, 0, 0, 20, 0, 0, 20,
- 20, 0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 20, 20, 20, 0, 21, 0, 0, 21, 21,
- 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 21, 21, 0, 22, 0, 0, 22, 22, 0,
- 0, 0, 0, 0, 22, 22, 22, 22, 22, 22, 22, 22, 0, 23, 0, 0, 23, 23, 0, 0,
- 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23, 0, 24, 0, 0, 24, 24, 0, 0, 0,
- 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 0, 10, 0, 0, 10, 10, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 13, 13, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 17, 17, 0, 0, 0, 0, 0, 17,
- 17, 17, 17, 17, 17, 17, 17, 0, 14, 0, 0, 14, 14, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, -35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -36,
- 0, 0, 0, 0, 0, 26, 0, 0, 26, 26, 0, 0, 0, 0, 0, 26, 26, 26, 26, 26,
- 26, 26, 26,
- },
- goTo: []int{
- 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 0, 10, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 19, 20, 21, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 21,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0,
- },
- alternativeSymbolCounts: []int{
- 0, 1, 4, 4, 3, 2, 1, 0, 1, 1, 3, 1, 0, 3, 3, 1, 0, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 4,
- },
- errorTrapperStates: []int{
- 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- nonTerminals: []string{
- "",
- "tree'",
- "tree",
- "tree_list",
- "string",
- "raw_string",
- "opt_raw_string_body",
- "interpreted_string",
- "opt_interpreted_string_body",
- "interpreted_string_body",
- "interpreted_string_elem",
- "codepoint_expr",
- },
- lhsSymbols: []int{
- 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 10, 10, 10, 10, 10, 10, 11,
- },
- terminals: []string{
- "",
- "<eof>",
- "error",
- "ws",
- "l_paren",
- "r_paren",
- "identifier",
- "raw_string_open",
- "raw_string_body",
- "raw_string_close",
- "interpreted_string_open",
- "interpreted_seq",
- "codepoint_prefix",
- "l_brace",
- "r_brace",
- "hex_digits",
- "escaped_seq",
- "escape_char",
- "interpreted_string_close",
- },
- terminalSkip: []int{
- 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- astActions: [][]int{
- nil,
- nil,
- {
- 2, -3,
- },
- {
- 2, 3,
- },
- {
- 2,
- },
- {
- -1, 2,
- },
- nil,
- nil,
- nil,
- nil,
- {
- -2,
- },
- nil,
- nil,
- {
- -2,
- },
- {
- 2,
- },
- {
- -1,
- },
- nil,
- {
- -1, -2,
- },
- {
- -1,
- },
- nil,
- nil,
- nil,
- nil,
- nil,
- nil,
- nil,
- {
- 3,
- },
- },
- }
-}
-
-func (g *grammarImpl) InitialState() int {
- return 0
-}
-
-func (g *grammarImpl) StartProduction() int {
- return 1
-}
-
-func (g *grammarImpl) RecoverProduction(prod int) bool {
- return g.recoverProductions[prod] != 0
-}
-
-func (g *grammarImpl) Action(state int, terminal int) int {
- return g.action[state*19+terminal]
-}
-
-func (g *grammarImpl) GoTo(state int, lhs int) int {
- return g.goTo[state*12+lhs]
-}
-
-func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
- return g.alternativeSymbolCounts[prod]
-}
-
-func (g *grammarImpl) TerminalCount() int {
- return 19
-}
-
-func (g *grammarImpl) SkipTerminal(terminal int) bool {
- return g.terminalSkip[terminal] == 1
-}
-
-func (g *grammarImpl) ErrorTrapperState(state int) bool {
- return g.errorTrapperStates[state] != 0
-}
-
-func (g *grammarImpl) NonTerminal(nonTerminal int) string {
- return g.nonTerminals[nonTerminal]
-}
-
-func (g *grammarImpl) LHS(prod int) int {
- return g.lhsSymbols[prod]
-}
-
-func (g *grammarImpl) EOF() int {
- return 1
-}
-
-func (g *grammarImpl) Error() int {
- return 2
-}
-
-func (g *grammarImpl) Terminal(terminal int) string {
- return g.terminals[terminal]
-}
-
-func (g *grammarImpl) ASTAction(prod int) []int {
- return g.astActions[prod]
-}
-
-type vToken struct {
- terminalID int
- tok *Token
-}
-
-func (t *vToken) TerminalID() int {
- return t.terminalID
-}
-
-func (t *vToken) Lexeme() []byte {
- return t.tok.Lexeme
-}
-
-func (t *vToken) EOF() bool {
- return t.tok.EOF
-}
-
-func (t *vToken) Invalid() bool {
- return t.tok.Invalid
-}
-
-func (t *vToken) Position() (int, int) {
- return t.tok.Row, t.tok.Col
-}
-
-var kindToTerminal = []int{
- 0, 3, 4, 5, 6, 7, 10, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18,
-}
-
-type tokenStream struct {
- lex *Lexer
- kindToTerminal []int
-}
-
-func NewTokenStream(src io.Reader) (*tokenStream, error) {
- lex, err := NewLexer(NewLexSpec(), src)
- if err != nil {
- return nil, err
- }
-
- return &tokenStream{
- lex: lex,
- }, nil
-}
-
-func (t *tokenStream) Next() (VToken, error) {
- tok, err := t.lex.Next()
- if err != nil {
- return nil, err
- }
- return &vToken{
- terminalID: kindToTerminal[tok.KindID],
- tok: tok,
- }, nil
-}
diff --git a/src/urubu/spec/test/tree_semantic_action.go b/src/urubu/spec/test/tree_semantic_action.go
deleted file mode 100644
index c1d5a25..0000000
--- a/src/urubu/spec/test/tree_semantic_action.go
+++ /dev/null
@@ -1,367 +0,0 @@
-// Code generated by vartan-go. DO NOT EDIT.
-package test
-
-import (
- "encoding/json"
- "fmt"
- "io"
- "strconv"
-)
-
-// SemanticActionSet is a set of semantic actions a parser calls.
-type SemanticActionSet interface {
- // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol.
- // When the parser recovered from an error state by shifting the token, `recovered` is true.
- Shift(tok VToken, recovered bool)
-
- // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production.
- // When the parser recovered from an error state by reducing the production, `recovered` is true.
- Reduce(prodNum int, recovered bool)
-
- // Accept runs when the parser accepts an input.
- Accept()
-
- // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack.
- // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards
- // from the state stack.
- // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token
- // corresponding to the error symbol doesn't exist.
- TrapAndShiftError(cause VToken, popped int)
-
- // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error.
- MissError(cause VToken)
-}
-
-var _ SemanticActionSet = &SyntaxTreeActionSet{}
-
-// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface.
-type SyntaxTreeNode interface {
- // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast`
- // directive with `...` operator.
- ChildCount() int
-
- // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast`
- // directive with `...` operator.
- ExpandChildren() []SyntaxTreeNode
-}
-
-var _ SyntaxTreeNode = &Node{}
-
-// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types.
-// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface.
-type SyntaxTreeBuilder interface {
- Shift(kindName string, text string, row, col int) SyntaxTreeNode
- ShiftError(kindName string) SyntaxTreeNode
- Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode
- Accept(f SyntaxTreeNode)
-}
-
-var _ SyntaxTreeBuilder = &DefaulSyntaxTreeBuilder{}
-
-// DefaulSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder.
-type DefaulSyntaxTreeBuilder struct {
- tree *Node
-}
-
-// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder.
-func NewDefaultSyntaxTreeBuilder() *DefaulSyntaxTreeBuilder {
- return &DefaulSyntaxTreeBuilder{}
-}
-
-// Shift is a implementation of SyntaxTreeBuilder.Shift.
-func (b *DefaulSyntaxTreeBuilder) Shift(kindName string, text string, row, col int) SyntaxTreeNode {
- return &Node{
- Type: NodeTypeTerminal,
- KindName: kindName,
- Text: text,
- Row: row,
- Col: col,
- }
-}
-
-// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError.
-func (b *DefaulSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode {
- return &Node{
- Type: NodeTypeError,
- KindName: kindName,
- }
-}
-
-// Reduce is a implementation of SyntaxTreeBuilder.Reduce.
-func (b *DefaulSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode {
- cNodes := make([]*Node, len(children))
- for i, c := range children {
- cNodes[i] = c.(*Node)
- }
- return &Node{
- Type: NodeTypeNonTerminal,
- KindName: kindName,
- Children: cNodes,
- }
-}
-
-// Accept is a implementation of SyntaxTreeBuilder.Accept.
-func (b *DefaulSyntaxTreeBuilder) Accept(f SyntaxTreeNode) {
- b.tree = f.(*Node)
-}
-
-// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil.
-func (b *DefaulSyntaxTreeBuilder) Tree() *Node {
- return b.tree
-}
-
-// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree.
-type SyntaxTreeActionSet struct {
- gram Grammar
- builder SyntaxTreeBuilder
- semStack *semanticStack
- disableASTAction bool
-}
-
-// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree).
-// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them.
-func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
- return &SyntaxTreeActionSet{
- gram: gram,
- builder: builder,
- semStack: newSemanticStack(),
- }
-}
-
-// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree).
-// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them.
-func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet {
- return &SyntaxTreeActionSet{
- gram: gram,
- builder: builder,
- semStack: newSemanticStack(),
- disableASTAction: true,
- }
-}
-
-// Shift is a implementation of SemanticActionSet.Shift method.
-func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) {
- term := a.tokenToTerminal(tok)
- row, col := tok.Position()
- a.semStack.push(a.builder.Shift(a.gram.Terminal(term), string(tok.Lexeme()), row, col))
-}
-
-// Reduce is a implementation of SemanticActionSet.Reduce method.
-func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
- lhs := a.gram.LHS(prodNum)
-
- // When an alternative is empty, `n` will be 0, and `handle` will be empty slice.
- n := a.gram.AlternativeSymbolCount(prodNum)
- handle := a.semStack.pop(n)
-
- var astAct []int
- if !a.disableASTAction {
- astAct = a.gram.ASTAction(prodNum)
- }
- var children []SyntaxTreeNode
- if astAct != nil {
- // Count the number of children in advance to avoid frequent growth in a slice for children.
- {
- l := 0
- for _, e := range astAct {
- if e > 0 {
- l++
- } else {
- offset := e*-1 - 1
- l += handle[offset].ChildCount()
- }
- }
-
- children = make([]SyntaxTreeNode, l)
- }
-
- p := 0
- for _, e := range astAct {
- if e > 0 {
- offset := e - 1
- children[p] = handle[offset]
- p++
- } else {
- offset := e*-1 - 1
- for _, c := range handle[offset].ExpandChildren() {
- children[p] = c
- p++
- }
- }
- }
- } else {
- // If an alternative has no AST action, a driver generates
- // a node with the same structure as a CST.
- children = handle
- }
-
- a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children))
-}
-
-// Accept is a implementation of SemanticActionSet.Accept method.
-func (a *SyntaxTreeActionSet) Accept() {
- top := a.semStack.pop(1)
- a.builder.Accept(top[0])
-}
-
-// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method.
-func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) {
- a.semStack.pop(popped)
- a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error())))
-}
-
-// MissError is a implementation of SemanticActionSet.MissError method.
-func (a *SyntaxTreeActionSet) MissError(cause VToken) {
-}
-
-func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int {
- if tok.EOF() {
- return a.gram.EOF()
- }
-
- return tok.TerminalID()
-}
-
-type semanticStack struct {
- frames []SyntaxTreeNode
-}
-
-func newSemanticStack() *semanticStack {
- return &semanticStack{
- frames: make([]SyntaxTreeNode, 0, 100),
- }
-}
-
-func (s *semanticStack) push(f SyntaxTreeNode) {
- s.frames = append(s.frames, f)
-}
-
-func (s *semanticStack) pop(n int) []SyntaxTreeNode {
- fs := s.frames[len(s.frames)-n:]
- s.frames = s.frames[:len(s.frames)-n]
-
- return fs
-}
-
-type NodeType int
-
-const (
- NodeTypeError = 0
- NodeTypeTerminal = 1
- NodeTypeNonTerminal = 2
-)
-
-// Node is a implementation of SyntaxTreeNode interface.
-type Node struct {
- Type NodeType
- KindName string
- Text string
- Row int
- Col int
- Children []*Node
-}
-
-func (n *Node) MarshalJSON() ([]byte, error) {
- switch n.Type {
- case NodeTypeError:
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- })
- case NodeTypeTerminal:
- if n.KindName == "" {
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- Text string `json:"text"`
- Row int `json:"row"`
- Col int `json:"col"`
- }{
- Type: n.Type,
- Text: n.Text,
- Row: n.Row,
- Col: n.Col,
- })
- }
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- Text string `json:"text"`
- Row int `json:"row"`
- Col int `json:"col"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- Text: n.Text,
- Row: n.Row,
- Col: n.Col,
- })
- case NodeTypeNonTerminal:
- return json.Marshal(struct {
- Type NodeType `json:"type"`
- KindName string `json:"kind_name"`
- Children []*Node `json:"children"`
- }{
- Type: n.Type,
- KindName: n.KindName,
- Children: n.Children,
- })
- default:
- return nil, fmt.Errorf("invalid node type: %v", n.Type)
- }
-}
-
-// ChildCount is a implementation of SyntaxTreeNode.ChildCount.
-func (n *Node) ChildCount() int {
- return len(n.Children)
-}
-
-// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren.
-func (n *Node) ExpandChildren() []SyntaxTreeNode {
- fs := make([]SyntaxTreeNode, len(n.Children))
- for i, n := range n.Children {
- fs[i] = n
- }
- return fs
-}
-
-// PrintTree prints a syntax tree whose root is `node`.
-func PrintTree(w io.Writer, node *Node) {
- printTree(w, node, "", "")
-}
-
-func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) {
- if node == nil {
- return
- }
-
- switch node.Type {
- case NodeTypeError:
- fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
- case NodeTypeTerminal:
- fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text))
- case NodeTypeNonTerminal:
- fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName)
-
- num := len(node.Children)
- for i, child := range node.Children {
- var line string
- if num > 1 && i < num-1 {
- line = "├─ "
- } else {
- line = "└─ "
- }
-
- var prefix string
- if i >= num-1 {
- prefix = " "
- } else {
- prefix = "│ "
- }
-
- printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
- }
- }
-}
diff --git a/src/urubu/spec/test/tree-report.json b/src/urubu/spec/tree-report.json
index c2018e5..c2018e5 100644
--- a/src/urubu/spec/test/tree-report.json
+++ b/src/urubu/spec/tree-report.json
diff --git a/src/urubu/spec/test/tree.json b/src/urubu/spec/tree.json
index f05c2f2..f05c2f2 100644
--- a/src/urubu/spec/test/tree.json
+++ b/src/urubu/spec/tree.json
diff --git a/src/urubu/spec/test/tree.vartan b/src/urubu/spec/tree.vartan
index aa8f733..aa8f733 100644
--- a/src/urubu/spec/test/tree.vartan
+++ b/src/urubu/spec/tree.vartan
diff --git a/src/urubu/tester/tester.go b/src/urubu/tester.go
index cae52b2..cae52b2 100644
--- a/src/urubu/tester/tester.go
+++ b/src/urubu/tester.go
diff --git a/src/urubu/ucd/codepoint.go b/src/urubu/ucd.go
index e9b411e..3c3da17 100644
--- a/src/urubu/ucd/codepoint.go
+++ b/src/urubu/ucd.go
@@ -1,7 +1,190 @@
-// Code generated by generator/main.go; DO NOT EDIT.
+//go:generate go run ../cmd/ucdgen/main.go
+//go:generate go fmt codepoint.go
package ucd
+import (
+ "bufio"
+ "encoding/binary"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "regexp"
+ "strings"
+)
+
+const (
+ // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+ // 3.4 Characters and Encoding
+ // > D9 Unicode codespace: A range of integers from 0 to 10FFFF16.
+ codePointMin = 0x0
+ codePointMax = 0x10FFFF
+)
+
+func NormalizeCharacterProperty(propName, propVal string) (string, error) {
+ if propName == "" {
+ propName = "gc"
+ }
+
+ name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)]
+ if !ok {
+ return "", fmt.Errorf("unsupported character property name: %v", propName)
+ }
+ props, ok := derivedCoreProperties[name]
+ if !ok {
+ return "", nil
+ }
+ var b strings.Builder
+ yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return "", fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ fmt.Fprint(&b, "[")
+ } else {
+ fmt.Fprint(&b, "[^")
+ }
+ for _, prop := range props {
+ fmt.Fprint(&b, prop)
+ }
+ fmt.Fprint(&b, "]")
+
+ return b.String(), nil
+}
+
+func IsContributoryProperty(propName string) bool {
+ if propName == "" {
+ return false
+ }
+
+ for _, p := range contributoryProperties {
+ if propName == p {
+ return true
+ }
+ }
+ return false
+}
+
+func FindCodePointRanges(propName, propVal string) ([]*CodePointRange, bool, error) {
+ if propName == "" {
+ propName = "gc"
+ }
+
+ name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property name: %v", propName)
+ }
+ switch name {
+ case "gc":
+ val, ok := generalCategoryValueAbbs[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if val == generalCategoryValueAbbs[normalizeSymbolicValue(generalCategoryDefaultValue)] {
+ var allCPs []*CodePointRange
+ if generalCategoryDefaultRange.From > codePointMin {
+ allCPs = append(allCPs, &CodePointRange{
+ From: codePointMin,
+ To: generalCategoryDefaultRange.From - 1,
+ })
+ }
+ if generalCategoryDefaultRange.To < codePointMax {
+ allCPs = append(allCPs, &CodePointRange{
+ From: generalCategoryDefaultRange.To + 1,
+ To: codePointMax,
+ })
+ }
+ for _, cp := range generalCategoryCodePoints {
+ allCPs = append(allCPs, cp...)
+ }
+ return allCPs, true, nil
+ }
+ vals, ok := compositGeneralCategories[val]
+ if !ok {
+ vals = []string{val}
+ }
+ var ranges []*CodePointRange
+ for _, v := range vals {
+ rs, ok := generalCategoryCodePoints[v]
+ if !ok {
+ return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v)
+ }
+ ranges = append(ranges, rs...)
+ }
+ return ranges, false, nil
+ case "sc":
+ val, ok := scriptValueAbbs[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if val == scriptValueAbbs[normalizeSymbolicValue(scriptDefaultValue)] {
+ var allCPs []*CodePointRange
+ if scriptDefaultRange.From > codePointMin {
+ allCPs = append(allCPs, &CodePointRange{
+ From: codePointMin,
+ To: scriptDefaultRange.From - 1,
+ })
+ }
+ if scriptDefaultRange.To < codePointMax {
+ allCPs = append(allCPs, &CodePointRange{
+ From: scriptDefaultRange.To + 1,
+ To: codePointMax,
+ })
+ }
+ for _, cp := range scriptCodepoints {
+ allCPs = append(allCPs, cp...)
+ }
+ return allCPs, true, nil
+ }
+ return scriptCodepoints[val], false, nil
+ case "oalpha":
+ yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return otherAlphabeticCodePoints, false, nil
+ } else {
+ return otherAlphabeticCodePoints, true, nil
+ }
+ case "olower":
+ yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return otherLowercaseCodePoints, false, nil
+ } else {
+ return otherLowercaseCodePoints, true, nil
+ }
+ case "oupper":
+ yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return otherUppercaseCodePoints, false, nil
+ } else {
+ return otherUppercaseCodePoints, true, nil
+ }
+ case "wspace":
+ yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return whiteSpaceCodePoints, false, nil
+ } else {
+ return whiteSpaceCodePoints, true, nil
+ }
+ }
+
+ // If the process reaches this code, it's a bug. We must handle all of the properties registered with
+ // the `propertyNameAbbs`.
+ return nil, false, fmt.Errorf("character property '%v' is unavailable", propName)
+}
+// Code generated by generator/main.go; DO NOT EDIT.
+
// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
var generalCategoryValueAbbs = map[string]string{
"c": "c",
@@ -6550,3 +6733,467 @@ var whiteSpaceCodePoints = []*CodePointRange{
&CodePointRange{From: rune(8287), To: rune(8287)},
&CodePointRange{From: rune(12288), To: rune(12288)},
}
+
// CodePointRange represents an inclusive range of Unicode code points
// [From, To]. A single code point is represented with From == To.
type CodePointRange struct {
	From rune
	To rune
}

// codePointRangeNil is the zero range returned alongside an error from
// field.codePointRange so callers always receive a non-nil pointer.
var codePointRangeNil = &CodePointRange{
	From: 0,
	To: 0,
}
+
+type field string
+
+func (f field) codePointRange() (*CodePointRange, error) {
+ var from, to rune
+ var err error
+ cp := reCodePointRange.FindStringSubmatch(string(f))
+ from, err = decodeHexToRune(cp[1])
+ if err != nil {
+ return codePointRangeNil, err
+ }
+ if cp[2] != "" {
+ to, err = decodeHexToRune(cp[2])
+ if err != nil {
+ return codePointRangeNil, err
+ }
+ } else {
+ to = from
+ }
+ return &CodePointRange{
+ From: from,
+ To: to,
+ }, nil
+}
+
// decodeHexToRune converts a hexadecimal code point string (e.g. "10FFFF")
// into a rune.
func decodeHexToRune(hexCodePoint string) (rune, error) {
	digits := hexCodePoint
	// hex.DecodeString requires an even number of digits.
	if len(digits)%2 != 0 {
		digits = "0" + digits
	}
	raw, err := hex.DecodeString(digits)
	if err != nil {
		return 0, err
	}
	// Left-pad to 4 bytes so the value can be read as a big-endian uint32.
	if len(raw) < 4 {
		padded := make([]byte, 4)
		copy(padded[4-len(raw):], raw)
		raw = padded
	}
	return rune(binary.BigEndian.Uint32(raw)), nil
}
+
// symbol returns the field's raw text.
func (f field) symbol() string {
	return string(f)
}

// normalizedSymbol returns the field's text normalized per UAX44-LM3
// (see normalizeSymbolicValue).
func (f field) normalizedSymbol() string {
	return normalizeSymbolicValue(string(f))
}
+
// symValReplacer strips the characters that UAX44-LM3 says to ignore when
// comparing symbolic values: underscores, hyphens, and spaces.
var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")

// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3.
//
// https://www.unicode.org/reports/tr44/#UAX44-LM3
func normalizeSymbolicValue(s string) string {
	v := strings.ToLower(symValReplacer.Replace(s))
	// Drop a leading "is" prefix, but keep the literal value "is" intact.
	if t := strings.TrimPrefix(v, "is"); t != v && v != "is" {
		return t
	}
	return v
}
+
var (
	// reLine splits a UCD line into its data part (group 1) and its
	// trailing "#" comment (group 2), either of which may be empty.
	reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
	// reCodePointRange matches a code point ("0041") or a code point range
	// ("0000..10FFFF"). The range separator must be two literal dots; the
	// previous pattern used unescaped dots, which matched ANY two
	// characters (e.g. "0xy1" was accepted as the range 0..1).
	reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?$`)

	// specialCommentPrefix marks comments that carry default ("@missing")
	// property values.
	specialCommentPrefix = "# @missing:"
)
+
// This parser can parse data files of Unicode Character Database (UCD).
// Specifically, it has the following two functions:
// - Converts each line of the data files into a slice of fields.
// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields.
//
// However, for practical purposes, each field needs to be analyzed more specifically.
// For instance, in UnicodeData.txt, the first field represents a range of code points,
// so it needs to be recognized as a hexadecimal string.
// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
//
// https://www.unicode.org/reports/tr44/#Format_Conventions
type parser struct {
	scanner *bufio.Scanner
	fields []field        // data fields of the current record, or nil
	defaultFields []field // fields of the current "@missing" comment, or nil
	err error             // scanner error observed when parse returns false

	// Reusable backing storage for fields/defaultFields, so parsing a line
	// does not allocate.
	fieldBuf []field
	defaultFieldBuf []field
}

// newParser returns a parser that reads UCD records from r. The field
// buffers are sized to 50 entries, which is presumably enough for every
// current UCD file — parseFields indexes into them without bounds checks.
func newParser(r io.Reader) *parser {
	return &parser{
		scanner: bufio.NewScanner(r),
		fieldBuf: make([]field, 50),
		defaultFieldBuf: make([]field, 50),
	}
}
+
+func (p *parser) parse() bool {
+ for p.scanner.Scan() {
+ p.parseRecord(p.scanner.Text())
+ if p.fields != nil || p.defaultFields != nil {
+ return true
+ }
+ }
+ p.err = p.scanner.Err()
+ return false
+}
+
+func (p *parser) parseRecord(src string) {
+ ms := reLine.FindStringSubmatch(src)
+ mFields := ms[1]
+ mComment := ms[2]
+ if mFields != "" {
+ p.fields = parseFields(p.fieldBuf, mFields)
+ } else {
+ p.fields = nil
+ }
+ if strings.HasPrefix(mComment, specialCommentPrefix) {
+ p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1))
+ } else {
+ p.defaultFields = nil
+ }
+}
+
+func parseFields(buf []field, src string) []field {
+ n := 0
+ for _, f := range strings.Split(src, ";") {
+ buf[n] = field(strings.TrimSpace(f))
+ n++
+ }
+
+ return buf[:n]
+}
+
// contributoryProperties is a set of contributory properties vartan uses internally.
// Property statuses are defined in the following table.
//
// https://unicode.org/reports/tr44/#Property_List_Table
var contributoryProperties = []string{
	"oalpha",
	"olower",
	"oupper",
}

// ContributoryProperties returns the names of the contributory properties
// supported internally. It returns a fresh copy so callers cannot mutate
// the package-level list.
func ContributoryProperties() []string {
	return append([]string(nil), contributoryProperties...)
}
+
// compositGeneralCategories expands each composite General_Category value
// into the concrete two-letter categories it comprises.
//
// https://www.unicode.org/reports/tr44/#GC_Values_Table
var compositGeneralCategories = map[string][]string{
	// Cased_Letter
	"lc": {"lu", "ll", "lt"},
	// Letter
	"l": {"lu", "ll", "lt", "lm", "lo"},
	// Mark
	// NOTE: the nonspacing-mark category is Mn ("mn"); the previous entry
	// listed the nonexistent "mm", which made every \p{M} lookup fail with
	// "invalid value of the General_Category property: mm".
	"m": {"mn", "mc", "me"},
	// Number
	"n": {"nd", "nl", "no"},
	// Punctuation
	"p": {"pc", "pd", "ps", "pi", "pe", "pf", "po"},
	// Symbol
	"s": {"sm", "sc", "sk", "so"},
	// Separator
	"z": {"zs", "zl", "zp"},
	// Other
	"c": {"cc", "cf", "cs", "co", "cn"},
}
+
// derivedCoreProperties expands each supported derived core property into
// the regexp-style character classes (\p{...}) it is derived from.
//
// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
var derivedCoreProperties = map[string][]string{
	// Alphabetic
	"alpha": {
		`\p{Lowercase=yes}`,
		`\p{Uppercase=yes}`,
		`\p{Lt}`,
		`\p{Lm}`,
		`\p{Lo}`,
		`\p{Nl}`,
		`\p{Other_Alphabetic=yes}`,
	},
	// Lowercase
	"lower": {
		`\p{Ll}`,
		`\p{Other_Lowercase=yes}`,
	},
	// Uppercase
	"upper": {
		`\p{Lu}`,
		`\p{Other_Uppercase=yes}`,
	},
}
+
// propertyNameAbbs maps each supported property name — normalized per
// UAX44-LM3, so both long and short spellings appear as keys — to its
// abbreviated form.
//
// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
var propertyNameAbbs = map[string]string{
	"generalcategory": "gc",
	"gc": "gc",
	"script": "sc",
	"sc": "sc",
	"alphabetic": "alpha",
	"alpha": "alpha",
	"otheralphabetic": "oalpha",
	"oalpha": "oalpha",
	"lowercase": "lower",
	"lower": "lower",
	"uppercase": "upper",
	"upper": "upper",
	"otherlowercase": "olower",
	"olower": "olower",
	"otheruppercase": "oupper",
	"oupper": "oupper",
	"whitespace": "wspace",
	"wspace": "wspace",
	"space": "wspace",
}
+
// binaryValues maps the symbolic spellings of a binary property value
// (normalized per UAX44-LM3) to the boolean each denotes.
//
// https://www.unicode.org/reports/tr44/#Type_Key_Table
// https://www.unicode.org/reports/tr44/#Binary_Values_Table
var binaryValues = map[string]bool{
	"yes": true,
	"y": true,
	"true": true,
	"t": true,
	"no": false,
	"n": false,
	"false": false,
	"f": false,
}
+
// PropertyValueAliases holds the tables extracted from
// PropertyValueAliases.txt.
type PropertyValueAliases struct {
	// GeneralCategory maps every symbolic name (normalized per UAX44-LM3)
	// of a General_Category value to its abbreviated form.
	GeneralCategory map[string]string
	// GeneralCategoryDefaultRange and GeneralCategoryDefaultValue record
	// the file's "@missing" annotation for General_Category.
	GeneralCategoryDefaultRange *CodePointRange
	GeneralCategoryDefaultValue string

	// Script maps every symbolic name of a Script value to its
	// abbreviated form.
	Script map[string]string
}
+
+// ParsePropertyValueAliases parses the PropertyValueAliases.txt.
+func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
+ gcAbbs := map[string]string{}
+ var defaultGCCPRange *CodePointRange
+ var defaultGCVal string
+ scAbbs := map[string]string{}
+ p := newParser(r)
+ for p.parse() {
+ // https://www.unicode.org/reports/tr44/#Property_Value_Aliases
+ // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property,
+ // > the second field specifies an abbreviated symbolic name for a value of that property, and the third
+ // > field specifies the long symbolic name for that value of that property. These are the preferred
+ // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent
+ // > fields.
+ if len(p.fields) > 0 {
+ switch p.fields[0].symbol() {
+ case "gc":
+ gcShort := p.fields[1].normalizedSymbol()
+ gcLong := p.fields[2].normalizedSymbol()
+ gcAbbs[gcShort] = gcShort
+ gcAbbs[gcLong] = gcShort
+ for _, f := range p.fields[3:] {
+ gcShortOther := f.normalizedSymbol()
+ gcAbbs[gcShortOther] = gcShort
+ }
+ case "sc":
+ scShort := p.fields[1].normalizedSymbol()
+ scLong := p.fields[2].normalizedSymbol()
+ scAbbs[scShort] = scShort
+ scAbbs[scLong] = scShort
+ for _, f := range p.fields[3:] {
+ scShortOther := f.normalizedSymbol()
+ scAbbs[scShortOther] = scShort
+ }
+ }
+ }
+
+ // https://www.unicode.org/reports/tr44/#Missing_Conventions
+ // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt.
+ // > ...
+ // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below:
+ // > 1. code_point_range; default_prop_val
+ // > 2. code_point_range; property_name; default_prop_val
+ // > ...
+ // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which
+ // > contain values associated with many properties. For example:
+ // > # @missing: 0000..10FFFF; NFD_QC; Yes
+ if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" {
+ var err error
+ defaultGCCPRange, err = p.defaultFields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ defaultGCVal = p.defaultFields[2].normalizedSymbol()
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+ return &PropertyValueAliases{
+ GeneralCategory: gcAbbs,
+ GeneralCategoryDefaultRange: defaultGCCPRange,
+ GeneralCategoryDefaultValue: defaultGCVal,
+ Script: scAbbs,
+ }, nil
+}
+
// gcAbb returns the abbreviated form of the General_Category value gc.
// An unknown value yields the empty string (the map's zero value).
func (a *PropertyValueAliases) gcAbb(gc string) string {
	return a.GeneralCategory[gc]
}
+
// PropList holds the code point ranges extracted from PropList.txt for the
// binary properties this package uses.
type PropList struct {
	OtherAlphabetic []*CodePointRange
	OtherLowercase []*CodePointRange
	OtherUppercase []*CodePointRange
	WhiteSpace []*CodePointRange
}
+
+// ParsePropList parses the PropList.txt.
+func ParsePropList(r io.Reader) (*PropList, error) {
+ var oa []*CodePointRange
+ var ol []*CodePointRange
+ var ou []*CodePointRange
+ var ws []*CodePointRange
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) == 0 {
+ continue
+ }
+
+ cp, err := p.fields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+
+ switch p.fields[1].symbol() {
+ case "Other_Alphabetic":
+ oa = append(oa, cp)
+ case "Other_Lowercase":
+ ol = append(ol, cp)
+ case "Other_Uppercase":
+ ou = append(ou, cp)
+ case "White_Space":
+ ws = append(ws, cp)
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+
+ return &PropList{
+ OtherAlphabetic: oa,
+ OtherLowercase: ol,
+ OtherUppercase: ou,
+ WhiteSpace: ws,
+ }, nil
+}
+
// Scripts holds the tables extracted from Scripts.txt.
type Scripts struct {
	// Script maps an abbreviated script name to its code point ranges.
	Script map[string][]*CodePointRange
	// ScriptDefaultRange and ScriptDefaultValue record the file's
	// "@missing" annotation.
	ScriptDefaultRange *CodePointRange
	ScriptDefaultValue string
}
+
+// ParseScripts parses the Scripts.txt.
+func ParseScripts(r io.Reader, propValAliases *PropertyValueAliases) (*Scripts, error) {
+ ss := map[string][]*CodePointRange{}
+ var defaultRange *CodePointRange
+ var defaultValue string
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) > 0 {
+ cp, err := p.fields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+
+ name, ok := propValAliases.Script[p.fields[1].normalizedSymbol()]
+ if !ok {
+ return nil, fmt.Errorf("unknown property: %v", p.fields[1].symbol())
+ }
+ ss[name] = append(ss[name], cp)
+ }
+
+ if len(p.defaultFields) > 0 {
+ var err error
+ defaultRange, err = p.defaultFields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ defaultValue = p.defaultFields[1].normalizedSymbol()
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+
+ return &Scripts{
+ Script: ss,
+ ScriptDefaultRange: defaultRange,
+ ScriptDefaultValue: defaultValue,
+ }, nil
+}
+
// UnicodeData holds the per-category code point ranges extracted from
// UnicodeData.txt.
type UnicodeData struct {
	// GeneralCategory maps an abbreviated General_Category value to the
	// code point ranges assigned to it.
	GeneralCategory map[string][]*CodePointRange

	// propValAliases resolves symbolic General_Category values to their
	// abbreviated forms (see addGC).
	propValAliases *PropertyValueAliases
}
+
+// ParseUnicodeData parses the UnicodeData.txt.
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+ unicodeData := &UnicodeData{
+ GeneralCategory: map[string][]*CodePointRange{},
+ propValAliases: propValAliases,
+ }
+
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) == 0 {
+ continue
+ }
+ cp, err := p.fields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ gc := p.fields[2].normalizedSymbol()
+ unicodeData.addGC(gc, cp)
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+
+ return unicodeData, nil
+}
+
+func (u *UnicodeData) addGC(gc string, cp *CodePointRange) {
+ // https://www.unicode.org/reports/tr44/#Empty_Fields
+ // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line
+ // > for a code point is empty, that indicates that the property takes the default value for that code point.
+ if gc == "" {
+ return
+ }
+
+ cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)]
+ if ok {
+ c := cps[len(cps)-1]
+ if cp.From-c.To == 1 {
+ c.To = cp.To
+ } else {
+ u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp)
+ }
+ } else {
+ u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp}
+ }
+}
diff --git a/src/urubu/ucd/api.go b/src/urubu/ucd/api.go
deleted file mode 100644
index 8265d54..0000000
--- a/src/urubu/ucd/api.go
+++ /dev/null
@@ -1,180 +0,0 @@
-//go:generate go run ../cmd/ucdgen/main.go
-//go:generate go fmt codepoint.go
-
-package ucd
-
-import (
- "fmt"
- "strings"
-)
-
-const (
- // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
- // 3.4 Characters and Encoding
- // > D9 Unicode codespace: A range of integers from 0 to 10FFFF16.
- codePointMin = 0x0
- codePointMax = 0x10FFFF
-)
-
-func NormalizeCharacterProperty(propName, propVal string) (string, error) {
- if propName == "" {
- propName = "gc"
- }
-
- name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)]
- if !ok {
- return "", fmt.Errorf("unsupported character property name: %v", propName)
- }
- props, ok := derivedCoreProperties[name]
- if !ok {
- return "", nil
- }
- var b strings.Builder
- yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
- if !ok {
- return "", fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if yes {
- fmt.Fprint(&b, "[")
- } else {
- fmt.Fprint(&b, "[^")
- }
- for _, prop := range props {
- fmt.Fprint(&b, prop)
- }
- fmt.Fprint(&b, "]")
-
- return b.String(), nil
-}
-
-func IsContributoryProperty(propName string) bool {
- if propName == "" {
- return false
- }
-
- for _, p := range contributoryProperties {
- if propName == p {
- return true
- }
- }
- return false
-}
-
-func FindCodePointRanges(propName, propVal string) ([]*CodePointRange, bool, error) {
- if propName == "" {
- propName = "gc"
- }
-
- name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property name: %v", propName)
- }
- switch name {
- case "gc":
- val, ok := generalCategoryValueAbbs[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if val == generalCategoryValueAbbs[normalizeSymbolicValue(generalCategoryDefaultValue)] {
- var allCPs []*CodePointRange
- if generalCategoryDefaultRange.From > codePointMin {
- allCPs = append(allCPs, &CodePointRange{
- From: codePointMin,
- To: generalCategoryDefaultRange.From - 1,
- })
- }
- if generalCategoryDefaultRange.To < codePointMax {
- allCPs = append(allCPs, &CodePointRange{
- From: generalCategoryDefaultRange.To + 1,
- To: codePointMax,
- })
- }
- for _, cp := range generalCategoryCodePoints {
- allCPs = append(allCPs, cp...)
- }
- return allCPs, true, nil
- }
- vals, ok := compositGeneralCategories[val]
- if !ok {
- vals = []string{val}
- }
- var ranges []*CodePointRange
- for _, v := range vals {
- rs, ok := generalCategoryCodePoints[v]
- if !ok {
- return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v)
- }
- ranges = append(ranges, rs...)
- }
- return ranges, false, nil
- case "sc":
- val, ok := scriptValueAbbs[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if val == scriptValueAbbs[normalizeSymbolicValue(scriptDefaultValue)] {
- var allCPs []*CodePointRange
- if scriptDefaultRange.From > codePointMin {
- allCPs = append(allCPs, &CodePointRange{
- From: codePointMin,
- To: scriptDefaultRange.From - 1,
- })
- }
- if scriptDefaultRange.To < codePointMax {
- allCPs = append(allCPs, &CodePointRange{
- From: scriptDefaultRange.To + 1,
- To: codePointMax,
- })
- }
- for _, cp := range scriptCodepoints {
- allCPs = append(allCPs, cp...)
- }
- return allCPs, true, nil
- }
- return scriptCodepoints[val], false, nil
- case "oalpha":
- yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if yes {
- return otherAlphabeticCodePoints, false, nil
- } else {
- return otherAlphabeticCodePoints, true, nil
- }
- case "olower":
- yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if yes {
- return otherLowercaseCodePoints, false, nil
- } else {
- return otherLowercaseCodePoints, true, nil
- }
- case "oupper":
- yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if yes {
- return otherUppercaseCodePoints, false, nil
- } else {
- return otherUppercaseCodePoints, true, nil
- }
- case "wspace":
- yes, ok := binaryValues[normalizeSymbolicValue(propVal)]
- if !ok {
- return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
- }
- if yes {
- return whiteSpaceCodePoints, false, nil
- } else {
- return whiteSpaceCodePoints, true, nil
- }
- }
-
- // If the process reaches this code, it's a bug. We must handle all of the properties registered with
- // the `propertyNameAbbs`.
- return nil, false, fmt.Errorf("character property '%v' is unavailable", propName)
-}
diff --git a/src/urubu/ucd/codepoint.go.tmpl b/src/urubu/ucd/codepoint.go.tmpl
deleted file mode 100644
index cc0d48e..0000000
--- a/src/urubu/ucd/codepoint.go.tmpl
+++ /dev/null
@@ -1,65 +0,0 @@
-// Code generated by {{ .GeneratorName }}; DO NOT EDIT.
-
-package ucd
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
-var generalCategoryValueAbbs = map[string]string{ {{ range $long, $abb := .PropertyValueAliases.GeneralCategory }}
- "{{ $long }}": "{{ $abb }}",{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
-var scriptValueAbbs = map[string]string{ {{ range $long, $abb := .PropertyValueAliases.Script }}
- "{{ $long }}": "{{ $abb }}",{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
-var (
- generalCategoryDefaultRange = &CodePointRange{
- From: rune({{ .PropertyValueAliases.GeneralCategoryDefaultRange.From }}),
- To: rune({{ .PropertyValueAliases.GeneralCategoryDefaultRange.To }}),
- }
- generalCategoryDefaultValue = "{{ .PropertyValueAliases.GeneralCategoryDefaultValue }}"
-)
-
-// https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
-var generalCategoryCodePoints = map[string][]*CodePointRange{ {{ range $propName, $codePoints := .UnicodeData.GeneralCategory }}
- "{{ $propName }}": { {{ range $codePoints }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
- },{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt
-var (
- scriptDefaultRange = &CodePointRange{
- From: rune({{ .Scripts.ScriptDefaultRange.From }}),
- To: rune({{ .Scripts.ScriptDefaultRange.To }}),
- }
- scriptDefaultValue = "{{ .Scripts.ScriptDefaultValue }}"
-)
-
-// https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt
-var scriptCodepoints = map[string][]*CodePointRange{ {{ range $script, $codePoints := .Scripts.Script }}
- "{{ $script }}": { {{ range $codePoints }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
- },{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
-var otherAlphabeticCodePoints = []*CodePointRange{ {{ range .PropList.OtherAlphabetic }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
-var otherLowercaseCodePoints = []*CodePointRange{ {{ range .PropList.OtherLowercase }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
-var otherUppercaseCodePoints = []*CodePointRange{ {{ range .PropList.OtherUppercase }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
-var whiteSpaceCodePoints = []*CodePointRange{ {{ range .PropList.WhiteSpace }}
- &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
-}
diff --git a/src/urubu/ucd/parser.go b/src/urubu/ucd/parser.go
deleted file mode 100644
index 88d7134..0000000
--- a/src/urubu/ucd/parser.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package ucd
-
-import (
- "bufio"
- "encoding/binary"
- "encoding/hex"
- "io"
- "regexp"
- "strings"
-)
-
-type CodePointRange struct {
- From rune
- To rune
-}
-
-var codePointRangeNil = &CodePointRange{
- From: 0,
- To: 0,
-}
-
-type field string
-
-func (f field) codePointRange() (*CodePointRange, error) {
- var from, to rune
- var err error
- cp := reCodePointRange.FindStringSubmatch(string(f))
- from, err = decodeHexToRune(cp[1])
- if err != nil {
- return codePointRangeNil, err
- }
- if cp[2] != "" {
- to, err = decodeHexToRune(cp[2])
- if err != nil {
- return codePointRangeNil, err
- }
- } else {
- to = from
- }
- return &CodePointRange{
- From: from,
- To: to,
- }, nil
-}
-
-func decodeHexToRune(hexCodePoint string) (rune, error) {
- h := hexCodePoint
- if len(h)%2 != 0 {
- h = "0" + h
- }
- b, err := hex.DecodeString(h)
- if err != nil {
- return 0, err
- }
- l := len(b)
- for i := 0; i < 4-l; i++ {
- b = append([]byte{0}, b...)
- }
- n := binary.BigEndian.Uint32(b)
- return rune(n), nil
-}
-
-func (f field) symbol() string {
- return string(f)
-}
-
-func (f field) normalizedSymbol() string {
- return normalizeSymbolicValue(string(f))
-}
-
-var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")
-
-// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3.
-//
-// https://www.unicode.org/reports/tr44/#UAX44-LM3
-func normalizeSymbolicValue(s string) string {
- v := strings.ToLower(symValReplacer.Replace(s))
- if strings.HasPrefix(v, "is") && v != "is" {
- return v[2:]
- }
- return v
-}
-
-var (
- reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
- reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`)
-
- specialCommentPrefix = "# @missing:"
-)
-
-// This parser can parse data files of Unicode Character Database (UCD).
-// Specifically, it has the following two functions:
-// - Converts each line of the data files into a slice of fields.
-// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields.
-//
-// However, for practical purposes, each field needs to be analyzed more specifically.
-// For instance, in UnicodeData.txt, the first field represents a range of code points,
-// so it needs to be recognized as a hexadecimal string.
-// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
-//
-// https://www.unicode.org/reports/tr44/#Format_Conventions
-type parser struct {
- scanner *bufio.Scanner
- fields []field
- defaultFields []field
- err error
-
- fieldBuf []field
- defaultFieldBuf []field
-}
-
-func newParser(r io.Reader) *parser {
- return &parser{
- scanner: bufio.NewScanner(r),
- fieldBuf: make([]field, 50),
- defaultFieldBuf: make([]field, 50),
- }
-}
-
-func (p *parser) parse() bool {
- for p.scanner.Scan() {
- p.parseRecord(p.scanner.Text())
- if p.fields != nil || p.defaultFields != nil {
- return true
- }
- }
- p.err = p.scanner.Err()
- return false
-}
-
-func (p *parser) parseRecord(src string) {
- ms := reLine.FindStringSubmatch(src)
- mFields := ms[1]
- mComment := ms[2]
- if mFields != "" {
- p.fields = parseFields(p.fieldBuf, mFields)
- } else {
- p.fields = nil
- }
- if strings.HasPrefix(mComment, specialCommentPrefix) {
- p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1))
- } else {
- p.defaultFields = nil
- }
-}
-
-func parseFields(buf []field, src string) []field {
- n := 0
- for _, f := range strings.Split(src, ";") {
- buf[n] = field(strings.TrimSpace(f))
- n++
- }
-
- return buf[:n]
-}
diff --git a/src/urubu/ucd/prop_list.go b/src/urubu/ucd/prop_list.go
deleted file mode 100644
index 31db70c..0000000
--- a/src/urubu/ucd/prop_list.go
+++ /dev/null
@@ -1,50 +0,0 @@
-package ucd
-
-import "io"
-
-type PropList struct {
- OtherAlphabetic []*CodePointRange
- OtherLowercase []*CodePointRange
- OtherUppercase []*CodePointRange
- WhiteSpace []*CodePointRange
-}
-
-// ParsePropList parses the PropList.txt.
-func ParsePropList(r io.Reader) (*PropList, error) {
- var oa []*CodePointRange
- var ol []*CodePointRange
- var ou []*CodePointRange
- var ws []*CodePointRange
- p := newParser(r)
- for p.parse() {
- if len(p.fields) == 0 {
- continue
- }
-
- cp, err := p.fields[0].codePointRange()
- if err != nil {
- return nil, err
- }
-
- switch p.fields[1].symbol() {
- case "Other_Alphabetic":
- oa = append(oa, cp)
- case "Other_Lowercase":
- ol = append(ol, cp)
- case "Other_Uppercase":
- ou = append(ou, cp)
- case "White_Space":
- ws = append(ws, cp)
- }
- }
- if p.err != nil {
- return nil, p.err
- }
-
- return &PropList{
- OtherAlphabetic: oa,
- OtherLowercase: ol,
- OtherUppercase: ou,
- WhiteSpace: ws,
- }, nil
-}
diff --git a/src/urubu/ucd/property.go b/src/urubu/ucd/property.go
deleted file mode 100644
index ba60e80..0000000
--- a/src/urubu/ucd/property.go
+++ /dev/null
@@ -1,95 +0,0 @@
-package ucd
-
-// contributoryProperties is a set of contributory properties vartan uses internally.
-// Property statuses are defined in the following table.
-//
-// https://unicode.org/reports/tr44/#Property_List_Table
-var contributoryProperties = []string{
- "oalpha",
- "olower",
- "oupper",
-}
-
-func ContributoryProperties() []string {
- return contributoryProperties
-}
-
-// https://www.unicode.org/reports/tr44/#GC_Values_Table
-var compositGeneralCategories = map[string][]string{
- // Cased_Letter
- "lc": {"lu", "ll", "lt"},
- // Letter
- "l": {"lu", "ll", "lt", "lm", "lo"},
- // Mark
- "m": {"mm", "mc", "me"},
- // Number
- "n": {"nd", "nl", "no"},
- // Punctuation
- "p": {"pc", "pd", "ps", "pi", "pe", "pf", "po"},
- // Symbol
- "s": {"sm", "sc", "sk", "so"},
- // Separator
- "z": {"zs", "zl", "zp"},
- // Other
- "c": {"cc", "cf", "cs", "co", "cn"},
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
-var derivedCoreProperties = map[string][]string{
- // Alphabetic
- "alpha": {
- `\p{Lowercase=yes}`,
- `\p{Uppercase=yes}`,
- `\p{Lt}`,
- `\p{Lm}`,
- `\p{Lo}`,
- `\p{Nl}`,
- `\p{Other_Alphabetic=yes}`,
- },
- // Lowercase
- "lower": {
- `\p{Ll}`,
- `\p{Other_Lowercase=yes}`,
- },
- // Uppercase
- "upper": {
- `\p{Lu}`,
- `\p{Other_Uppercase=yes}`,
- },
-}
-
-// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
-var propertyNameAbbs = map[string]string{
- "generalcategory": "gc",
- "gc": "gc",
- "script": "sc",
- "sc": "sc",
- "alphabetic": "alpha",
- "alpha": "alpha",
- "otheralphabetic": "oalpha",
- "oalpha": "oalpha",
- "lowercase": "lower",
- "lower": "lower",
- "uppercase": "upper",
- "upper": "upper",
- "otherlowercase": "olower",
- "olower": "olower",
- "otheruppercase": "oupper",
- "oupper": "oupper",
- "whitespace": "wspace",
- "wspace": "wspace",
- "space": "wspace",
-}
-
-// https://www.unicode.org/reports/tr44/#Type_Key_Table
-// https://www.unicode.org/reports/tr44/#Binary_Values_Table
-var binaryValues = map[string]bool{
- "yes": true,
- "y": true,
- "true": true,
- "t": true,
- "no": false,
- "n": false,
- "false": false,
- "f": false,
-}
diff --git a/src/urubu/ucd/property_value_aliases.go b/src/urubu/ucd/property_value_aliases.go
deleted file mode 100644
index 4bc69db..0000000
--- a/src/urubu/ucd/property_value_aliases.go
+++ /dev/null
@@ -1,82 +0,0 @@
-package ucd
-
-import "io"
-
-type PropertyValueAliases struct {
- GeneralCategory map[string]string
- GeneralCategoryDefaultRange *CodePointRange
- GeneralCategoryDefaultValue string
-
- Script map[string]string
-}
-
-// ParsePropertyValueAliases parses the PropertyValueAliases.txt.
-func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
- gcAbbs := map[string]string{}
- var defaultGCCPRange *CodePointRange
- var defaultGCVal string
- scAbbs := map[string]string{}
- p := newParser(r)
- for p.parse() {
- // https://www.unicode.org/reports/tr44/#Property_Value_Aliases
- // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property,
- // > the second field specifies an abbreviated symbolic name for a value of that property, and the third
- // > field specifies the long symbolic name for that value of that property. These are the preferred
- // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent
- // > fields.
- if len(p.fields) > 0 {
- switch p.fields[0].symbol() {
- case "gc":
- gcShort := p.fields[1].normalizedSymbol()
- gcLong := p.fields[2].normalizedSymbol()
- gcAbbs[gcShort] = gcShort
- gcAbbs[gcLong] = gcShort
- for _, f := range p.fields[3:] {
- gcShortOther := f.normalizedSymbol()
- gcAbbs[gcShortOther] = gcShort
- }
- case "sc":
- scShort := p.fields[1].normalizedSymbol()
- scLong := p.fields[2].normalizedSymbol()
- scAbbs[scShort] = scShort
- scAbbs[scLong] = scShort
- for _, f := range p.fields[3:] {
- scShortOther := f.normalizedSymbol()
- scAbbs[scShortOther] = scShort
- }
- }
- }
-
- // https://www.unicode.org/reports/tr44/#Missing_Conventions
- // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt.
- // > ...
- // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below:
- // > 1. code_point_range; default_prop_val
- // > 2. code_point_range; property_name; default_prop_val
- // > ...
- // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which
- // > contain values associated with many properties. For example:
- // > # @missing: 0000..10FFFF; NFD_QC; Yes
- if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" {
- var err error
- defaultGCCPRange, err = p.defaultFields[0].codePointRange()
- if err != nil {
- return nil, err
- }
- defaultGCVal = p.defaultFields[2].normalizedSymbol()
- }
- }
- if p.err != nil {
- return nil, p.err
- }
- return &PropertyValueAliases{
- GeneralCategory: gcAbbs,
- GeneralCategoryDefaultRange: defaultGCCPRange,
- GeneralCategoryDefaultValue: defaultGCVal,
- Script: scAbbs,
- }, nil
-}
-
-func (a *PropertyValueAliases) gcAbb(gc string) string {
- return a.GeneralCategory[gc]
-}
diff --git a/src/urubu/ucd/scripts.go b/src/urubu/ucd/scripts.go
deleted file mode 100644
index 5040283..0000000
--- a/src/urubu/ucd/scripts.go
+++ /dev/null
@@ -1,52 +0,0 @@
-package ucd
-
-import (
- "fmt"
- "io"
-)
-
-type Scripts struct {
- Script map[string][]*CodePointRange
- ScriptDefaultRange *CodePointRange
- ScriptDefaultValue string
-}
-
-// ParseScripts parses the Scripts.txt.
-func ParseScripts(r io.Reader, propValAliases *PropertyValueAliases) (*Scripts, error) {
- ss := map[string][]*CodePointRange{}
- var defaultRange *CodePointRange
- var defaultValue string
- p := newParser(r)
- for p.parse() {
- if len(p.fields) > 0 {
- cp, err := p.fields[0].codePointRange()
- if err != nil {
- return nil, err
- }
-
- name, ok := propValAliases.Script[p.fields[1].normalizedSymbol()]
- if !ok {
- return nil, fmt.Errorf("unknown property: %v", p.fields[1].symbol())
- }
- ss[name] = append(ss[name], cp)
- }
-
- if len(p.defaultFields) > 0 {
- var err error
- defaultRange, err = p.defaultFields[0].codePointRange()
- if err != nil {
- return nil, err
- }
- defaultValue = p.defaultFields[1].normalizedSymbol()
- }
- }
- if p.err != nil {
- return nil, p.err
- }
-
- return &Scripts{
- Script: ss,
- ScriptDefaultRange: defaultRange,
- ScriptDefaultValue: defaultValue,
- }, nil
-}
diff --git a/src/urubu/ucd/unicode_data.go b/src/urubu/ucd/unicode_data.go
deleted file mode 100644
index e2a8e87..0000000
--- a/src/urubu/ucd/unicode_data.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package ucd
-
-import "io"
-
-type UnicodeData struct {
- GeneralCategory map[string][]*CodePointRange
-
- propValAliases *PropertyValueAliases
-}
-
-// ParseUnicodeData parses the UnicodeData.txt.
-func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
- unicodeData := &UnicodeData{
- GeneralCategory: map[string][]*CodePointRange{},
- propValAliases: propValAliases,
- }
-
- p := newParser(r)
- for p.parse() {
- if len(p.fields) == 0 {
- continue
- }
- cp, err := p.fields[0].codePointRange()
- if err != nil {
- return nil, err
- }
- gc := p.fields[2].normalizedSymbol()
- unicodeData.addGC(gc, cp)
- }
- if p.err != nil {
- return nil, p.err
- }
-
- return unicodeData, nil
-}
-
-func (u *UnicodeData) addGC(gc string, cp *CodePointRange) {
- // https://www.unicode.org/reports/tr44/#Empty_Fields
- // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line
- // > for a code point is empty, that indicates that the property takes the default value for that code point.
- if gc == "" {
- return
- }
-
- cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)]
- if ok {
- c := cps[len(cps)-1]
- if cp.From-c.To == 1 {
- c.To = cp.To
- } else {
- u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp)
- }
- } else {
- u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp}
- }
-}
diff --git a/src/urubu/utf8/utf8.go b/src/urubu/utf8.go
index 4f52bd4..4f52bd4 100644
--- a/src/urubu/utf8/utf8.go
+++ b/src/urubu/utf8.go