diff options
author | EuAndreh <eu@euandre.org> | 2024-12-11 16:26:16 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2024-12-11 16:26:21 -0300 |
commit | 38f4733efc322f10eda7800259a95d682139592a (patch) | |
tree | e34e46a0ff17218fedd724e3ebd8eff477190fa3 | |
parent | rm LICENSE (diff) | |
download | urubu-38f4733efc322f10eda7800259a95d682139592a.tar.gz urubu-38f4733efc322f10eda7800259a95d682139592a.tar.xz |
Consolidate packages spread across multiple files into single one
61 files changed, 10003 insertions, 10246 deletions
@@ -125,11 +125,11 @@ $(libs.a): src/$(NAME).go src/version.go $(existing.a): - go tool compile -I src -o $@ -p `echo $* | sed 's,^src/,,'` `find $*/*.go` + go tool compile -I src -o $@ -p `echo $* | sed 's,^src/,,'` $*.go $(xtests.a): p="`echo $* | sed 's,^tests/unit/,urubu/,'`"; \ - go tool compile -I src -o $@ -p $$p src/$$p/*.go $*/*.go + go tool compile -I src -o $@ -p $$p src/$$p.go $*/*.go $(xmains.a): go tool compile -I src -o $@ -p main $*/*.go @@ -100,21 +100,21 @@ src/urubu/cmd/vartan-go.bin: src/urubu/cmd/vartan-go.a -src/urubu/compressor.a: src/urubu/compressor/compressor.go -src/urubu/error.a: src/urubu/error/error.go -src/urubu/grammar/symbol.a: src/urubu/grammar/symbol/symbol.go -src/urubu/spec/grammar.a: src/urubu/spec/grammar/description.go src/urubu/spec/grammar/grammar.go src/urubu/spec/grammar/util.go -src/urubu/spec/test.a: src/urubu/spec/test/parser.go src/urubu/spec/test/tree_lexer.go src/urubu/spec/test/tree_parser.go src/urubu/spec/test/tree_semantic_action.go -src/urubu/ucd.a: src/urubu/ucd/api.go src/urubu/ucd/codepoint.go src/urubu/ucd/parser.go src/urubu/ucd/property.go src/urubu/ucd/property_value_aliases.go src/urubu/ucd/prop_list.go src/urubu/ucd/scripts.go src/urubu/ucd/unicode_data.go -src/urubu/utf8.a: src/urubu/utf8/utf8.go -src/urubu/spec/grammar/parser.a: src/urubu/spec/grammar/parser/lexer.go src/urubu/spec/grammar/parser/parser.go src/urubu/spec/grammar/parser/syntax_error.go src/urubu/spec/grammar/parser/vartan_lexer.go -src/urubu/grammar.a: src/urubu/grammar/first.go src/urubu/grammar/grammar.go src/urubu/grammar/item.go src/urubu/grammar/lalr1.go src/urubu/grammar/lr0.go src/urubu/grammar/parsing_table.go src/urubu/grammar/production.go src/urubu/grammar/semantic_error.go -src/urubu/tester.a: src/urubu/tester/tester.go -src/urubu/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa/dfa.go src/urubu/grammar/lexical/dfa/symbol_position.go src/urubu/grammar/lexical/dfa/tree.go -src/urubu/grammar/lexical/parser.a: 
src/urubu/grammar/lexical/parser/error.go src/urubu/grammar/lexical/parser/fragment.go src/urubu/grammar/lexical/parser/lexer.go src/urubu/grammar/lexical/parser/parser.go src/urubu/grammar/lexical/parser/tree.go -src/urubu/grammar/lexical.a: src/urubu/grammar/lexical/compiler.go src/urubu/grammar/lexical/entry.go -src/urubu/driver/lexer.a: src/urubu/driver/lexer/lexer.go src/urubu/driver/lexer/spec.go src/urubu/driver/lexer/spec.go -src/urubu/driver/parser.a: src/urubu/driver/parser/parser.go src/urubu/driver/parser/semantic_action.go src/urubu/driver/parser/spec.go src/urubu/driver/parser/template.go src/urubu/driver/parser/token_stream.go +src/urubu/compressor.a: src/urubu/compressor.go +src/urubu/error.a: src/urubu/error.go +src/urubu/grammar/symbol.a: src/urubu/grammar/symbol.go +src/urubu/spec/grammar.a: src/urubu/spec/grammar.go +src/urubu/spec/test.a: src/urubu/spec/test.go +src/urubu/ucd.a: src/urubu/ucd.go +src/urubu/utf8.a: src/urubu/utf8.go +src/urubu/spec/grammar/parser.a: src/urubu/spec/grammar/parser.go +src/urubu/grammar.a: src/urubu/grammar.go +src/urubu/tester.a: src/urubu/tester.go +src/urubu/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa.go +src/urubu/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser.go +src/urubu/grammar/lexical.a: src/urubu/grammar/lexical.go +src/urubu/driver/lexer.a: src/urubu/driver/lexer.go +src/urubu/driver/parser.a: src/urubu/driver/parser.go src/urubu/cmd/ucdgen.a: src/urubu/cmd/ucdgen/main.go src/urubu/cmd/vartan.a: src/urubu/cmd/vartan/compile.go src/urubu/cmd/vartan/main.go src/urubu/cmd/vartan/parse.go src/urubu/cmd/vartan/root.go src/urubu/cmd/vartan/show.go src/urubu/cmd/vartan/test.go @@ -122,15 +122,15 @@ src/urubu/cmd/vartan-go.a: src/urubu/cmd/vartan-go/generate.go src/urubu/cmd/var -tests/unit/compressor.a: src/urubu/compressor/compressor.go tests/unit/compressor/compressor_test.go -tests/unit/grammar/symbol.a: src/urubu/grammar/symbol/symbol.go tests/unit/grammar/symbol/symbol_test.go 
-tests/unit/spec/test.a: src/urubu/spec/test/parser.go src/urubu/spec/test/tree_lexer.go src/urubu/spec/test/tree_parser.go src/urubu/spec/test/tree_semantic_action.go tests/unit/spec/test/parser_test.go -tests/unit/utf8.a: src/urubu/utf8/utf8.go tests/unit/utf8/utf8_test.go -tests/unit/spec/grammar/parser.a: src/urubu/spec/grammar/parser/lexer.go src/urubu/spec/grammar/parser/parser.go src/urubu/spec/grammar/parser/syntax_error.go src/urubu/spec/grammar/parser/vartan_lexer.go tests/unit/spec/grammar/parser/lexer_test.go tests/unit/spec/grammar/parser/parser_test.go -tests/unit/grammar.a: src/urubu/grammar/first.go src/urubu/grammar/grammar.go src/urubu/grammar/item.go src/urubu/grammar/lalr1.go src/urubu/grammar/lr0.go src/urubu/grammar/parsing_table.go src/urubu/grammar/production.go src/urubu/grammar/semantic_error.go tests/unit/grammar/first_test.go tests/unit/grammar/grammar_test.go tests/unit/grammar/lalr1_test.go tests/unit/grammar/lr0_test.go tests/unit/grammar/parsing_table_test.go tests/unit/grammar/test_helper_test.go -tests/unit/tester.a: src/urubu/tester/tester.go tests/unit/tester/tester_test.go -tests/unit/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa/dfa.go src/urubu/grammar/lexical/dfa/symbol_position.go src/urubu/grammar/lexical/dfa/tree.go tests/unit/grammar/lexical/dfa/dfa_test.go tests/unit/grammar/lexical/dfa/symbol_position_test.go tests/unit/grammar/lexical/dfa/tree_test.go -tests/unit/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser/error.go src/urubu/grammar/lexical/parser/fragment.go src/urubu/grammar/lexical/parser/lexer.go src/urubu/grammar/lexical/parser/parser.go src/urubu/grammar/lexical/parser/tree.go tests/unit/grammar/lexical/parser/lexer_test.go tests/unit/grammar/lexical/parser/parser_test.go -tests/unit/grammar/lexical.a: src/urubu/grammar/lexical/compiler.go src/urubu/grammar/lexical/entry.go tests/unit/grammar/lexical/compiler_test.go -tests/unit/driver/lexer.a: src/urubu/driver/lexer/lexer.go 
src/urubu/driver/lexer/spec.go src/urubu/driver/lexer/spec.go tests/unit/driver/lexer/lexer_test.go -tests/unit/driver/parser.a: src/urubu/driver/parser/parser.go src/urubu/driver/parser/semantic_action.go src/urubu/driver/parser/spec.go src/urubu/driver/parser/template.go src/urubu/driver/parser/token_stream.go tests/unit/driver/parser/conflict_test.go tests/unit/driver/parser/lac_test.go tests/unit/driver/parser/parser_test.go tests/unit/driver/parser/semantic_action_test.go tests/unit/driver/parser/syntax_error_test.go +tests/unit/compressor.a: src/urubu/compressor.go tests/unit/compressor/compressor_test.go +tests/unit/grammar/symbol.a: src/urubu/grammar/symbol.go tests/unit/grammar/symbol/symbol_test.go +tests/unit/spec/test.a: src/urubu/spec/test.go tests/unit/spec/test/parser_test.go +tests/unit/utf8.a: src/urubu/utf8.go tests/unit/utf8/utf8_test.go +tests/unit/spec/grammar/parser.a: src/urubu/spec/grammar/parser.go tests/unit/spec/grammar/parser/lexer_test.go tests/unit/spec/grammar/parser/parser_test.go +tests/unit/grammar.a: src/urubu/grammar.go tests/unit/grammar/first_test.go tests/unit/grammar/grammar_test.go tests/unit/grammar/lalr1_test.go tests/unit/grammar/lr0_test.go tests/unit/grammar/parsing_table_test.go tests/unit/grammar/test_helper_test.go +tests/unit/tester.a: src/urubu/tester.go tests/unit/tester/tester_test.go +tests/unit/grammar/lexical/dfa.a: src/urubu/grammar/lexical/dfa.go tests/unit/grammar/lexical/dfa/dfa_test.go tests/unit/grammar/lexical/dfa/symbol_position_test.go tests/unit/grammar/lexical/dfa/tree_test.go +tests/unit/grammar/lexical/parser.a: src/urubu/grammar/lexical/parser.go tests/unit/grammar/lexical/parser/lexer_test.go tests/unit/grammar/lexical/parser/parser_test.go +tests/unit/grammar/lexical.a: src/urubu/grammar/lexical.go tests/unit/grammar/lexical/compiler_test.go +tests/unit/driver/lexer.a: src/urubu/driver/lexer.go tests/unit/driver/lexer/lexer_test.go +tests/unit/driver/parser.a: src/urubu/driver/parser.go 
tests/unit/driver/parser/conflict_test.go tests/unit/driver/parser/lac_test.go tests/unit/driver/parser/parser_test.go tests/unit/driver/parser/semantic_action_test.go tests/unit/driver/parser/syntax_error_test.go diff --git a/src/urubu/compressor/compressor.go b/src/urubu/compressor.go index cdfeacb..cdfeacb 100644 --- a/src/urubu/compressor/compressor.go +++ b/src/urubu/compressor.go diff --git a/src/urubu/driver/lexer/template.go b/src/urubu/driver/lexer.go index 35dfd93..7423668 100644 --- a/src/urubu/driver/lexer/template.go +++ b/src/urubu/driver/lexer.go @@ -8,6 +8,7 @@ import ( "go/format" "go/parser" "go/token" + "io" "strings" "text/template" @@ -15,6 +16,403 @@ import ( spec "urubu/spec/grammar" ) +type ModeID int + +func (id ModeID) Int() int { + return int(id) +} + +type StateID int + +func (id StateID) Int() int { + return int(id) +} + +type KindID int + +func (id KindID) Int() int { + return int(id) +} + +type ModeKindID int + +func (id ModeKindID) Int() int { + return int(id) +} + +type LexSpec interface { + InitialMode() ModeID + Pop(mode ModeID, modeKind ModeKindID) bool + Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) + ModeName(mode ModeID) string + InitialState(mode ModeID) StateID + NextState(mode ModeID, state StateID, v int) (StateID, bool) + Accept(mode ModeID, state StateID) (ModeKindID, bool) + KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) +} + +// Token representes a token. +type Token struct { + // ModeID is an ID of a lex mode. + ModeID ModeID + + // KindID is an ID of a kind. This is unique among all modes. + KindID KindID + + // ModeKindID is an ID of a lexical kind. This is unique only within a mode. + // Note that you need to use KindID field if you want to identify a kind across all modes. + ModeKindID ModeKindID + + // BytePos is a byte position where a token appears. + BytePos int + + // ByteLen is a length of a token. + ByteLen int + + // Row is a row number where a token appears. 
+ Row int + + // Col is a column number where a token appears. + // Note that Col is counted in code points, not bytes. + Col int + + // Lexeme is a byte sequence matched a pattern of a lexical specification. + Lexeme []byte + + // When this field is true, it means the token is the EOF token. + EOF bool + + // When this field is true, it means the token is an error token. + Invalid bool +} + +type LexerOption func(l *Lexer) error + +// DisableModeTransition disables the active mode transition. Thus, even if the lexical specification has the push and pop +// operations, the lexer doesn't perform these operations. When the lexical specification has multiple modes, and this option is +// enabled, you need to call the Lexer.Push and Lexer.Pop methods to perform the mode transition. You can use the Lexer.Mode method +// to know the current lex mode. +func DisableModeTransition() LexerOption { + return func(l *Lexer) error { + l.passiveModeTran = true + return nil + } +} + +type lexerState struct { + srcPtr int + row int + col int +} + +type Lexer struct { + spec LexSpec + src []byte + state lexerState + lastAcceptedState lexerState + tokBuf []*Token + modeStack []ModeID + passiveModeTran bool +} + +// NewLexer returns a new lexer. +func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) { + b, err := io.ReadAll(src) + if err != nil { + return nil, err + } + l := &Lexer{ + spec: spec, + src: b, + state: lexerState{ + srcPtr: 0, + row: 0, + col: 0, + }, + lastAcceptedState: lexerState{ + srcPtr: 0, + row: 0, + col: 0, + }, + modeStack: []ModeID{ + spec.InitialMode(), + }, + passiveModeTran: false, + } + for _, opt := range opts { + err := opt(l) + if err != nil { + return nil, err + } + } + + return l, nil +} + +// Next returns a next token. 
+func (l *Lexer) Next() (*Token, error) { + if len(l.tokBuf) > 0 { + tok := l.tokBuf[0] + l.tokBuf = l.tokBuf[1:] + return tok, nil + } + + tok, err := l.nextAndTransition() + if err != nil { + return nil, err + } + if !tok.Invalid { + return tok, nil + } + errTok := tok + for { + tok, err = l.nextAndTransition() + if err != nil { + return nil, err + } + if !tok.Invalid { + break + } + errTok.ByteLen += tok.ByteLen + errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...) + } + l.tokBuf = append(l.tokBuf, tok) + + return errTok, nil +} + +func (l *Lexer) nextAndTransition() (*Token, error) { + tok, err := l.next() + if err != nil { + return nil, err + } + if tok.EOF || tok.Invalid { + return tok, nil + } + if l.passiveModeTran { + return tok, nil + } + mode := l.Mode() + if l.spec.Pop(mode, tok.ModeKindID) { + err := l.PopMode() + if err != nil { + return nil, err + } + } + if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok { + l.PushMode(mode) + } + // The checking length of the mode stack must be at after pop and push operations because those operations can be performed + // at the same time. When the mode stack has just one element and popped it, the mode stack will be temporarily emptied. + // However, since a push operation may be performed immediately after it, the lexer allows the stack to be temporarily empty. + if len(l.modeStack) == 0 { + return nil, fmt.Errorf("a mode stack must have at least one element") + } + return tok, nil +} + +func (l *Lexer) next() (*Token, error) { + mode := l.Mode() + state := l.spec.InitialState(mode) + buf := []byte{} + startPos := l.state.srcPtr + row := l.state.row + col := l.state.col + var tok *Token + for { + v, eof := l.read() + if eof { + if tok != nil { + l.revert() + return tok, nil + } + // When `buf` has unaccepted data and reads the EOF, the lexer treats the buffered data as an invalid token. 
+ if len(buf) > 0 { + return &Token{ + ModeID: mode, + ModeKindID: 0, + BytePos: startPos, + ByteLen: l.state.srcPtr - startPos, + Lexeme: buf, + Row: row, + Col: col, + Invalid: true, + }, nil + } + return &Token{ + ModeID: mode, + ModeKindID: 0, + BytePos: startPos, + Row: row, + Col: col, + EOF: true, + }, nil + } + buf = append(buf, v) + nextState, ok := l.spec.NextState(mode, state, int(v)) + if !ok { + if tok != nil { + l.revert() + return tok, nil + } + return &Token{ + ModeID: mode, + ModeKindID: 0, + BytePos: startPos, + ByteLen: l.state.srcPtr - startPos, + Lexeme: buf, + Row: row, + Col: col, + Invalid: true, + }, nil + } + state = nextState + if modeKindID, ok := l.spec.Accept(mode, state); ok { + kindID, _ := l.spec.KindIDAndName(mode, modeKindID) + tok = &Token{ + ModeID: mode, + KindID: kindID, + ModeKindID: modeKindID, + BytePos: startPos, + ByteLen: l.state.srcPtr - startPos, + Lexeme: buf, + Row: row, + Col: col, + } + l.accept() + } + } +} + +// Mode returns the current lex mode. +func (l *Lexer) Mode() ModeID { + return l.modeStack[len(l.modeStack)-1] +} + +// PushMode adds a lex mode onto the mode stack. +func (l *Lexer) PushMode(mode ModeID) { + l.modeStack = append(l.modeStack, mode) +} + +// PopMode removes a lex mode from the top of the mode stack. +func (l *Lexer) PopMode() error { + sLen := len(l.modeStack) + if sLen == 0 { + return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more") + } + l.modeStack = l.modeStack[:sLen-1] + return nil +} + +func (l *Lexer) read() (byte, bool) { + if l.state.srcPtr >= len(l.src) { + return 0, true + } + + b := l.src[l.state.srcPtr] + l.state.srcPtr++ + + // Count the token positions. + // The driver treats LF as the end of lines and counts columns in code points, not bytes. + // To count in code points, we refer to the First Byte column in the Table 3-6. + // + // Reference: + // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. 
UTF-8 Bit Distribution + if b < 128 { + // 0x0A is LF. + if b == 0x0A { + l.state.row++ + l.state.col = 0 + } else { + l.state.col++ + } + } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 { + l.state.col++ + } + + return b, false +} + +// accept saves the current state. +func (l *Lexer) accept() { + l.lastAcceptedState = l.state +} + +// revert reverts the lexer state to the last accepted state. +// +// We must not call this function consecutively. +func (l *Lexer) revert() { + l.state = l.lastAcceptedState +} + +type lexSpec struct { + spec *spec.LexicalSpec +} + +func NewLexSpec(spec *spec.LexicalSpec) *lexSpec { + return &lexSpec{ + spec: spec, + } +} + +func (s *lexSpec) InitialMode() ModeID { + return ModeID(s.spec.InitialModeID.Int()) +} + +func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool { + return s.spec.Specs[mode].Pop[modeKind] == 1 +} + +func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) { + modeID := s.spec.Specs[mode].Push[modeKind] + return ModeID(modeID.Int()), !modeID.IsNil() +} + +func (s *lexSpec) ModeName(mode ModeID) string { + return s.spec.ModeNames[mode].String() +} + +func (s *lexSpec) InitialState(mode ModeID) StateID { + return StateID(s.spec.Specs[mode].DFA.InitialStateID.Int()) +} + +func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) { + switch s.spec.CompressionLevel { + case 2: + tran := s.spec.Specs[mode].DFA.Transition + rowNum := tran.RowNums[state] + d := tran.UniqueEntries.RowDisplacement[rowNum] + if tran.UniqueEntries.Bounds[d+v] != rowNum { + return StateID(tran.UniqueEntries.EmptyValue.Int()), false + } + return StateID(tran.UniqueEntries.Entries[d+v].Int()), true + case 1: + tran := s.spec.Specs[mode].DFA.Transition + next := tran.UncompressedUniqueEntries[tran.RowNums[state]*tran.OriginalColCount+v] + if next == spec.StateIDNil { + return StateID(spec.StateIDNil.Int()), false + } + return StateID(next.Int()), true + } + + modeSpec := s.spec.Specs[mode] + next := 
modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v] + if next == spec.StateIDNil { + return StateID(spec.StateIDNil), false + } + return StateID(next.Int()), true +} + +func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) { + modeKindID := s.spec.Specs[mode].DFA.AcceptingStates[state] + return ModeKindID(modeKindID.Int()), modeKindID != spec.LexModeKindIDNil +} + +func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) { + kindID := s.spec.KindIDs[mode][modeKind] + return KindID(kindID.Int()), s.spec.KindNames[kindID].String() +} + // go:embed lexer.go var lexerCoreSrc string diff --git a/src/urubu/driver/lexer/lexer.go b/src/urubu/driver/lexer/lexer.go deleted file mode 100644 index 3f9712e..0000000 --- a/src/urubu/driver/lexer/lexer.go +++ /dev/null @@ -1,335 +0,0 @@ -package lexer - -import ( - "fmt" - "io" -) - -type ModeID int - -func (id ModeID) Int() int { - return int(id) -} - -type StateID int - -func (id StateID) Int() int { - return int(id) -} - -type KindID int - -func (id KindID) Int() int { - return int(id) -} - -type ModeKindID int - -func (id ModeKindID) Int() int { - return int(id) -} - -type LexSpec interface { - InitialMode() ModeID - Pop(mode ModeID, modeKind ModeKindID) bool - Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) - ModeName(mode ModeID) string - InitialState(mode ModeID) StateID - NextState(mode ModeID, state StateID, v int) (StateID, bool) - Accept(mode ModeID, state StateID) (ModeKindID, bool) - KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) -} - -// Token representes a token. -type Token struct { - // ModeID is an ID of a lex mode. - ModeID ModeID - - // KindID is an ID of a kind. This is unique among all modes. - KindID KindID - - // ModeKindID is an ID of a lexical kind. This is unique only within a mode. - // Note that you need to use KindID field if you want to identify a kind across all modes. 
- ModeKindID ModeKindID - - // BytePos is a byte position where a token appears. - BytePos int - - // ByteLen is a length of a token. - ByteLen int - - // Row is a row number where a token appears. - Row int - - // Col is a column number where a token appears. - // Note that Col is counted in code points, not bytes. - Col int - - // Lexeme is a byte sequence matched a pattern of a lexical specification. - Lexeme []byte - - // When this field is true, it means the token is the EOF token. - EOF bool - - // When this field is true, it means the token is an error token. - Invalid bool -} - -type LexerOption func(l *Lexer) error - -// DisableModeTransition disables the active mode transition. Thus, even if the lexical specification has the push and pop -// operations, the lexer doesn't perform these operations. When the lexical specification has multiple modes, and this option is -// enabled, you need to call the Lexer.Push and Lexer.Pop methods to perform the mode transition. You can use the Lexer.Mode method -// to know the current lex mode. -func DisableModeTransition() LexerOption { - return func(l *Lexer) error { - l.passiveModeTran = true - return nil - } -} - -type lexerState struct { - srcPtr int - row int - col int -} - -type Lexer struct { - spec LexSpec - src []byte - state lexerState - lastAcceptedState lexerState - tokBuf []*Token - modeStack []ModeID - passiveModeTran bool -} - -// NewLexer returns a new lexer. -func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) { - b, err := io.ReadAll(src) - if err != nil { - return nil, err - } - l := &Lexer{ - spec: spec, - src: b, - state: lexerState{ - srcPtr: 0, - row: 0, - col: 0, - }, - lastAcceptedState: lexerState{ - srcPtr: 0, - row: 0, - col: 0, - }, - modeStack: []ModeID{ - spec.InitialMode(), - }, - passiveModeTran: false, - } - for _, opt := range opts { - err := opt(l) - if err != nil { - return nil, err - } - } - - return l, nil -} - -// Next returns a next token. 
-func (l *Lexer) Next() (*Token, error) { - if len(l.tokBuf) > 0 { - tok := l.tokBuf[0] - l.tokBuf = l.tokBuf[1:] - return tok, nil - } - - tok, err := l.nextAndTransition() - if err != nil { - return nil, err - } - if !tok.Invalid { - return tok, nil - } - errTok := tok - for { - tok, err = l.nextAndTransition() - if err != nil { - return nil, err - } - if !tok.Invalid { - break - } - errTok.ByteLen += tok.ByteLen - errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...) - } - l.tokBuf = append(l.tokBuf, tok) - - return errTok, nil -} - -func (l *Lexer) nextAndTransition() (*Token, error) { - tok, err := l.next() - if err != nil { - return nil, err - } - if tok.EOF || tok.Invalid { - return tok, nil - } - if l.passiveModeTran { - return tok, nil - } - mode := l.Mode() - if l.spec.Pop(mode, tok.ModeKindID) { - err := l.PopMode() - if err != nil { - return nil, err - } - } - if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok { - l.PushMode(mode) - } - // The checking length of the mode stack must be at after pop and push operations because those operations can be performed - // at the same time. When the mode stack has just one element and popped it, the mode stack will be temporarily emptied. - // However, since a push operation may be performed immediately after it, the lexer allows the stack to be temporarily empty. - if len(l.modeStack) == 0 { - return nil, fmt.Errorf("a mode stack must have at least one element") - } - return tok, nil -} - -func (l *Lexer) next() (*Token, error) { - mode := l.Mode() - state := l.spec.InitialState(mode) - buf := []byte{} - startPos := l.state.srcPtr - row := l.state.row - col := l.state.col - var tok *Token - for { - v, eof := l.read() - if eof { - if tok != nil { - l.revert() - return tok, nil - } - // When `buf` has unaccepted data and reads the EOF, the lexer treats the buffered data as an invalid token. 
- if len(buf) > 0 { - return &Token{ - ModeID: mode, - ModeKindID: 0, - BytePos: startPos, - ByteLen: l.state.srcPtr - startPos, - Lexeme: buf, - Row: row, - Col: col, - Invalid: true, - }, nil - } - return &Token{ - ModeID: mode, - ModeKindID: 0, - BytePos: startPos, - Row: row, - Col: col, - EOF: true, - }, nil - } - buf = append(buf, v) - nextState, ok := l.spec.NextState(mode, state, int(v)) - if !ok { - if tok != nil { - l.revert() - return tok, nil - } - return &Token{ - ModeID: mode, - ModeKindID: 0, - BytePos: startPos, - ByteLen: l.state.srcPtr - startPos, - Lexeme: buf, - Row: row, - Col: col, - Invalid: true, - }, nil - } - state = nextState - if modeKindID, ok := l.spec.Accept(mode, state); ok { - kindID, _ := l.spec.KindIDAndName(mode, modeKindID) - tok = &Token{ - ModeID: mode, - KindID: kindID, - ModeKindID: modeKindID, - BytePos: startPos, - ByteLen: l.state.srcPtr - startPos, - Lexeme: buf, - Row: row, - Col: col, - } - l.accept() - } - } -} - -// Mode returns the current lex mode. -func (l *Lexer) Mode() ModeID { - return l.modeStack[len(l.modeStack)-1] -} - -// PushMode adds a lex mode onto the mode stack. -func (l *Lexer) PushMode(mode ModeID) { - l.modeStack = append(l.modeStack, mode) -} - -// PopMode removes a lex mode from the top of the mode stack. -func (l *Lexer) PopMode() error { - sLen := len(l.modeStack) - if sLen == 0 { - return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more") - } - l.modeStack = l.modeStack[:sLen-1] - return nil -} - -func (l *Lexer) read() (byte, bool) { - if l.state.srcPtr >= len(l.src) { - return 0, true - } - - b := l.src[l.state.srcPtr] - l.state.srcPtr++ - - // Count the token positions. - // The driver treats LF as the end of lines and counts columns in code points, not bytes. - // To count in code points, we refer to the First Byte column in the Table 3-6. - // - // Reference: - // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. 
UTF-8 Bit Distribution - if b < 128 { - // 0x0A is LF. - if b == 0x0A { - l.state.row++ - l.state.col = 0 - } else { - l.state.col++ - } - } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 { - l.state.col++ - } - - return b, false -} - -// accept saves the current state. -func (l *Lexer) accept() { - l.lastAcceptedState = l.state -} - -// revert reverts the lexer state to the last accepted state. -// -// We must not call this function consecutively. -func (l *Lexer) revert() { - l.state = l.lastAcceptedState -} diff --git a/src/urubu/driver/lexer/spec.go b/src/urubu/driver/lexer/spec.go deleted file mode 100644 index 75c74af..0000000 --- a/src/urubu/driver/lexer/spec.go +++ /dev/null @@ -1,71 +0,0 @@ -package lexer - -import spec "urubu/spec/grammar" - -type lexSpec struct { - spec *spec.LexicalSpec -} - -func NewLexSpec(spec *spec.LexicalSpec) *lexSpec { - return &lexSpec{ - spec: spec, - } -} - -func (s *lexSpec) InitialMode() ModeID { - return ModeID(s.spec.InitialModeID.Int()) -} - -func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool { - return s.spec.Specs[mode].Pop[modeKind] == 1 -} - -func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) { - modeID := s.spec.Specs[mode].Push[modeKind] - return ModeID(modeID.Int()), !modeID.IsNil() -} - -func (s *lexSpec) ModeName(mode ModeID) string { - return s.spec.ModeNames[mode].String() -} - -func (s *lexSpec) InitialState(mode ModeID) StateID { - return StateID(s.spec.Specs[mode].DFA.InitialStateID.Int()) -} - -func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) { - switch s.spec.CompressionLevel { - case 2: - tran := s.spec.Specs[mode].DFA.Transition - rowNum := tran.RowNums[state] - d := tran.UniqueEntries.RowDisplacement[rowNum] - if tran.UniqueEntries.Bounds[d+v] != rowNum { - return StateID(tran.UniqueEntries.EmptyValue.Int()), false - } - return StateID(tran.UniqueEntries.Entries[d+v].Int()), true - case 1: - tran := s.spec.Specs[mode].DFA.Transition - next := 
tran.UncompressedUniqueEntries[tran.RowNums[state]*tran.OriginalColCount+v] - if next == spec.StateIDNil { - return StateID(spec.StateIDNil.Int()), false - } - return StateID(next.Int()), true - } - - modeSpec := s.spec.Specs[mode] - next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v] - if next == spec.StateIDNil { - return StateID(spec.StateIDNil), false - } - return StateID(next.Int()), true -} - -func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) { - modeKindID := s.spec.Specs[mode].DFA.AcceptingStates[state] - return ModeKindID(modeKindID.Int()), modeKindID != spec.LexModeKindIDNil -} - -func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) { - kindID := s.spec.KindIDs[mode][modeKind] - return KindID(kindID.Int()), s.spec.KindNames[kindID].String() -} diff --git a/src/urubu/driver/parser.go b/src/urubu/driver/parser.go new file mode 100644 index 0000000..89cb240 --- /dev/null +++ b/src/urubu/driver/parser.go @@ -0,0 +1,1439 @@ +package parser + +import ( + "bytes" + _ "embed" + "encoding/json" + "fmt" + "go/ast" + "go/format" + "go/parser" + "go/token" + goToken "go/token" + "io" + "strconv" + "strings" + "text/template" + + "urubu/driver/lexer" + spec "urubu/spec/grammar" +) + +type Grammar interface { + // InitialState returns the initial state of a parser. + InitialState() int + + // StartProduction returns the start production of grammar. + StartProduction() int + + // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair. + Action(state int, terminal int) int + + // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair. + GoTo(state int, lhs int) int + + // ErrorTrapperState returns true when a state can shift the error symbol. + ErrorTrapperState(state int) bool + + // LHS returns a LHS symbol of a production. + LHS(prod int) int + + // AlternativeSymbolCount returns a symbol count of p production. 
+ AlternativeSymbolCount(prod int) int + + // RecoverProduction returns true when a production has the recover directive. + RecoverProduction(prod int) bool + + // NonTerminal retuns a string representaion of a non-terminal symbol. + NonTerminal(nonTerminal int) string + + // TerminalCount returns a terminal symbol count of grammar. + TerminalCount() int + + // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis. + SkipTerminal(terminal int) bool + + // EOF returns the EOF symbol. + EOF() int + + // Error returns the error symbol. + Error() int + + // Terminal retuns a string representaion of a terminal symbol. + Terminal(terminal int) string + + // ASTAction returns an AST action entries. + ASTAction(prod int) []int +} + +type VToken interface { + // TerminalID returns a terminal ID. + TerminalID() int + + // Lexeme returns a lexeme. + Lexeme() []byte + + // EOF returns true when a token represents EOF. + EOF() bool + + // Invalid returns true when a token is invalid. + Invalid() bool + + // BytePosition returns (position, length) pair. + // `position` is a byte position where a token appears and `length` is a length in bytes. + BytePosition() (int, int) + + // Position returns (row, column) pair. + Position() (int, int) +} + +type TokenStream interface { + Next() (VToken, error) +} + +type SyntaxError struct { + Row int + Col int + Message string + Token VToken + ExpectedTerminals []string +} + +type ParserOption func(p *Parser) error + +// DisableLAC disables LAC (lookahead correction). LAC is enabled by default. 
+func DisableLAC() ParserOption { + return func(p *Parser) error { + p.disableLAC = true + return nil + } +} + +func SemanticAction(semAct SemanticActionSet) ParserOption { + return func(p *Parser) error { + p.semAct = semAct + return nil + } +} + +type Parser struct { + toks TokenStream + gram Grammar + stateStack *stateStack + semAct SemanticActionSet + disableLAC bool + onError bool + shiftCount int + synErrs []*SyntaxError +} + +func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) { + p := &Parser{ + toks: toks, + gram: gram, + stateStack: &stateStack{}, + } + + for _, opt := range opts { + err := opt(p) + if err != nil { + return nil, err + } + } + + return p, nil +} + +func (p *Parser) Parse() error { + p.stateStack.push(p.gram.InitialState()) + tok, err := p.nextToken() + if err != nil { + return err + } + +ACTION_LOOP: + for { + act := p.lookupAction(tok) + + switch { + case act < 0: // Shift + nextState := act * -1 + + recovered := false + if p.onError { + p.shiftCount++ + + // When the parser performs shift three times, the parser recovers from the error state. 
+ if p.shiftCount >= 3 { + p.onError = false + p.shiftCount = 0 + recovered = true + } + } + + p.shift(nextState) + + if p.semAct != nil { + p.semAct.Shift(tok, recovered) + } + + tok, err = p.nextToken() + if err != nil { + return err + } + case act > 0: // Reduce + prodNum := act + + recovered := false + if p.onError && p.gram.RecoverProduction(prodNum) { + p.onError = false + p.shiftCount = 0 + recovered = true + } + + accepted := p.reduce(prodNum) + if accepted { + if p.semAct != nil { + p.semAct.Accept() + } + + return nil + } + + if p.semAct != nil { + p.semAct.Reduce(prodNum, recovered) + } + default: // Error + if p.onError { + tok, err = p.nextToken() + if err != nil { + return err + } + if tok.EOF() { + if p.semAct != nil { + p.semAct.MissError(tok) + } + + return nil + } + + continue ACTION_LOOP + } + + row, col := tok.Position() + p.synErrs = append(p.synErrs, &SyntaxError{ + Row: row, + Col: col, + Message: "unexpected token", + Token: tok, + ExpectedTerminals: p.searchLookahead(p.stateStack.top()), + }) + + count, ok := p.trapError() + if !ok { + if p.semAct != nil { + p.semAct.MissError(tok) + } + + return nil + } + + p.onError = true + p.shiftCount = 0 + + act, err := p.lookupActionOnError() + if err != nil { + return err + } + + p.shift(act * -1) + + if p.semAct != nil { + p.semAct.TrapAndShiftError(tok, count) + } + } + } +} + +// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid, +// this method returns `true`. 
+func (p *Parser) validateLookahead(term int) bool { + p.stateStack.enableExploratoryMode() + defer p.stateStack.disableExploratoryMode() + + for { + act := p.gram.Action(p.stateStack.topExploratorily(), term) + + switch { + case act < 0: // Shift + return true + case act > 0: // Reduce + prodNum := act + + lhs := p.gram.LHS(prodNum) + if lhs == p.gram.LHS(p.gram.StartProduction()) { + return true + } + n := p.gram.AlternativeSymbolCount(prodNum) + p.stateStack.popExploratorily(n) + state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs) + p.stateStack.pushExploratorily(state) + default: // Error + return false + } + } +} + +func (p *Parser) nextToken() (VToken, error) { + for { + // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0, + // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect + // a syntax error because the parser cannot find an entry corresponding to the invalid token. + tok, err := p.toks.Next() + if err != nil { + return nil, err + } + + if p.gram.SkipTerminal(tok.TerminalID()) { + continue + } + + return tok, nil + } +} + +func (p *Parser) tokenToTerminal(tok VToken) int { + if tok.EOF() { + return p.gram.EOF() + } + + return tok.TerminalID() +} + +func (p *Parser) lookupAction(tok VToken) int { + if !p.disableLAC { + term := p.tokenToTerminal(tok) + if !p.validateLookahead(term) { + return 0 + } + } + + return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok)) +} + +func (p *Parser) lookupActionOnError() (int, error) { + act := p.gram.Action(p.stateStack.top(), p.gram.Error()) + if act >= 0 { + return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error())) + } + + return act, nil +} + +func (p *Parser) shift(nextState int) { + p.stateStack.push(nextState) +} + +func (p *Parser) reduce(prodNum int) bool { + lhs := p.gram.LHS(prodNum) + if lhs == 
p.gram.LHS(p.gram.StartProduction()) { + return true + } + n := p.gram.AlternativeSymbolCount(prodNum) + p.stateStack.pop(n) + nextState := p.gram.GoTo(p.stateStack.top(), lhs) + p.stateStack.push(nextState) + return false +} + +func (p *Parser) trapError() (int, bool) { + count := 0 + for { + if p.gram.ErrorTrapperState(p.stateStack.top()) { + return count, true + } + + if p.stateStack.top() != p.gram.InitialState() { + p.stateStack.pop(1) + count++ + } else { + return 0, false + } + } +} + +func (p *Parser) SyntaxErrors() []*SyntaxError { + return p.synErrs +} + +func (p *Parser) searchLookahead(state int) []string { + kinds := []string{} + termCount := p.gram.TerminalCount() + for term := 0; term < termCount; term++ { + if p.disableLAC { + if p.gram.Action(p.stateStack.top(), term) == 0 { + continue + } + } else { + if !p.validateLookahead(term) { + continue + } + } + + // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol + // intentionally. + if term == p.gram.Error() { + continue + } + + kinds = append(kinds, p.gram.Terminal(term)) + } + + return kinds +} + +type stateStack struct { + items []int + itemsExp []int +} + +func (s *stateStack) enableExploratoryMode() { + s.itemsExp = make([]int, len(s.items)) + copy(s.itemsExp, s.items) +} + +func (s *stateStack) disableExploratoryMode() { + s.itemsExp = nil +} + +func (s *stateStack) top() int { + return s.items[len(s.items)-1] +} + +func (s *stateStack) topExploratorily() int { + return s.itemsExp[len(s.itemsExp)-1] +} + +func (s *stateStack) push(state int) { + s.items = append(s.items, state) +} + +func (s *stateStack) pushExploratorily(state int) { + s.itemsExp = append(s.itemsExp, state) +} + +func (s *stateStack) pop(n int) { + s.items = s.items[:len(s.items)-n] +} + +func (s *stateStack) popExploratorily(n int) { + s.itemsExp = s.itemsExp[:len(s.itemsExp)-n] +} + +// SemanticActionSet is a set of semantic actions a parser calls. 
+type SemanticActionSet interface { + // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol. + // When the parser recovered from an error state by shifting the token, `recovered` is true. + Shift(tok VToken, recovered bool) + + // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production. + // When the parser recovered from an error state by reducing the production, `recovered` is true. + Reduce(prodNum int, recovered bool) + + // Accept runs when the parser accepts an input. + Accept() + + // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack. + // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards + // from the state stack. + // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token + // corresponding to the error symbol doesn't exist. + TrapAndShiftError(cause VToken, popped int) + + // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error. + MissError(cause VToken) +} + +var _ SemanticActionSet = &SyntaxTreeActionSet{} + +// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface. +type SyntaxTreeNode interface { + // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast` + // directive with `...` operator. + ChildCount() int + + // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast` + // directive with `...` operator. + ExpandChildren() []SyntaxTreeNode +} + +var _ SyntaxTreeNode = &Node{} + +// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types. 
+// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface. +type SyntaxTreeBuilder interface { + Shift(kindName string, tok VToken) SyntaxTreeNode + ShiftError(kindName string) SyntaxTreeNode + Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode + Accept(f SyntaxTreeNode) +} + +var _ SyntaxTreeBuilder = &DefaultSyntaxTreeBuilder{} + +// DefaultSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder. +type DefaultSyntaxTreeBuilder struct { + tree *Node +} + +// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder. +func NewDefaultSyntaxTreeBuilder() *DefaultSyntaxTreeBuilder { + return &DefaultSyntaxTreeBuilder{} +} + +// Shift is a implementation of SyntaxTreeBuilder.Shift. +func (b *DefaultSyntaxTreeBuilder) Shift(kindName string, tok VToken) SyntaxTreeNode { + bytePos, byteLen := tok.BytePosition() + row, col := tok.Position() + return &Node{ + Type: NodeTypeTerminal, + KindName: kindName, + Text: string(tok.Lexeme()), + BytePos: bytePos, + ByteLen: byteLen, + Row: row, + Col: col, + } +} + +// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError. +func (b *DefaultSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode { + return &Node{ + Type: NodeTypeError, + KindName: kindName, + } +} + +// Reduce is a implementation of SyntaxTreeBuilder.Reduce. +func (b *DefaultSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode { + cNodes := make([]*Node, len(children)) + for i, c := range children { + cNodes[i] = c.(*Node) + } + return &Node{ + Type: NodeTypeNonTerminal, + KindName: kindName, + Children: cNodes, + } +} + +// Accept is a implementation of SyntaxTreeBuilder.Accept. +func (b *DefaultSyntaxTreeBuilder) Accept(f SyntaxTreeNode) { + b.tree = f.(*Node) +} + +// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil. 
+func (b *DefaultSyntaxTreeBuilder) Tree() *Node { + return b.tree +} + +// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree. +type SyntaxTreeActionSet struct { + gram Grammar + builder SyntaxTreeBuilder + semStack *semanticStack + disableASTAction bool +} + +// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree). +// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them. +func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { + return &SyntaxTreeActionSet{ + gram: gram, + builder: builder, + semStack: newSemanticStack(), + } +} + +// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree). +// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them. +func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { + return &SyntaxTreeActionSet{ + gram: gram, + builder: builder, + semStack: newSemanticStack(), + disableASTAction: true, + } +} + +// Shift is a implementation of SemanticActionSet.Shift method. +func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) { + term := a.tokenToTerminal(tok) + a.semStack.push(a.builder.Shift(a.gram.Terminal(term), tok)) +} + +// Reduce is a implementation of SemanticActionSet.Reduce method. +func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) { + lhs := a.gram.LHS(prodNum) + + // When an alternative is empty, `n` will be 0, and `handle` will be empty slice. + n := a.gram.AlternativeSymbolCount(prodNum) + handle := a.semStack.pop(n) + + var astAct []int + if !a.disableASTAction { + astAct = a.gram.ASTAction(prodNum) + } + var children []SyntaxTreeNode + if astAct != nil { + // Count the number of children in advance to avoid frequent growth in a slice for children. 
+ { + l := 0 + for _, e := range astAct { + if e > 0 { + l++ + } else { + offset := e*-1 - 1 + l += handle[offset].ChildCount() + } + } + + children = make([]SyntaxTreeNode, l) + } + + p := 0 + for _, e := range astAct { + if e > 0 { + offset := e - 1 + children[p] = handle[offset] + p++ + } else { + offset := e*-1 - 1 + for _, c := range handle[offset].ExpandChildren() { + children[p] = c + p++ + } + } + } + } else { + // If an alternative has no AST action, a driver generates + // a node with the same structure as a CST. + children = handle + } + + a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children)) +} + +// Accept is a implementation of SemanticActionSet.Accept method. +func (a *SyntaxTreeActionSet) Accept() { + top := a.semStack.pop(1) + a.builder.Accept(top[0]) +} + +// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method. +func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) { + a.semStack.pop(popped) + a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error()))) +} + +// MissError is a implementation of SemanticActionSet.MissError method. +func (a *SyntaxTreeActionSet) MissError(cause VToken) { +} + +func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int { + if tok.EOF() { + return a.gram.EOF() + } + + return tok.TerminalID() +} + +type semanticStack struct { + frames []SyntaxTreeNode +} + +func newSemanticStack() *semanticStack { + return &semanticStack{ + frames: make([]SyntaxTreeNode, 0, 100), + } +} + +func (s *semanticStack) push(f SyntaxTreeNode) { + s.frames = append(s.frames, f) +} + +func (s *semanticStack) pop(n int) []SyntaxTreeNode { + fs := s.frames[len(s.frames)-n:] + s.frames = s.frames[:len(s.frames)-n] + + return fs +} + +type NodeType int + +const ( + NodeTypeError = 0 + NodeTypeTerminal = 1 + NodeTypeNonTerminal = 2 +) + +// Node is a implementation of SyntaxTreeNode interface. 
+type Node struct { + Type NodeType + KindName string + Text string + BytePos int + ByteLen int + Row int + Col int + Children []*Node +} + +func (n *Node) MarshalJSON() ([]byte, error) { + switch n.Type { + case NodeTypeError: + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + }{ + Type: n.Type, + KindName: n.KindName, + }) + case NodeTypeTerminal: + if n.KindName == "" { + return json.Marshal(struct { + Type NodeType `json:"type"` + Text string `json:"text"` + Row int `json:"row"` + Col int `json:"col"` + }{ + Type: n.Type, + Text: n.Text, + Row: n.Row, + Col: n.Col, + }) + } + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + Text string `json:"text"` + Row int `json:"row"` + Col int `json:"col"` + }{ + Type: n.Type, + KindName: n.KindName, + Text: n.Text, + Row: n.Row, + Col: n.Col, + }) + case NodeTypeNonTerminal: + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + Children []*Node `json:"children"` + }{ + Type: n.Type, + KindName: n.KindName, + Children: n.Children, + }) + default: + return nil, fmt.Errorf("invalid node type: %v", n.Type) + } +} + +// ChildCount is a implementation of SyntaxTreeNode.ChildCount. +func (n *Node) ChildCount() int { + return len(n.Children) +} + +// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren. +func (n *Node) ExpandChildren() []SyntaxTreeNode { + fs := make([]SyntaxTreeNode, len(n.Children)) + for i, n := range n.Children { + fs[i] = n + } + return fs +} + +// PrintTree prints a syntax tree whose root is `node`. 
+func PrintTree(w io.Writer, node *Node) { + printTree(w, node, "", "") +} + +func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) { + if node == nil { + return + } + + switch node.Type { + case NodeTypeError: + fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) + case NodeTypeTerminal: + fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text)) + case NodeTypeNonTerminal: + fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) + + num := len(node.Children) + for i, child := range node.Children { + var line string + if num > 1 && i < num-1 { + line = "├─ " + } else { + line = "└─ " + } + + var prefix string + if i >= num-1 { + prefix = " " + } else { + prefix = "│ " + } + + printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } + } +} + +type grammarImpl struct { + g *spec.CompiledGrammar +} + +func NewGrammar(g *spec.CompiledGrammar) *grammarImpl { + return &grammarImpl{ + g: g, + } +} + +func (g *grammarImpl) InitialState() int { + return g.g.Syntactic.InitialState +} + +func (g *grammarImpl) StartProduction() int { + return g.g.Syntactic.StartProduction +} + +func (g *grammarImpl) RecoverProduction(prod int) bool { + return g.g.Syntactic.RecoverProductions[prod] != 0 +} + +func (g *grammarImpl) Action(state int, terminal int) int { + return g.g.Syntactic.Action[state*g.g.Syntactic.TerminalCount+terminal] +} + +func (g *grammarImpl) GoTo(state int, lhs int) int { + return g.g.Syntactic.GoTo[state*g.g.Syntactic.NonTerminalCount+lhs] +} + +func (g *grammarImpl) AlternativeSymbolCount(prod int) int { + return g.g.Syntactic.AlternativeSymbolCounts[prod] +} + +func (g *grammarImpl) TerminalCount() int { + return g.g.Syntactic.TerminalCount +} + +func (g *grammarImpl) SkipTerminal(terminal int) bool { + return g.g.Syntactic.TerminalSkip[terminal] == 1 +} + +func (g *grammarImpl) ErrorTrapperState(state int) bool { + return g.g.Syntactic.ErrorTrapperStates[state] != 0 +} + +func (g 
*grammarImpl) NonTerminal(nonTerminal int) string { + return g.g.Syntactic.NonTerminals[nonTerminal] +} + +func (g *grammarImpl) LHS(prod int) int { + return g.g.Syntactic.LHSSymbols[prod] +} + +func (g *grammarImpl) EOF() int { + return g.g.Syntactic.EOFSymbol +} + +func (g *grammarImpl) Error() int { + return g.g.Syntactic.ErrorSymbol +} + +func (g *grammarImpl) Terminal(terminal int) string { + return g.g.Syntactic.Terminals[terminal] +} + +func (g *grammarImpl) ASTAction(prod int) []int { + return g.g.ASTAction.Entries[prod] +} + +// go:embed parser.go +var parserCoreSrc string + +// go:embed semantic_action.go +var semActSrc string + +func GenParser(cgram *spec.CompiledGrammar, pkgName string) ([]byte, error) { + var parserSrc string + { + fset := goToken.NewFileSet() + f, err := parser.ParseFile(fset, "parser.go", parserCoreSrc, parser.ParseComments) + if err != nil { + return nil, err + } + + var b strings.Builder + err = format.Node(&b, fset, f) + if err != nil { + return nil, err + } + + parserSrc = b.String() + } + + var grammarSrc string + { + t, err := template.New("").Funcs(genGrammarTemplateFuncs(cgram)).Parse(grammarSrcTmplate) + if err != nil { + return nil, err + } + + var b strings.Builder + err = t.Execute(&b, map[string]interface{}{ + "initialState": cgram.Syntactic.InitialState, + "startProduction": cgram.Syntactic.StartProduction, + "terminalCount": cgram.Syntactic.TerminalCount, + "nonTerminalCount": cgram.Syntactic.NonTerminalCount, + "eofSymbol": cgram.Syntactic.EOFSymbol, + "errorSymbol": cgram.Syntactic.ErrorSymbol, + }) + if err != nil { + return nil, err + } + + grammarSrc = b.String() + } + + var lexerSrc string + { + t, err := template.New("").Funcs(genLexerTemplateFuncs(cgram)).Parse(lexerSrcTmplate) + if err != nil { + return nil, err + } + + var b strings.Builder + err = t.Execute(&b, nil) + if err != nil { + return nil, err + } + + lexerSrc = b.String() + } + + var src string + { + tmpl := `// Code generated by vartan-go. 
DO NOT EDIT. +{{ .parserSrc }} + +{{ .grammarSrc }} + +{{ .lexerSrc }} +` + t, err := template.New("").Parse(tmpl) + if err != nil { + return nil, err + } + + var b strings.Builder + err = t.Execute(&b, map[string]string{ + "parserSrc": parserSrc, + "grammarSrc": grammarSrc, + "lexerSrc": lexerSrc, + }) + if err != nil { + return nil, err + } + + src = b.String() + } + + fset := goToken.NewFileSet() + f, err := parser.ParseFile(fset, "", src, parser.ParseComments) + if err != nil { + return nil, err + } + + f.Name = ast.NewIdent(pkgName) + + // Complete an import statement. + for _, d := range f.Decls { + gd, ok := d.(*ast.GenDecl) + if !ok || gd.Tok != token.IMPORT { + continue + } + gd.Specs = append(gd.Specs, &ast.ImportSpec{ + Path: &ast.BasicLit{ + Value: `"io"`, + }, + }) + break + } + + var b bytes.Buffer + err = format.Node(&b, fset, f) + if err != nil { + return nil, err + } + + return b.Bytes(), nil +} + +const grammarSrcTmplate = ` +type grammarImpl struct { + recoverProductions []int + action []int + goTo []int + alternativeSymbolCounts []int + errorTrapperStates []int + nonTerminals []string + lhsSymbols []int + terminals []string + terminalSkip []int + astActions [][]int +} + +func NewGrammar() *grammarImpl { + return &grammarImpl{ + recoverProductions: {{ genRecoverProductions }}, + action: {{ genAction }}, + goTo: {{ genGoTo }}, + alternativeSymbolCounts: {{ genAlternativeSymbolCounts }}, + errorTrapperStates: {{ genErrorTrapperStates }}, + nonTerminals: {{ genNonTerminals }}, + lhsSymbols: {{ genLHSSymbols }}, + terminals: {{ genTerminals }}, + terminalSkip: {{ genTerminalSkip }}, + astActions: {{ genASTActions }}, + } +} + +func (g *grammarImpl) InitialState() int { + return {{ .initialState }} +} + +func (g *grammarImpl) StartProduction() int { + return {{ .startProduction }} +} + +func (g *grammarImpl) RecoverProduction(prod int) bool { + return g.recoverProductions[prod] != 0 +} + +func (g *grammarImpl) Action(state int, terminal int) int { + 
return g.action[state*{{ .terminalCount }}+terminal] +} + +func (g *grammarImpl) GoTo(state int, lhs int) int { + return g.goTo[state*{{ .nonTerminalCount }}+lhs] +} + +func (g *grammarImpl) AlternativeSymbolCount(prod int) int { + return g.alternativeSymbolCounts[prod] +} + +func (g *grammarImpl) TerminalCount() int { + return {{ .terminalCount }} +} + +func (g *grammarImpl) SkipTerminal(terminal int) bool { + return g.terminalSkip[terminal] == 1 +} + +func (g *grammarImpl) ErrorTrapperState(state int) bool { + return g.errorTrapperStates[state] != 0 +} + +func (g *grammarImpl) NonTerminal(nonTerminal int) string { + return g.nonTerminals[nonTerminal] +} + +func (g *grammarImpl) LHS(prod int) int { + return g.lhsSymbols[prod] +} + +func (g *grammarImpl) EOF() int { + return {{ .eofSymbol }} +} + +func (g *grammarImpl) Error() int { + return {{ .errorSymbol }} +} + +func (g *grammarImpl) Terminal(terminal int) string { + return g.terminals[terminal] +} + +func (g *grammarImpl) ASTAction(prod int) []int { + return g.astActions[prod] +} +` + +func genGrammarTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap { + return template.FuncMap{ + "genRecoverProductions": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.RecoverProductions { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genAction": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.Action { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genGoTo": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.GoTo { + fmt.Fprintf(&b, 
"%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genAlternativeSymbolCounts": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.AlternativeSymbolCounts { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genErrorTrapperStates": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.ErrorTrapperStates { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genNonTerminals": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]string{\n") + for _, v := range cgram.Syntactic.NonTerminals { + fmt.Fprintf(&b, "%v,\n", strconv.Quote(v)) + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genLHSSymbols": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.LHSSymbols { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genTerminals": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]string{\n") + for _, v := range cgram.Syntactic.Terminals { + fmt.Fprintf(&b, "%v,\n", strconv.Quote(v)) + } + fmt.Fprintf(&b, "}") + return b.String() + }, + "genTerminalSkip": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.TerminalSkip { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + 
fmt.Fprintf(&b, "}") + return b.String() + }, + "genASTActions": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[][]int{\n") + for _, entries := range cgram.ASTAction.Entries { + if len(entries) == 0 { + fmt.Fprintf(&b, "nil,\n") + continue + } + + fmt.Fprintf(&b, "{\n") + c := 1 + for _, v := range entries { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "},\n") + } + fmt.Fprintf(&b, "}") + return b.String() + }, + } +} + +const lexerSrcTmplate = ` +type vToken struct { + terminalID int + tok *Token +} + +func (t *vToken) TerminalID() int { + return t.terminalID +} + +func (t *vToken) Lexeme() []byte { + return t.tok.Lexeme +} + +func (t *vToken) EOF() bool { + return t.tok.EOF +} + +func (t *vToken) Invalid() bool { + return t.tok.Invalid +} + +func (t *vToken) BytePosition() (int, int) { + return t.tok.BytePos, t.tok.ByteLen +} + +func (t *vToken) Position() (int, int) { + return t.tok.Row, t.tok.Col +} + +var kindToTerminal = {{ genKindToTerminal }} + +type tokenStream struct { + lex *Lexer + kindToTerminal []int +} + +func NewTokenStream(src io.Reader) (*tokenStream, error) { + lex, err := NewLexer(NewLexSpec(), src) + if err != nil { + return nil, err + } + + return &tokenStream{ + lex: lex, + }, nil +} + +func (t *tokenStream) Next() (VToken, error) { + tok, err := t.lex.Next() + if err != nil { + return nil, err + } + return &vToken{ + terminalID: kindToTerminal[tok.KindID], + tok: tok, + }, nil +} +` + +func genLexerTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap { + return template.FuncMap{ + "genKindToTerminal": func() string { + var b strings.Builder + fmt.Fprintf(&b, "[]int{\n") + c := 1 + for _, v := range cgram.Syntactic.KindToTerminal { + fmt.Fprintf(&b, "%v, ", v) + if c == 20 { + fmt.Fprintf(&b, "\n") + c = 1 + } else { + c++ + } + } + if c > 1 { + fmt.Fprintf(&b, "\n") + } + fmt.Fprintf(&b, "}") + return 
b.String() + }, + } +} + +func GenSemanticAction(pkgName string) ([]byte, error) { + var src string + { + tmpl := `// Code generated by vartan-go. DO NOT EDIT. +{{ .semActSrc }} +` + t, err := template.New("").Parse(tmpl) + if err != nil { + return nil, err + } + + var b strings.Builder + err = t.Execute(&b, map[string]string{ + "semActSrc": semActSrc, + }) + if err != nil { + return nil, err + } + + src = b.String() + } + + fset := goToken.NewFileSet() + f, err := parser.ParseFile(fset, "", src, parser.ParseComments) + if err != nil { + return nil, err + } + + f.Name = ast.NewIdent(pkgName) + + var b bytes.Buffer + err = format.Node(&b, fset, f) + if err != nil { + return nil, err + } + + return b.Bytes(), nil +} + +type vToken struct { + terminalID int + tok *lexer.Token +} + +func (t *vToken) TerminalID() int { + return t.terminalID +} + +func (t *vToken) Lexeme() []byte { + return t.tok.Lexeme +} + +func (t *vToken) EOF() bool { + return t.tok.EOF +} + +func (t *vToken) Invalid() bool { + return t.tok.Invalid +} + +func (t *vToken) BytePosition() (int, int) { + return t.tok.BytePos, t.tok.ByteLen +} + +func (t *vToken) Position() (int, int) { + return t.tok.Row, t.tok.Col +} + +type tokenStream struct { + lex *lexer.Lexer + kindToTerminal []int +} + +func NewTokenStream(g *spec.CompiledGrammar, src io.Reader) (TokenStream, error) { + lex, err := lexer.NewLexer(lexer.NewLexSpec(g.Lexical), src) + if err != nil { + return nil, err + } + + return &tokenStream{ + lex: lex, + kindToTerminal: g.Syntactic.KindToTerminal, + }, nil +} + +func (l *tokenStream) Next() (VToken, error) { + tok, err := l.lex.Next() + if err != nil { + return nil, err + } + return &vToken{ + terminalID: l.kindToTerminal[tok.KindID], + tok: tok, + }, nil +} diff --git a/src/urubu/driver/parser/parser.go b/src/urubu/driver/parser/parser.go deleted file mode 100644 index 2eaa678..0000000 --- a/src/urubu/driver/parser/parser.go +++ /dev/null @@ -1,416 +0,0 @@ -package parser - -import ( - "fmt" 
-) - -type Grammar interface { - // InitialState returns the initial state of a parser. - InitialState() int - - // StartProduction returns the start production of grammar. - StartProduction() int - - // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair. - Action(state int, terminal int) int - - // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair. - GoTo(state int, lhs int) int - - // ErrorTrapperState returns true when a state can shift the error symbol. - ErrorTrapperState(state int) bool - - // LHS returns a LHS symbol of a production. - LHS(prod int) int - - // AlternativeSymbolCount returns a symbol count of p production. - AlternativeSymbolCount(prod int) int - - // RecoverProduction returns true when a production has the recover directive. - RecoverProduction(prod int) bool - - // NonTerminal retuns a string representaion of a non-terminal symbol. - NonTerminal(nonTerminal int) string - - // TerminalCount returns a terminal symbol count of grammar. - TerminalCount() int - - // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis. - SkipTerminal(terminal int) bool - - // EOF returns the EOF symbol. - EOF() int - - // Error returns the error symbol. - Error() int - - // Terminal retuns a string representaion of a terminal symbol. - Terminal(terminal int) string - - // ASTAction returns an AST action entries. - ASTAction(prod int) []int -} - -type VToken interface { - // TerminalID returns a terminal ID. - TerminalID() int - - // Lexeme returns a lexeme. - Lexeme() []byte - - // EOF returns true when a token represents EOF. - EOF() bool - - // Invalid returns true when a token is invalid. - Invalid() bool - - // BytePosition returns (position, length) pair. - // `position` is a byte position where a token appears and `length` is a length in bytes. - BytePosition() (int, int) - - // Position returns (row, column) pair. 
- Position() (int, int) -} - -type TokenStream interface { - Next() (VToken, error) -} - -type SyntaxError struct { - Row int - Col int - Message string - Token VToken - ExpectedTerminals []string -} - -type ParserOption func(p *Parser) error - -// DisableLAC disables LAC (lookahead correction). LAC is enabled by default. -func DisableLAC() ParserOption { - return func(p *Parser) error { - p.disableLAC = true - return nil - } -} - -func SemanticAction(semAct SemanticActionSet) ParserOption { - return func(p *Parser) error { - p.semAct = semAct - return nil - } -} - -type Parser struct { - toks TokenStream - gram Grammar - stateStack *stateStack - semAct SemanticActionSet - disableLAC bool - onError bool - shiftCount int - synErrs []*SyntaxError -} - -func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) { - p := &Parser{ - toks: toks, - gram: gram, - stateStack: &stateStack{}, - } - - for _, opt := range opts { - err := opt(p) - if err != nil { - return nil, err - } - } - - return p, nil -} - -func (p *Parser) Parse() error { - p.stateStack.push(p.gram.InitialState()) - tok, err := p.nextToken() - if err != nil { - return err - } - -ACTION_LOOP: - for { - act := p.lookupAction(tok) - - switch { - case act < 0: // Shift - nextState := act * -1 - - recovered := false - if p.onError { - p.shiftCount++ - - // When the parser performs shift three times, the parser recovers from the error state. 
- if p.shiftCount >= 3 { - p.onError = false - p.shiftCount = 0 - recovered = true - } - } - - p.shift(nextState) - - if p.semAct != nil { - p.semAct.Shift(tok, recovered) - } - - tok, err = p.nextToken() - if err != nil { - return err - } - case act > 0: // Reduce - prodNum := act - - recovered := false - if p.onError && p.gram.RecoverProduction(prodNum) { - p.onError = false - p.shiftCount = 0 - recovered = true - } - - accepted := p.reduce(prodNum) - if accepted { - if p.semAct != nil { - p.semAct.Accept() - } - - return nil - } - - if p.semAct != nil { - p.semAct.Reduce(prodNum, recovered) - } - default: // Error - if p.onError { - tok, err = p.nextToken() - if err != nil { - return err - } - if tok.EOF() { - if p.semAct != nil { - p.semAct.MissError(tok) - } - - return nil - } - - continue ACTION_LOOP - } - - row, col := tok.Position() - p.synErrs = append(p.synErrs, &SyntaxError{ - Row: row, - Col: col, - Message: "unexpected token", - Token: tok, - ExpectedTerminals: p.searchLookahead(p.stateStack.top()), - }) - - count, ok := p.trapError() - if !ok { - if p.semAct != nil { - p.semAct.MissError(tok) - } - - return nil - } - - p.onError = true - p.shiftCount = 0 - - act, err := p.lookupActionOnError() - if err != nil { - return err - } - - p.shift(act * -1) - - if p.semAct != nil { - p.semAct.TrapAndShiftError(tok, count) - } - } - } -} - -// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid, -// this method returns `true`. 
-func (p *Parser) validateLookahead(term int) bool { - p.stateStack.enableExploratoryMode() - defer p.stateStack.disableExploratoryMode() - - for { - act := p.gram.Action(p.stateStack.topExploratorily(), term) - - switch { - case act < 0: // Shift - return true - case act > 0: // Reduce - prodNum := act - - lhs := p.gram.LHS(prodNum) - if lhs == p.gram.LHS(p.gram.StartProduction()) { - return true - } - n := p.gram.AlternativeSymbolCount(prodNum) - p.stateStack.popExploratorily(n) - state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs) - p.stateStack.pushExploratorily(state) - default: // Error - return false - } - } -} - -func (p *Parser) nextToken() (VToken, error) { - for { - // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0, - // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect - // a syntax error because the parser cannot find an entry corresponding to the invalid token. - tok, err := p.toks.Next() - if err != nil { - return nil, err - } - - if p.gram.SkipTerminal(tok.TerminalID()) { - continue - } - - return tok, nil - } -} - -func (p *Parser) tokenToTerminal(tok VToken) int { - if tok.EOF() { - return p.gram.EOF() - } - - return tok.TerminalID() -} - -func (p *Parser) lookupAction(tok VToken) int { - if !p.disableLAC { - term := p.tokenToTerminal(tok) - if !p.validateLookahead(term) { - return 0 - } - } - - return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok)) -} - -func (p *Parser) lookupActionOnError() (int, error) { - act := p.gram.Action(p.stateStack.top(), p.gram.Error()) - if act >= 0 { - return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error())) - } - - return act, nil -} - -func (p *Parser) shift(nextState int) { - p.stateStack.push(nextState) -} - -func (p *Parser) reduce(prodNum int) bool { - lhs := p.gram.LHS(prodNum) - if lhs == 
p.gram.LHS(p.gram.StartProduction()) { - return true - } - n := p.gram.AlternativeSymbolCount(prodNum) - p.stateStack.pop(n) - nextState := p.gram.GoTo(p.stateStack.top(), lhs) - p.stateStack.push(nextState) - return false -} - -func (p *Parser) trapError() (int, bool) { - count := 0 - for { - if p.gram.ErrorTrapperState(p.stateStack.top()) { - return count, true - } - - if p.stateStack.top() != p.gram.InitialState() { - p.stateStack.pop(1) - count++ - } else { - return 0, false - } - } -} - -func (p *Parser) SyntaxErrors() []*SyntaxError { - return p.synErrs -} - -func (p *Parser) searchLookahead(state int) []string { - kinds := []string{} - termCount := p.gram.TerminalCount() - for term := 0; term < termCount; term++ { - if p.disableLAC { - if p.gram.Action(p.stateStack.top(), term) == 0 { - continue - } - } else { - if !p.validateLookahead(term) { - continue - } - } - - // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol - // intentionally. 
- if term == p.gram.Error() { - continue - } - - kinds = append(kinds, p.gram.Terminal(term)) - } - - return kinds -} - -type stateStack struct { - items []int - itemsExp []int -} - -func (s *stateStack) enableExploratoryMode() { - s.itemsExp = make([]int, len(s.items)) - copy(s.itemsExp, s.items) -} - -func (s *stateStack) disableExploratoryMode() { - s.itemsExp = nil -} - -func (s *stateStack) top() int { - return s.items[len(s.items)-1] -} - -func (s *stateStack) topExploratorily() int { - return s.itemsExp[len(s.itemsExp)-1] -} - -func (s *stateStack) push(state int) { - s.items = append(s.items, state) -} - -func (s *stateStack) pushExploratorily(state int) { - s.itemsExp = append(s.itemsExp, state) -} - -func (s *stateStack) pop(n int) { - s.items = s.items[:len(s.items)-n] -} - -func (s *stateStack) popExploratorily(n int) { - s.itemsExp = s.itemsExp[:len(s.itemsExp)-n] -} diff --git a/src/urubu/driver/parser/semantic_action.go b/src/urubu/driver/parser/semantic_action.go deleted file mode 100644 index 6bb78cf..0000000 --- a/src/urubu/driver/parser/semantic_action.go +++ /dev/null @@ -1,371 +0,0 @@ -package parser - -import ( - "encoding/json" - "fmt" - "io" - "strconv" -) - -// SemanticActionSet is a set of semantic actions a parser calls. -type SemanticActionSet interface { - // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol. - // When the parser recovered from an error state by shifting the token, `recovered` is true. - Shift(tok VToken, recovered bool) - - // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production. - // When the parser recovered from an error state by reducing the production, `recovered` is true. - Reduce(prodNum int, recovered bool) - - // Accept runs when the parser accepts an input. - Accept() - - // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack. 
- // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards - // from the state stack. - // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token - // corresponding to the error symbol doesn't exist. - TrapAndShiftError(cause VToken, popped int) - - // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error. - MissError(cause VToken) -} - -var _ SemanticActionSet = &SyntaxTreeActionSet{} - -// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface. -type SyntaxTreeNode interface { - // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast` - // directive with `...` operator. - ChildCount() int - - // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast` - // directive with `...` operator. - ExpandChildren() []SyntaxTreeNode -} - -var _ SyntaxTreeNode = &Node{} - -// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types. -// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface. -type SyntaxTreeBuilder interface { - Shift(kindName string, tok VToken) SyntaxTreeNode - ShiftError(kindName string) SyntaxTreeNode - Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode - Accept(f SyntaxTreeNode) -} - -var _ SyntaxTreeBuilder = &DefaultSyntaxTreeBuilder{} - -// DefaultSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder. -type DefaultSyntaxTreeBuilder struct { - tree *Node -} - -// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder. 
-func NewDefaultSyntaxTreeBuilder() *DefaultSyntaxTreeBuilder { - return &DefaultSyntaxTreeBuilder{} -} - -// Shift is a implementation of SyntaxTreeBuilder.Shift. -func (b *DefaultSyntaxTreeBuilder) Shift(kindName string, tok VToken) SyntaxTreeNode { - bytePos, byteLen := tok.BytePosition() - row, col := tok.Position() - return &Node{ - Type: NodeTypeTerminal, - KindName: kindName, - Text: string(tok.Lexeme()), - BytePos: bytePos, - ByteLen: byteLen, - Row: row, - Col: col, - } -} - -// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError. -func (b *DefaultSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode { - return &Node{ - Type: NodeTypeError, - KindName: kindName, - } -} - -// Reduce is a implementation of SyntaxTreeBuilder.Reduce. -func (b *DefaultSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode { - cNodes := make([]*Node, len(children)) - for i, c := range children { - cNodes[i] = c.(*Node) - } - return &Node{ - Type: NodeTypeNonTerminal, - KindName: kindName, - Children: cNodes, - } -} - -// Accept is a implementation of SyntaxTreeBuilder.Accept. -func (b *DefaultSyntaxTreeBuilder) Accept(f SyntaxTreeNode) { - b.tree = f.(*Node) -} - -// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil. -func (b *DefaultSyntaxTreeBuilder) Tree() *Node { - return b.tree -} - -// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree. -type SyntaxTreeActionSet struct { - gram Grammar - builder SyntaxTreeBuilder - semStack *semanticStack - disableASTAction bool -} - -// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree). -// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them. 
-func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { - return &SyntaxTreeActionSet{ - gram: gram, - builder: builder, - semStack: newSemanticStack(), - } -} - -// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree). -// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them. -func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { - return &SyntaxTreeActionSet{ - gram: gram, - builder: builder, - semStack: newSemanticStack(), - disableASTAction: true, - } -} - -// Shift is a implementation of SemanticActionSet.Shift method. -func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) { - term := a.tokenToTerminal(tok) - a.semStack.push(a.builder.Shift(a.gram.Terminal(term), tok)) -} - -// Reduce is a implementation of SemanticActionSet.Reduce method. -func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) { - lhs := a.gram.LHS(prodNum) - - // When an alternative is empty, `n` will be 0, and `handle` will be empty slice. - n := a.gram.AlternativeSymbolCount(prodNum) - handle := a.semStack.pop(n) - - var astAct []int - if !a.disableASTAction { - astAct = a.gram.ASTAction(prodNum) - } - var children []SyntaxTreeNode - if astAct != nil { - // Count the number of children in advance to avoid frequent growth in a slice for children. - { - l := 0 - for _, e := range astAct { - if e > 0 { - l++ - } else { - offset := e*-1 - 1 - l += handle[offset].ChildCount() - } - } - - children = make([]SyntaxTreeNode, l) - } - - p := 0 - for _, e := range astAct { - if e > 0 { - offset := e - 1 - children[p] = handle[offset] - p++ - } else { - offset := e*-1 - 1 - for _, c := range handle[offset].ExpandChildren() { - children[p] = c - p++ - } - } - } - } else { - // If an alternative has no AST action, a driver generates - // a node with the same structure as a CST. 
- children = handle - } - - a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children)) -} - -// Accept is a implementation of SemanticActionSet.Accept method. -func (a *SyntaxTreeActionSet) Accept() { - top := a.semStack.pop(1) - a.builder.Accept(top[0]) -} - -// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method. -func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) { - a.semStack.pop(popped) - a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error()))) -} - -// MissError is a implementation of SemanticActionSet.MissError method. -func (a *SyntaxTreeActionSet) MissError(cause VToken) { -} - -func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int { - if tok.EOF() { - return a.gram.EOF() - } - - return tok.TerminalID() -} - -type semanticStack struct { - frames []SyntaxTreeNode -} - -func newSemanticStack() *semanticStack { - return &semanticStack{ - frames: make([]SyntaxTreeNode, 0, 100), - } -} - -func (s *semanticStack) push(f SyntaxTreeNode) { - s.frames = append(s.frames, f) -} - -func (s *semanticStack) pop(n int) []SyntaxTreeNode { - fs := s.frames[len(s.frames)-n:] - s.frames = s.frames[:len(s.frames)-n] - - return fs -} - -type NodeType int - -const ( - NodeTypeError = 0 - NodeTypeTerminal = 1 - NodeTypeNonTerminal = 2 -) - -// Node is a implementation of SyntaxTreeNode interface. 
-type Node struct { - Type NodeType - KindName string - Text string - BytePos int - ByteLen int - Row int - Col int - Children []*Node -} - -func (n *Node) MarshalJSON() ([]byte, error) { - switch n.Type { - case NodeTypeError: - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - }{ - Type: n.Type, - KindName: n.KindName, - }) - case NodeTypeTerminal: - if n.KindName == "" { - return json.Marshal(struct { - Type NodeType `json:"type"` - Text string `json:"text"` - Row int `json:"row"` - Col int `json:"col"` - }{ - Type: n.Type, - Text: n.Text, - Row: n.Row, - Col: n.Col, - }) - } - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - Text string `json:"text"` - Row int `json:"row"` - Col int `json:"col"` - }{ - Type: n.Type, - KindName: n.KindName, - Text: n.Text, - Row: n.Row, - Col: n.Col, - }) - case NodeTypeNonTerminal: - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - Children []*Node `json:"children"` - }{ - Type: n.Type, - KindName: n.KindName, - Children: n.Children, - }) - default: - return nil, fmt.Errorf("invalid node type: %v", n.Type) - } -} - -// ChildCount is a implementation of SyntaxTreeNode.ChildCount. -func (n *Node) ChildCount() int { - return len(n.Children) -} - -// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren. -func (n *Node) ExpandChildren() []SyntaxTreeNode { - fs := make([]SyntaxTreeNode, len(n.Children)) - for i, n := range n.Children { - fs[i] = n - } - return fs -} - -// PrintTree prints a syntax tree whose root is `node`. 
-func PrintTree(w io.Writer, node *Node) { - printTree(w, node, "", "") -} - -func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) { - if node == nil { - return - } - - switch node.Type { - case NodeTypeError: - fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) - case NodeTypeTerminal: - fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text)) - case NodeTypeNonTerminal: - fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) - - num := len(node.Children) - for i, child := range node.Children { - var line string - if num > 1 && i < num-1 { - line = "├─ " - } else { - line = "└─ " - } - - var prefix string - if i >= num-1 { - prefix = " " - } else { - prefix = "│ " - } - - printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } - } -} diff --git a/src/urubu/driver/parser/spec.go b/src/urubu/driver/parser/spec.go deleted file mode 100644 index 6dc7c3f..0000000 --- a/src/urubu/driver/parser/spec.go +++ /dev/null @@ -1,73 +0,0 @@ -package parser - -import spec "urubu/spec/grammar" - -type grammarImpl struct { - g *spec.CompiledGrammar -} - -func NewGrammar(g *spec.CompiledGrammar) *grammarImpl { - return &grammarImpl{ - g: g, - } -} - -func (g *grammarImpl) InitialState() int { - return g.g.Syntactic.InitialState -} - -func (g *grammarImpl) StartProduction() int { - return g.g.Syntactic.StartProduction -} - -func (g *grammarImpl) RecoverProduction(prod int) bool { - return g.g.Syntactic.RecoverProductions[prod] != 0 -} - -func (g *grammarImpl) Action(state int, terminal int) int { - return g.g.Syntactic.Action[state*g.g.Syntactic.TerminalCount+terminal] -} - -func (g *grammarImpl) GoTo(state int, lhs int) int { - return g.g.Syntactic.GoTo[state*g.g.Syntactic.NonTerminalCount+lhs] -} - -func (g *grammarImpl) AlternativeSymbolCount(prod int) int { - return g.g.Syntactic.AlternativeSymbolCounts[prod] -} - -func (g *grammarImpl) TerminalCount() int { - return g.g.Syntactic.TerminalCount -} - 
-func (g *grammarImpl) SkipTerminal(terminal int) bool { - return g.g.Syntactic.TerminalSkip[terminal] == 1 -} - -func (g *grammarImpl) ErrorTrapperState(state int) bool { - return g.g.Syntactic.ErrorTrapperStates[state] != 0 -} - -func (g *grammarImpl) NonTerminal(nonTerminal int) string { - return g.g.Syntactic.NonTerminals[nonTerminal] -} - -func (g *grammarImpl) LHS(prod int) int { - return g.g.Syntactic.LHSSymbols[prod] -} - -func (g *grammarImpl) EOF() int { - return g.g.Syntactic.EOFSymbol -} - -func (g *grammarImpl) Error() int { - return g.g.Syntactic.ErrorSymbol -} - -func (g *grammarImpl) Terminal(terminal int) string { - return g.g.Syntactic.Terminals[terminal] -} - -func (g *grammarImpl) ASTAction(prod int) []int { - return g.g.ASTAction.Entries[prod] -} diff --git a/src/urubu/driver/parser/template.go b/src/urubu/driver/parser/template.go deleted file mode 100644 index 33d097c..0000000 --- a/src/urubu/driver/parser/template.go +++ /dev/null @@ -1,535 +0,0 @@ -package parser - -import ( - "bytes" - _ "embed" - "fmt" - "go/ast" - "go/format" - "go/parser" - "go/token" - goToken "go/token" - "strconv" - "strings" - "text/template" - - spec "urubu/spec/grammar" -) - -// go:embed parser.go -var parserCoreSrc string - -// go:embed semantic_action.go -var semActSrc string - -func GenParser(cgram *spec.CompiledGrammar, pkgName string) ([]byte, error) { - var parserSrc string - { - fset := goToken.NewFileSet() - f, err := parser.ParseFile(fset, "parser.go", parserCoreSrc, parser.ParseComments) - if err != nil { - return nil, err - } - - var b strings.Builder - err = format.Node(&b, fset, f) - if err != nil { - return nil, err - } - - parserSrc = b.String() - } - - var grammarSrc string - { - t, err := template.New("").Funcs(genGrammarTemplateFuncs(cgram)).Parse(grammarSrcTmplate) - if err != nil { - return nil, err - } - - var b strings.Builder - err = t.Execute(&b, map[string]interface{}{ - "initialState": cgram.Syntactic.InitialState, - "startProduction": 
cgram.Syntactic.StartProduction, - "terminalCount": cgram.Syntactic.TerminalCount, - "nonTerminalCount": cgram.Syntactic.NonTerminalCount, - "eofSymbol": cgram.Syntactic.EOFSymbol, - "errorSymbol": cgram.Syntactic.ErrorSymbol, - }) - if err != nil { - return nil, err - } - - grammarSrc = b.String() - } - - var lexerSrc string - { - t, err := template.New("").Funcs(genLexerTemplateFuncs(cgram)).Parse(lexerSrcTmplate) - if err != nil { - return nil, err - } - - var b strings.Builder - err = t.Execute(&b, nil) - if err != nil { - return nil, err - } - - lexerSrc = b.String() - } - - var src string - { - tmpl := `// Code generated by vartan-go. DO NOT EDIT. -{{ .parserSrc }} - -{{ .grammarSrc }} - -{{ .lexerSrc }} -` - t, err := template.New("").Parse(tmpl) - if err != nil { - return nil, err - } - - var b strings.Builder - err = t.Execute(&b, map[string]string{ - "parserSrc": parserSrc, - "grammarSrc": grammarSrc, - "lexerSrc": lexerSrc, - }) - if err != nil { - return nil, err - } - - src = b.String() - } - - fset := goToken.NewFileSet() - f, err := parser.ParseFile(fset, "", src, parser.ParseComments) - if err != nil { - return nil, err - } - - f.Name = ast.NewIdent(pkgName) - - // Complete an import statement. 
- for _, d := range f.Decls { - gd, ok := d.(*ast.GenDecl) - if !ok || gd.Tok != token.IMPORT { - continue - } - gd.Specs = append(gd.Specs, &ast.ImportSpec{ - Path: &ast.BasicLit{ - Value: `"io"`, - }, - }) - break - } - - var b bytes.Buffer - err = format.Node(&b, fset, f) - if err != nil { - return nil, err - } - - return b.Bytes(), nil -} - -const grammarSrcTmplate = ` -type grammarImpl struct { - recoverProductions []int - action []int - goTo []int - alternativeSymbolCounts []int - errorTrapperStates []int - nonTerminals []string - lhsSymbols []int - terminals []string - terminalSkip []int - astActions [][]int -} - -func NewGrammar() *grammarImpl { - return &grammarImpl{ - recoverProductions: {{ genRecoverProductions }}, - action: {{ genAction }}, - goTo: {{ genGoTo }}, - alternativeSymbolCounts: {{ genAlternativeSymbolCounts }}, - errorTrapperStates: {{ genErrorTrapperStates }}, - nonTerminals: {{ genNonTerminals }}, - lhsSymbols: {{ genLHSSymbols }}, - terminals: {{ genTerminals }}, - terminalSkip: {{ genTerminalSkip }}, - astActions: {{ genASTActions }}, - } -} - -func (g *grammarImpl) InitialState() int { - return {{ .initialState }} -} - -func (g *grammarImpl) StartProduction() int { - return {{ .startProduction }} -} - -func (g *grammarImpl) RecoverProduction(prod int) bool { - return g.recoverProductions[prod] != 0 -} - -func (g *grammarImpl) Action(state int, terminal int) int { - return g.action[state*{{ .terminalCount }}+terminal] -} - -func (g *grammarImpl) GoTo(state int, lhs int) int { - return g.goTo[state*{{ .nonTerminalCount }}+lhs] -} - -func (g *grammarImpl) AlternativeSymbolCount(prod int) int { - return g.alternativeSymbolCounts[prod] -} - -func (g *grammarImpl) TerminalCount() int { - return {{ .terminalCount }} -} - -func (g *grammarImpl) SkipTerminal(terminal int) bool { - return g.terminalSkip[terminal] == 1 -} - -func (g *grammarImpl) ErrorTrapperState(state int) bool { - return g.errorTrapperStates[state] != 0 -} - -func (g 
*grammarImpl) NonTerminal(nonTerminal int) string { - return g.nonTerminals[nonTerminal] -} - -func (g *grammarImpl) LHS(prod int) int { - return g.lhsSymbols[prod] -} - -func (g *grammarImpl) EOF() int { - return {{ .eofSymbol }} -} - -func (g *grammarImpl) Error() int { - return {{ .errorSymbol }} -} - -func (g *grammarImpl) Terminal(terminal int) string { - return g.terminals[terminal] -} - -func (g *grammarImpl) ASTAction(prod int) []int { - return g.astActions[prod] -} -` - -func genGrammarTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap { - return template.FuncMap{ - "genRecoverProductions": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.RecoverProductions { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genAction": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.Action { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genGoTo": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.GoTo { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genAlternativeSymbolCounts": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.AlternativeSymbolCounts { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genErrorTrapperStates": func() string 
{ - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.ErrorTrapperStates { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genNonTerminals": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]string{\n") - for _, v := range cgram.Syntactic.NonTerminals { - fmt.Fprintf(&b, "%v,\n", strconv.Quote(v)) - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genLHSSymbols": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.LHSSymbols { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genTerminals": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]string{\n") - for _, v := range cgram.Syntactic.Terminals { - fmt.Fprintf(&b, "%v,\n", strconv.Quote(v)) - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genTerminalSkip": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.TerminalSkip { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - "genASTActions": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[][]int{\n") - for _, entries := range cgram.ASTAction.Entries { - if len(entries) == 0 { - fmt.Fprintf(&b, "nil,\n") - continue - } - - fmt.Fprintf(&b, "{\n") - c := 1 - for _, v := range entries { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "},\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - } -} - -const lexerSrcTmplate = 
` -type vToken struct { - terminalID int - tok *Token -} - -func (t *vToken) TerminalID() int { - return t.terminalID -} - -func (t *vToken) Lexeme() []byte { - return t.tok.Lexeme -} - -func (t *vToken) EOF() bool { - return t.tok.EOF -} - -func (t *vToken) Invalid() bool { - return t.tok.Invalid -} - -func (t *vToken) BytePosition() (int, int) { - return t.tok.BytePos, t.tok.ByteLen -} - -func (t *vToken) Position() (int, int) { - return t.tok.Row, t.tok.Col -} - -var kindToTerminal = {{ genKindToTerminal }} - -type tokenStream struct { - lex *Lexer - kindToTerminal []int -} - -func NewTokenStream(src io.Reader) (*tokenStream, error) { - lex, err := NewLexer(NewLexSpec(), src) - if err != nil { - return nil, err - } - - return &tokenStream{ - lex: lex, - }, nil -} - -func (t *tokenStream) Next() (VToken, error) { - tok, err := t.lex.Next() - if err != nil { - return nil, err - } - return &vToken{ - terminalID: kindToTerminal[tok.KindID], - tok: tok, - }, nil -} -` - -func genLexerTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap { - return template.FuncMap{ - "genKindToTerminal": func() string { - var b strings.Builder - fmt.Fprintf(&b, "[]int{\n") - c := 1 - for _, v := range cgram.Syntactic.KindToTerminal { - fmt.Fprintf(&b, "%v, ", v) - if c == 20 { - fmt.Fprintf(&b, "\n") - c = 1 - } else { - c++ - } - } - if c > 1 { - fmt.Fprintf(&b, "\n") - } - fmt.Fprintf(&b, "}") - return b.String() - }, - } -} - -func GenSemanticAction(pkgName string) ([]byte, error) { - var src string - { - tmpl := `// Code generated by vartan-go. DO NOT EDIT. 
-{{ .semActSrc }} -` - t, err := template.New("").Parse(tmpl) - if err != nil { - return nil, err - } - - var b strings.Builder - err = t.Execute(&b, map[string]string{ - "semActSrc": semActSrc, - }) - if err != nil { - return nil, err - } - - src = b.String() - } - - fset := goToken.NewFileSet() - f, err := parser.ParseFile(fset, "", src, parser.ParseComments) - if err != nil { - return nil, err - } - - f.Name = ast.NewIdent(pkgName) - - var b bytes.Buffer - err = format.Node(&b, fset, f) - if err != nil { - return nil, err - } - - return b.Bytes(), nil -} diff --git a/src/urubu/driver/parser/token_stream.go b/src/urubu/driver/parser/token_stream.go deleted file mode 100644 index 788e521..0000000 --- a/src/urubu/driver/parser/token_stream.go +++ /dev/null @@ -1,65 +0,0 @@ -package parser - -import ( - "io" - - "urubu/driver/lexer" - spec "urubu/spec/grammar" -) - -type vToken struct { - terminalID int - tok *lexer.Token -} - -func (t *vToken) TerminalID() int { - return t.terminalID -} - -func (t *vToken) Lexeme() []byte { - return t.tok.Lexeme -} - -func (t *vToken) EOF() bool { - return t.tok.EOF -} - -func (t *vToken) Invalid() bool { - return t.tok.Invalid -} - -func (t *vToken) BytePosition() (int, int) { - return t.tok.BytePos, t.tok.ByteLen -} - -func (t *vToken) Position() (int, int) { - return t.tok.Row, t.tok.Col -} - -type tokenStream struct { - lex *lexer.Lexer - kindToTerminal []int -} - -func NewTokenStream(g *spec.CompiledGrammar, src io.Reader) (TokenStream, error) { - lex, err := lexer.NewLexer(lexer.NewLexSpec(g.Lexical), src) - if err != nil { - return nil, err - } - - return &tokenStream{ - lex: lex, - kindToTerminal: g.Syntactic.KindToTerminal, - }, nil -} - -func (l *tokenStream) Next() (VToken, error) { - tok, err := l.lex.Next() - if err != nil { - return nil, err - } - return &vToken{ - terminalID: l.kindToTerminal[tok.KindID], - tok: tok, - }, nil -} diff --git a/src/urubu/error/error.go b/src/urubu/error.go index 0e5d3af..0e5d3af 100644 
--- a/src/urubu/error/error.go +++ b/src/urubu/error.go diff --git a/src/urubu/grammar.go b/src/urubu/grammar.go new file mode 100644 index 0000000..6059210 --- /dev/null +++ b/src/urubu/grammar.go @@ -0,0 +1,2911 @@ +package grammar + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" + "errors" + "fmt" + "io" + "sort" + "strconv" + "strings" + + verr "urubu/error" + "urubu/grammar/lexical" + "urubu/grammar/symbol" + spec "urubu/spec/grammar" + "urubu/spec/grammar/parser" +) + +type firstEntry struct { + symbols map[symbol.Symbol]struct{} + empty bool +} + +func newFirstEntry() *firstEntry { + return &firstEntry{ + symbols: map[symbol.Symbol]struct{}{}, + empty: false, + } +} + +func (e *firstEntry) add(sym symbol.Symbol) bool { + if _, ok := e.symbols[sym]; ok { + return false + } + e.symbols[sym] = struct{}{} + return true +} + +func (e *firstEntry) addEmpty() bool { + if !e.empty { + e.empty = true + return true + } + return false +} + +func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { + if target == nil { + return false + } + changed := false + for sym := range target.symbols { + added := e.add(sym) + if added { + changed = true + } + } + return changed +} + +type firstSet struct { + set map[symbol.Symbol]*firstEntry +} + +func newFirstSet(prods *productionSet) *firstSet { + fst := &firstSet{ + set: map[symbol.Symbol]*firstEntry{}, + } + for _, prod := range prods.getAllProductions() { + if _, ok := fst.set[prod.lhs]; ok { + continue + } + fst.set[prod.lhs] = newFirstEntry() + } + + return fst +} + +func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { + entry := newFirstEntry() + if prod.rhsLen <= head { + entry.addEmpty() + return entry, nil + } + for _, sym := range prod.rhs[head:] { + if sym.IsTerminal() { + entry.add(sym) + return entry, nil + } + + e := fst.findBySymbol(sym) + if e == nil { + return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym) + } + for s := range e.symbols { + 
entry.add(s) + } + if !e.empty { + return entry, nil + } + } + entry.addEmpty() + return entry, nil +} + +func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { + return fst.set[sym] +} + +type firstComContext struct { + first *firstSet +} + +func newFirstComContext(prods *productionSet) *firstComContext { + return &firstComContext{ + first: newFirstSet(prods), + } +} + +func genFirstSet(prods *productionSet) (*firstSet, error) { + cc := newFirstComContext(prods) + for { + more := false + for _, prod := range prods.getAllProductions() { + e := cc.first.findBySymbol(prod.lhs) + changed, err := genProdFirstEntry(cc, e, prod) + if err != nil { + return nil, err + } + if changed { + more = true + } + } + if !more { + break + } + } + return cc.first, nil +} + +func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) { + if prod.isEmpty() { + return acc.addEmpty(), nil + } + + for _, sym := range prod.rhs { + if sym.IsTerminal() { + return acc.add(sym), nil + } + + e := cc.first.findBySymbol(sym) + changed := acc.mergeExceptEmpty(e) + if !e.empty { + return changed, nil + } + } + return acc.addEmpty(), nil +} + +type astActionEntry struct { + position int + expansion bool +} + +type assocType string + +const ( + assocTypeNil = assocType("") + assocTypeLeft = assocType("left") + assocTypeRight = assocType("right") +) + +const ( + precNil = 0 + precMin = 1 +) + +// precAndAssoc represents precedence and associativities of terminal symbols and productions. +// We use the priority of the production to resolve shift/reduce conflicts. +type precAndAssoc struct { + // termPrec and termAssoc represent the precedence of the terminal symbols. + termPrec map[symbol.SymbolNum]int + termAssoc map[symbol.SymbolNum]assocType + + // prodPrec and prodAssoc represent the precedence and the associativities of the production. + // These values are inherited from the right-most terminal symbols in the RHS of the productions. 
+ prodPrec map[productionNum]int + prodAssoc map[productionNum]assocType +} + +func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { + prec, ok := pa.termPrec[sym] + if !ok { + return precNil + } + + return prec +} + +func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { + assoc, ok := pa.termAssoc[sym] + if !ok { + return assocTypeNil + } + + return assoc +} + +func (pa *precAndAssoc) productionPredence(prod productionNum) int { + prec, ok := pa.prodPrec[prod] + if !ok { + return precNil + } + + return prec +} + +func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType { + assoc, ok := pa.prodAssoc[prod] + if !ok { + return assocTypeNil + } + + return assoc +} + +const reservedSymbolNameError = "error" + +type Grammar struct { + name string + lexSpec *lexical.LexSpec + skipSymbols []symbol.Symbol + productionSet *productionSet + augmentedStartSymbol symbol.Symbol + errorSymbol symbol.Symbol + symbolTable *symbol.SymbolTableReader + astActions map[productionID][]*astActionEntry + precAndAssoc *precAndAssoc + + // recoverProductions is a set of productions having the recover directive. + recoverProductions map[productionID]struct{} +} + +type buildConfig struct { + isReportingEnabled bool +} + +type BuildOption func(config *buildConfig) + +func EnableReporting() BuildOption { + return func(config *buildConfig) { + config.isReportingEnabled = true + } +} + +type GrammarBuilder struct { + AST *parser.RootNode + + errs verr.SpecErrors +} + +func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + gram, err := b.build() + if err != nil { + return nil, nil, err + } + + return compile(gram, opts...) 
+} + +func (b *GrammarBuilder) build() (*Grammar, error) { + var specName string + { + errOccurred := false + for _, dir := range b.AST.Directives { + if dir.Name != "name" { + continue + } + + if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'name' takes just one ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + + errOccurred = true + break + } + + specName = dir.Parameters[0].ID + break + } + + if specName == "" && !errOccurred { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrNoGrammarName, + }) + } + } + + b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST) + if len(b.errs) > 0 { + return nil, b.errs + } + + symTab, ss, err := b.genSymbolTable(b.AST) + if err != nil { + return nil, err + } + + lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) + if err != nil { + return nil, err + } + + prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) + if err != nil { + return nil, err + } + if prodsAndActs == nil && len(b.errs) > 0 { + return nil, b.errs + } + + pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) + if err != nil { + return nil, err + } + if pa == nil && len(b.errs) > 0 { + return nil, b.errs + } + + syms := findUsedAndUnusedSymbols(b.AST) + if syms == nil && len(b.errs) > 0 { + return nil, b.errs + } + + // When a terminal symbol that cannot be reached from the start symbol has the skip directive, + // the compiler treats its terminal as a used symbol, not unused. 
+ { + r := symTab.Reader() + for _, sym := range skip { + s, _ := r.ToText(sym) + if _, ok := syms.unusedTerminals[s]; !ok { + prod := syms.usedTerminals[s] + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrTermCannotBeSkipped, + Detail: s, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + continue + } + + delete(syms.unusedTerminals, s) + } + } + + for sym, prod := range syms.unusedProductions { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUnusedProduction, + Detail: sym, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + for sym, prod := range syms.unusedTerminals { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUnusedTerminal, + Detail: sym, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + if len(b.errs) > 0 { + return nil, b.errs + } + + return &Grammar{ + name: specName, + lexSpec: lexSpec, + skipSymbols: skip, + productionSet: prodsAndActs.prods, + augmentedStartSymbol: prodsAndActs.augStartSym, + errorSymbol: ss.errSym, + symbolTable: symTab.Reader(), + astActions: prodsAndActs.astActs, + recoverProductions: prodsAndActs.recoverProds, + precAndAssoc: pa, + }, nil +} + +type usedAndUnusedSymbols struct { + unusedProductions map[string]*parser.ProductionNode + unusedTerminals map[string]*parser.ProductionNode + usedTerminals map[string]*parser.ProductionNode +} + +func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { + prods := map[string]*parser.ProductionNode{} + lexProds := map[string]*parser.ProductionNode{} + mark := map[string]bool{} + { + for _, p := range root.Productions { + prods[p.LHS] = p + mark[p.LHS] = false + for _, alt := range p.RHS { + for _, e := range alt.Elements { + if e.ID == "" { + continue + } + mark[e.ID] = false + } + } + } + + for _, p := range root.LexProductions { + lexProds[p.LHS] = p + mark[p.LHS] = false + } + + start := root.Productions[0] + mark[start.LHS] = true + markUsedSymbols(mark, map[string]bool{}, prods, start) + + // We don't have to check the error 
symbol because the error symbol doesn't have a production. + delete(mark, reservedSymbolNameError) + } + + usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) + unusedProds := map[string]*parser.ProductionNode{} + unusedTerms := map[string]*parser.ProductionNode{} + for sym, used := range mark { + if p, ok := prods[sym]; ok { + if used { + continue + } + unusedProds[sym] = p + continue + } + if p, ok := lexProds[sym]; ok { + if used { + usedTerms[sym] = p + } else { + unusedTerms[sym] = p + } + continue + } + + // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error + // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can + // ignore it here. + } + + return &usedAndUnusedSymbols{ + usedTerminals: usedTerms, + unusedProductions: unusedProds, + unusedTerminals: unusedTerms, + } +} + +func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { + if marked[prod.LHS] { + return + } + + for _, alt := range prod.RHS { + for _, e := range alt.Elements { + if e.ID == "" { + continue + } + + mark[e.ID] = true + + p, ok := prods[e.ID] + if !ok { + continue + } + + // Remove a production to avoid inifinite recursion. + marked[prod.LHS] = true + + markUsedSymbols(mark, marked, prods, p) + } + } +} + +func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { + var ids []string + { + for _, prod := range root.Productions { + ids = append(ids, prod.LHS) + for _, alt := range prod.RHS { + for _, elem := range alt.Elements { + if elem.Label != nil { + ids = append(ids, elem.Label.Name) + } + } + } + } + for _, prod := range root.LexProductions { + ids = append(ids, prod.LHS) + } + for _, dir := range root.Directives { + dirIDs := collectUserDefinedIDsFromDirective(dir) + if len(dirIDs) > 0 { + ids = append(ids, dirIDs...) 
+ } + } + } + + duplicated := lexical.FindSpellingInconsistencies(ids) + if len(duplicated) == 0 { + return + } + + for _, dup := range duplicated { + var s string + { + var b strings.Builder + fmt.Fprintf(&b, "%+v", dup[0]) + for _, id := range dup[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + s = b.String() + } + + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrSpellingInconsistency, + Detail: s, + }) + } +} + +func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { + var ids []string + for _, param := range dir.Parameters { + if param.Group != nil { + for _, d := range param.Group { + dIDs := collectUserDefinedIDsFromDirective(d) + if len(dIDs) > 0 { + ids = append(ids, dIDs...) + } + } + } + if param.OrderedSymbol != "" { + ids = append(ids, param.OrderedSymbol) + } + } + return ids +} + +type symbols struct { + errSym symbol.Symbol + augStartSym symbol.Symbol + startSym symbol.Symbol +} + +func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { + symTab := symbol.NewSymbolTable() + w := symTab.Writer() + r := symTab.Reader() + + // We need to register the reserved symbol before registering others. 
+ var errSym symbol.Symbol + { + sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) + if err != nil { + return nil, nil, err + } + errSym = sym + } + + for _, prod := range root.LexProductions { + if sym, exist := r.ToSymbol(prod.LHS); exist { + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateTerminal, + Detail: prod.LHS, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + continue + } + + _, err := w.RegisterTerminalSymbol(prod.LHS) + if err != nil { + return nil, nil, err + } + } + + startProd := root.Productions[0] + augStartText := fmt.Sprintf("%s'", startProd.LHS) + var err error + augStartSym, err := w.RegisterStartSymbol(augStartText) + if err != nil { + return nil, nil, err + } + if augStartSym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: startProd.Pos.Row, + Col: startProd.Pos.Col, + }) + } + + startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) + if err != nil { + return nil, nil, err + } + if startSym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: startProd.Pos.Row, + Col: startProd.Pos.Col, + }) + } + + for _, prod := range root.Productions { + sym, err := w.RegisterNonTerminalSymbol(prod.LHS) + if err != nil { + return nil, nil, err + } + if sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateName, + Detail: prod.LHS, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + } + + return symTab, &symbols{ + errSym: errSym, + augStartSym: augStartSym, + startSym: startSym, + }, nil +} + +func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) 
(*lexical.LexSpec, []symbol.Symbol, error) { + entries := []*lexical.LexEntry{} + skipSyms := []symbol.Symbol{} + for _, prod := range root.LexProductions { + entry, skip, specErr, err := genLexEntry(prod) + if err != nil { + return nil, nil, err + } + if specErr != nil { + b.errs = append(b.errs, specErr) + continue + } + if skip { + sym, _ := symTab.ToSymbol(prod.LHS) + skipSyms = append(skipSyms, sym) + } + entries = append(entries, entry) + } + + checkedFragments := map[string]struct{}{} + for _, fragment := range root.Fragments { + if _, exist := checkedFragments[fragment.LHS]; exist { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateFragment, + Detail: fragment.LHS, + Row: fragment.Pos.Row, + Col: fragment.Pos.Col, + }) + continue + } + checkedFragments[fragment.LHS] = struct{}{} + + entries = append(entries, &lexical.LexEntry{ + Fragment: true, + Kind: spec.LexKindName(fragment.LHS), + Pattern: fragment.RHS, + }) + } + + return &lexical.LexSpec{ + Entries: entries, + }, skipSyms, nil +} + +func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { + alt := prod.RHS[0] + elem := alt.Elements[0] + + var pattern string + if elem.Literally { + pattern = spec.EscapePattern(elem.Pattern) + } else { + pattern = elem.Pattern + } + + var modes []spec.LexModeName + var skip bool + var push spec.LexModeName + var pop bool + dirConsumed := map[string]struct{}{} + for _, dir := range prod.Directives { + if _, consumed := dirConsumed[dir.Name]; consumed { + return nil, false, &verr.SpecError{ + Cause: semErrDuplicateDir, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + dirConsumed[dir.Name] = struct{}{} + + switch dir.Name { + case "mode": + if len(dir.Parameters) == 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'mode' directive needs an ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + for _, param := range dir.Parameters { + if 
param.ID == "" { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'mode' directive needs an ID parameter", + Row: param.Pos.Row, + Col: param.Pos.Col, + }, nil + } + modes = append(modes, spec.LexModeName(param.ID)) + } + case "skip": + if len(dir.Parameters) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'skip' directive needs no parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + skip = true + case "push": + if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'push' directive needs an ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + push = spec.LexModeName(dir.Parameters[0].ID) + case "pop": + if len(dir.Parameters) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'pop' directive needs no parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + pop = true + default: + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + } + + if len(alt.Directives) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrInvalidAltDir, + Detail: "a lexical production cannot have alternative directives", + Row: alt.Directives[0].Pos.Row, + Col: alt.Directives[0].Pos.Col, + }, nil + } + + return &lexical.LexEntry{ + Modes: modes, + Kind: spec.LexKindName(prod.LHS), + Pattern: pattern, + Push: push, + Pop: pop, + }, skip, nil, nil +} + +type productionsAndActions struct { + prods *productionSet + augStartSym symbol.Symbol + astActs map[productionID][]*astActionEntry + prodPrecsTerm map[productionID]symbol.Symbol + prodPrecsOrdSym map[productionID]string + prodPrecPoss map[productionID]*parser.Position + recoverProds map[productionID]struct{} +} + +func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, 
errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { + if len(root.Productions) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrNoProduction, + }) + return nil, nil + } + + prods := newProductionSet() + astActs := map[productionID][]*astActionEntry{} + prodPrecsTerm := map[productionID]symbol.Symbol{} + prodPrecsOrdSym := map[productionID]string{} + prodPrecPoss := map[productionID]*parser.Position{} + recoverProds := map[productionID]struct{}{} + + p, err := newProduction(augStartSym, []symbol.Symbol{ + startSym, + }) + if err != nil { + return nil, err + } + + prods.append(p) + + for _, prod := range root.Productions { + lhsSym, ok := symTab.ToSymbol(prod.LHS) + if !ok { + // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. + return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) + } + + if len(prod.Directives) > 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrInvalidProdDir, + Detail: "a production cannot have production directives", + Row: prod.Directives[0].Pos.Row, + Col: prod.Directives[0].Pos.Col, + }) + continue + } + + LOOP_RHS: + for _, alt := range prod.RHS { + altSyms := make([]symbol.Symbol, len(alt.Elements)) + offsets := map[string]int{} + ambiguousIDOffsets := map[string]struct{}{} + for i, elem := range alt.Elements { + sym, ok := symTab.ToSymbol(elem.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedSym, + Detail: elem.ID, + Row: elem.Pos.Row, + Col: elem.Pos.Col, + }) + continue LOOP_RHS + } + altSyms[i] = sym + + if elem.Label != nil { + if _, added := offsets[elem.Label.Name]; added { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateLabel, + Detail: elem.Label.Name, + Row: elem.Label.Pos.Row, + Col: elem.Label.Pos.Col, + }) + continue LOOP_RHS + } + if _, found := symTab.ToSymbol(elem.Label.Name); found { + b.errs = append(b.errs, &verr.SpecError{ + Cause: 
semErrInvalidLabel, + Detail: elem.Label.Name, + Row: elem.Label.Pos.Row, + Col: elem.Label.Pos.Col, + }) + continue LOOP_RHS + } + offsets[elem.Label.Name] = i + } + // A symbol having a label can be specified by both the label and the symbol name. + // So record the symbol's position, whether or not it has a label. + if elem.ID != "" { + if _, exist := offsets[elem.ID]; exist { + // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need + // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify + // a label to resolve the ambiguity. + delete(offsets, elem.ID) + ambiguousIDOffsets[elem.ID] = struct{}{} + } else { + offsets[elem.ID] = i + } + } + } + + p, err := newProduction(lhsSym, altSyms) + if err != nil { + return nil, err + } + if _, exist := prods.findByID(p.id); exist { + // Report the line number of a duplicate alternative. + // When the alternative is empty, we report the position of its LHS. 
+ var row int + var col int + if len(alt.Elements) > 0 { + row = alt.Elements[0].Pos.Row + col = alt.Elements[0].Pos.Col + } else { + row = prod.Pos.Row + col = prod.Pos.Col + } + + var detail string + { + var b strings.Builder + fmt.Fprintf(&b, "%v →", prod.LHS) + for _, elem := range alt.Elements { + switch { + case elem.ID != "": + fmt.Fprintf(&b, " %v", elem.ID) + case elem.Pattern != "": + fmt.Fprintf(&b, ` "%v"`, elem.Pattern) + } + } + if len(alt.Elements) == 0 { + fmt.Fprintf(&b, " ε") + } + + detail = b.String() + } + + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateProduction, + Detail: detail, + Row: row, + Col: col, + }) + continue LOOP_RHS + } + prods.append(p) + + dirConsumed := map[string]struct{}{} + for _, dir := range alt.Directives { + if _, consumed := dirConsumed[dir.Name]; consumed { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateDir, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + } + dirConsumed[dir.Name] = struct{}{} + + switch dir.Name { + case "ast": + if len(dir.Parameters) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'ast' directive needs at least one parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + astAct := make([]*astActionEntry, len(dir.Parameters)) + consumedOffsets := map[int]struct{}{} + for i, param := range dir.Parameters { + if param.ID == "" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'ast' directive can take only ID parameters", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + + if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrAmbiguousElem, + Detail: fmt.Sprintf("'%v' is ambiguous", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + + offset, ok := offsets[param.ID] + if !ok { + b.errs = append(b.errs, &verr.SpecError{ 
+ Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + if _, consumed := consumedOffsets[offset]; consumed { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateElem, + Detail: param.ID, + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + consumedOffsets[offset] = struct{}{} + + if param.Expansion { + elem := alt.Elements[offset] + if elem.Pattern != "" { + // Currently, it is a bug to reach here because it is + // forbidden to have anything other than ID appear in + // production rules. + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + elemSym, ok := symTab.ToSymbol(elem.ID) + if !ok { + // If the symbol was not found, it's a bug. 
+ return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) + } + if elemSym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + } + + astAct[i] = &astActionEntry{ + position: offset + 1, + expansion: param.Expansion, + } + } + astActs[p.id] = astAct + case "prec": + if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'prec' directive needs just one ID parameter or ordered symbol", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + param := dir.Parameters[0] + switch { + case param.ID != "": + sym, ok := symTab.ToSymbol(param.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + } + if !sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + prodPrecsTerm[p.id] = sym + prodPrecPoss[p.id] = ¶m.Pos + case param.OrderedSymbol != "": + prodPrecsOrdSym[p.id] = param.OrderedSymbol + prodPrecPoss[p.id] = ¶m.Pos + } + case "recover": + if len(dir.Parameters) > 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'recover' directive needs no parameter", 
+ Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + recoverProds[p.id] = struct{}{} + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name), + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + } + } + } + + return &productionsAndActions{ + prods: prods, + augStartSym: augStartSym, + astActs: astActs, + prodPrecsTerm: prodPrecsTerm, + prodPrecsOrdSym: prodPrecsOrdSym, + prodPrecPoss: prodPrecPoss, + recoverProds: recoverProds, + }, nil +} + +func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { + termPrec := map[symbol.SymbolNum]int{} + termAssoc := map[symbol.SymbolNum]assocType{} + ordSymPrec := map[string]int{} + { + var precGroup []*parser.DirectiveNode + for _, dir := range b.AST.Directives { + if dir.Name == "prec" { + if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'prec' needs just one directive group", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue + } + precGroup = dir.Parameters[0].Group + continue + } + + if dir.Name != "name" && dir.Name != "prec" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue + } + } + + precN := precMin + for _, dir := range precGroup { + var assocTy assocType + switch dir.Name { + case "left": + assocTy = assocTypeLeft + case "right": + assocTy = assocTypeRight + case "assign": + assocTy = assocTypeNil + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + return nil, nil + } + + if len(dir.Parameters) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: 
semErrDirInvalidParam, + Detail: "associativity needs at least one symbol", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + return nil, nil + } + ASSOC_PARAM_LOOP: + for _, p := range dir.Parameters { + switch { + case p.ID != "": + sym, ok := symTab.ToSymbol(p.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' is undefined", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if !sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if prec, alreadySet := termPrec[sym.Num()]; alreadySet { + if prec == precN { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } + break ASSOC_PARAM_LOOP + } + + termPrec[sym.Num()] = precN + termAssoc[sym.Num()] = assocTy + case p.OrderedSymbol != "": + if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { + if prec == precN { + b.errs = append(b.errs, &verr.SpecError{ + Cause: 
semErrDuplicateAssoc, + Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } + break ASSOC_PARAM_LOOP + } + + ordSymPrec[p.OrderedSymbol] = precN + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "a parameter must be an ID or an ordered symbol", + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + } + + precN++ + } + } + if len(b.errs) > 0 { + return nil, nil + } + + prodPrec := map[productionNum]int{} + prodAssoc := map[productionNum]assocType{} + for _, prod := range prodsAndActs.prods.getAllProductions() { + // A #prec directive changes only precedence, not associativity. + if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { + if prec, ok := termPrec[term.Num()]; ok { + prodPrec[prod.num] = prec + prodAssoc[prod.num] = assocTypeNil + } else { + text, _ := symTab.ToText(term) + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedPrec, + Detail: text, + Row: prodsAndActs.prodPrecPoss[prod.id].Row, + Col: prodsAndActs.prodPrecPoss[prod.id].Col, + }) + } + } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok { + if prec, ok := ordSymPrec[ordSym]; ok { + prodPrec[prod.num] = prec + prodAssoc[prod.num] = assocTypeNil + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedOrdSym, + Detail: fmt.Sprintf("$%v", ordSym), + Row: prodsAndActs.prodPrecPoss[prod.id].Row, + Col: prodsAndActs.prodPrecPoss[prod.id].Col, + }) + } + } else { + // A production inherits precedence and associativity from the right-most terminal symbol. 
+ mostrightTerm := symbol.SymbolNil + for _, sym := range prod.rhs { + if !sym.IsTerminal() { + continue + } + mostrightTerm = sym + } + if !mostrightTerm.IsNil() { + prodPrec[prod.num] = termPrec[mostrightTerm.Num()] + prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] + } + } + } + if len(b.errs) > 0 { + return nil, nil + } + + return &precAndAssoc{ + termPrec: termPrec, + termAssoc: termAssoc, + prodPrec: prodPrec, + prodAssoc: prodAssoc, + }, nil +} + +func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + config := &buildConfig{} + for _, opt := range opts { + opt(config) + } + + lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) + if err != nil { + if len(cErrs) > 0 { + var b strings.Builder + writeCompileError(&b, cErrs[0]) + for _, cerr := range cErrs[1:] { + fmt.Fprintf(&b, "\n") + writeCompileError(&b, cerr) + } + return nil, nil, fmt.Errorf(b.String()) + } + return nil, nil, err + } + + kind2Term := make([]int, len(lexSpec.KindNames)) + for i, k := range lexSpec.KindNames { + if k == spec.LexKindNameNil { + kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() + continue + } + + sym, ok := gram.symbolTable.ToSymbol(k.String()) + if !ok { + return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) + } + kind2Term[i] = sym.Num().Int() + } + + termTexts, err := gram.symbolTable.TerminalTexts() + if err != nil { + return nil, nil, err + } + + var termSkip []int + { + r := gram.symbolTable.Reader() + // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), + // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts + // because it does not contain a predefined symbol, like EOF. + // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. 
+ termSkip = make([]int, len(termTexts)) + for _, t := range termTexts { + s, _ := r.ToSymbol(t) + for _, sk := range gram.skipSymbols { + if s != sk { + continue + } + termSkip[s.Num()] = 1 + break + } + } + } + + nonTerms, err := gram.symbolTable.NonTerminalTexts() + if err != nil { + return nil, nil, err + } + + firstSet, err := genFirstSet(gram.productionSet) + if err != nil { + return nil, nil, err + } + + lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol) + if err != nil { + return nil, nil, err + } + + var tab *ParsingTable + var report *spec.Report + { + lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet) + if err != nil { + return nil, nil, err + } + + b := &lrTableBuilder{ + automaton: lalr1.lr0Automaton, + prods: gram.productionSet, + termCount: len(termTexts), + nonTermCount: len(nonTerms), + symTab: gram.symbolTable, + precAndAssoc: gram.precAndAssoc, + } + tab, err = b.build() + if err != nil { + return nil, nil, err + } + + if config.isReportingEnabled { + report, err = b.genReport(tab, gram) + if err != nil { + return nil, nil, err + } + } + } + + action := make([]int, len(tab.actionTable)) + for i, e := range tab.actionTable { + action[i] = int(e) + } + goTo := make([]int, len(tab.goToTable)) + for i, e := range tab.goToTable { + goTo[i] = int(e) + } + + lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1) + altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1) + recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) + astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) + for _, p := range gram.productionSet.getAllProductions() { + lhsSyms[p.num] = p.lhs.Num().Int() + altSymCounts[p.num] = p.rhsLen + + if _, ok := gram.recoverProductions[p.id]; ok { + recoverProds[p.num] = 1 + } + + astAct, ok := gram.astActions[p.id] + if !ok { + continue + } + astActEntry := make([]int, len(astAct)) + for i, e := range astAct 
{ + if e.expansion { + astActEntry[i] = e.position * -1 + } else { + astActEntry[i] = e.position + } + } + astActEnties[p.num] = astActEntry + } + + return &spec.CompiledGrammar{ + Name: gram.name, + Lexical: lexSpec, + Syntactic: &spec.SyntacticSpec{ + Action: action, + GoTo: goTo, + StateCount: tab.stateCount, + InitialState: tab.InitialState.Int(), + StartProduction: productionNumStart.Int(), + LHSSymbols: lhsSyms, + AlternativeSymbolCounts: altSymCounts, + Terminals: termTexts, + TerminalCount: tab.terminalCount, + TerminalSkip: termSkip, + KindToTerminal: kind2Term, + NonTerminals: nonTerms, + NonTerminalCount: tab.nonTerminalCount, + EOFSymbol: symbol.SymbolEOF.Num().Int(), + ErrorSymbol: gram.errorSymbol.Num().Int(), + ErrorTrapperStates: tab.errorTrapperStates, + RecoverProductions: recoverProds, + }, + ASTAction: &spec.ASTAction{ + Entries: astActEnties, + }, + }, report, nil +} + +func writeCompileError(w io.Writer, cErr *lexical.CompileError) { + if cErr.Fragment { + fmt.Fprintf(w, "fragment ") + } + fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause) + if cErr.Detail != "" { + fmt.Fprintf(w, ": %v", cErr.Detail) + } +} + +type lrItemID [32]byte + +func (id lrItemID) String() string { + return fmt.Sprintf("%x", id.num()) +} + +func (id lrItemID) num() uint32 { + return binary.LittleEndian.Uint32(id[:]) +} + +type lookAhead struct { + symbols map[symbol.Symbol]struct{} + + // When propagation is true, an item propagates look-ahead symbols to other items. + propagation bool +} + +type lrItem struct { + id lrItemID + prod productionID + + // E → E + T + // + // Dot | Dotted Symbol | Item + // ----+---------------+------------ + // 0 | E | E →・E + T + // 1 | + | E → E・+ T + // 2 | T | E → E +・T + // 3 | Nil | E → E + T・ + dot int + dottedSymbol symbol.Symbol + + // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. + // It looks like S' →・S. + initial bool + + // When reducible is true, the item looks like E → E + T・. 
+ reducible bool + + // When kernel is true, the item is kernel item. + kernel bool + + // lookAhead stores look-ahead symbols, and they are terminal symbols. + // The item is reducible only when the look-ahead symbols appear as the next input symbol. + lookAhead lookAhead +} + +func newLR0Item(prod *production, dot int) (*lrItem, error) { + if prod == nil { + return nil, fmt.Errorf("production must be non-nil") + } + + if dot < 0 || dot > prod.rhsLen { + return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) + } + + var id lrItemID + { + b := []byte{} + b = append(b, prod.id[:]...) + bDot := make([]byte, 8) + binary.LittleEndian.PutUint64(bDot, uint64(dot)) + b = append(b, bDot...) + id = sha256.Sum256(b) + } + + dottedSymbol := symbol.SymbolNil + if dot < prod.rhsLen { + dottedSymbol = prod.rhs[dot] + } + + initial := false + if prod.lhs.IsStart() && dot == 0 { + initial = true + } + + reducible := false + if dot == prod.rhsLen { + reducible = true + } + + kernel := false + if initial || dot > 0 { + kernel = true + } + + item := &lrItem{ + id: id, + prod: prod.id, + dot: dot, + dottedSymbol: dottedSymbol, + initial: initial, + reducible: reducible, + kernel: kernel, + } + + return item, nil +} + +type kernelID [32]byte + +func (id kernelID) String() string { + return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) +} + +type kernel struct { + id kernelID + items []*lrItem +} + +func newKernel(items []*lrItem) (*kernel, error) { + if len(items) == 0 { + return nil, fmt.Errorf("a kernel need at least one item") + } + + // Remove duplicates from items. 
+ var sortedItems []*lrItem + { + m := map[lrItemID]*lrItem{} + for _, item := range items { + if !item.kernel { + return nil, fmt.Errorf("not a kernel item: %v", item) + } + m[item.id] = item + } + sortedItems = []*lrItem{} + for _, item := range m { + sortedItems = append(sortedItems, item) + } + sort.Slice(sortedItems, func(i, j int) bool { + return sortedItems[i].id.num() < sortedItems[j].id.num() + }) + } + + var id kernelID + { + b := []byte{} + for _, item := range sortedItems { + b = append(b, item.id[:]...) + } + id = sha256.Sum256(b) + } + + return &kernel{ + id: id, + items: sortedItems, + }, nil +} + +type stateNum int + +const stateNumInitial = stateNum(0) + +func (n stateNum) Int() int { + return int(n) +} + +func (n stateNum) String() string { + return strconv.Itoa(int(n)) +} + +func (n stateNum) next() stateNum { + return stateNum(n + 1) +} + +type lrState struct { + *kernel + num stateNum + next map[symbol.Symbol]kernelID + reducible map[productionID]struct{} + + // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. + // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store + // look-ahead symbols because the kernel items don't include these items. + // + // For instance, we have the following productions, and A is a terminal symbol. + // + // s' → s + // s → A | ε + // + // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't + // include `s → ・ε`. + // + // s' → ・s + // s → ・A + // s → ・ε + emptyProdItems []*lrItem + + // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. + // The `α` and `β` can be empty. 
// lrState is one state of the LR automaton: a kernel plus the transition and
// reduction information derived from its closure.
type lrState struct {
	*kernel
	num stateNum
	// next maps a symbol to the kernel (state) reached by shifting it.
	next map[symbol.Symbol]kernelID
	// reducible is the set of productions this state can reduce by.
	reducible map[productionID]struct{}

	// emptyProdItems stores items that have an empty production like `p → ε` and is reducible.
	// Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store
	// look-ahead symbols because the kernel items don't include these items.
	//
	// For instance, we have the following productions, and A is a terminal symbol.
	//
	//   s' → s
	//   s  → A | ε
	//
	// CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't
	// include `s → ・ε`.
	//
	//   s' → ・s
	//   s  → ・A
	//   s  → ・ε
	emptyProdItems []*lrItem

	// When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form.
	// The `α` and `β` can be empty.
	//
	//   A → α・error β
	isErrorTrapper bool
}

// stateAndLRItem addresses a single item inside a single state.
type stateAndLRItem struct {
	kernelID kernelID
	itemID   lrItemID
}

// propagation records that look-ahead symbols of src must flow to every item
// in dest (the "propagate" edges of the LALR(1) look-ahead computation).
type propagation struct {
	src  *stateAndLRItem
	dest []*stateAndLRItem
}

// lalr1Automaton is an lr0Automaton whose items carry LALR(1) look-ahead sets.
type lalr1Automaton struct {
	*lr0Automaton
}

// genLALR1Automaton upgrades an LR(0) automaton to LALR(1) in place by
// computing look-ahead sets: spontaneous look-aheads are written directly
// into the target items, propagation edges are collected into props, and
// propagateLookAhead then iterates them to a fixed point.
func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) {
	// Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $]
	iniState := lr0.states[lr0.initialState]
	iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{
		symbol.SymbolEOF: {},
	}

	var props []*propagation
	for _, state := range lr0.states {
		for _, kItem := range state.items {
			// Closure over the kernel item with a dummy look-ahead; items
			// flagged with propagation=true inherit from kItem, the rest
			// carry spontaneously generated look-aheads.
			items, err := genLALR1Closure(kItem, prods, first)
			if err != nil {
				return nil, err
			}

			// Every kernel item propagates its own look-aheads to itself.
			kItem.lookAhead.propagation = true

			var propDests []*stateAndLRItem
			for _, item := range items {
				if item.reducible {
					p, ok := prods.findByID(item.prod)
					if !ok {
						return nil, fmt.Errorf("production not found: %v", item.prod)
					}

					if p.isEmpty() {
						// Reducible ε-items live in emptyProdItems, not in
						// the kernel; merge the look-aheads there and also
						// register a propagation edge to them.
						var reducibleItem *lrItem
						for _, it := range state.emptyProdItems {
							if it.id != item.id {
								continue
							}

							reducibleItem = it
							break
						}
						if reducibleItem == nil {
							return nil, fmt.Errorf("reducible item not found: %v", item.id)
						}
						if reducibleItem.lookAhead.symbols == nil {
							reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
						}
						for a := range item.lookAhead.symbols {
							reducibleItem.lookAhead.symbols[a] = struct{}{}
						}

						propDests = append(propDests, &stateAndLRItem{
							kernelID: state.id,
							itemID:   item.id,
						})
					}

					// A reducible item has no dotted symbol; nothing to shift.
					continue
				}

				// Identify the kernel item reached by shifting the dotted
				// symbol (same production, dot advanced by one).
				nextKID := state.next[item.dottedSymbol]
				var nextItemID lrItemID
				{
					p, ok := prods.findByID(item.prod)
					if !ok {
						return nil, fmt.Errorf("production not found: %v", item.prod)
					}
					it, err := newLR0Item(p, item.dot+1)
					if err != nil {
						return nil, fmt.Errorf("failed to generate an item ID: %v", err)
					}
					nextItemID = it.id
				}

				if item.lookAhead.propagation {
					// Look-aheads of kItem propagate to the successor item.
					propDests = append(propDests, &stateAndLRItem{
						kernelID: nextKID,
						itemID:   nextItemID,
					})
				} else {
					// Spontaneous look-ahead: write it into the successor
					// item immediately.
					nextState := lr0.states[nextKID]
					var nextItem *lrItem
					for _, it := range nextState.items {
						if it.id != nextItemID {
							continue
						}
						nextItem = it
						break
					}
					if nextItem == nil {
						return nil, fmt.Errorf("item not found: %v", nextItemID)
					}

					if nextItem.lookAhead.symbols == nil {
						nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
					}

					for a := range item.lookAhead.symbols {
						nextItem.lookAhead.symbols[a] = struct{}{}
					}
				}
			}
			if len(propDests) == 0 {
				continue
			}

			props = append(props, &propagation{
				src: &stateAndLRItem{
					kernelID: state.id,
					itemID:   kItem.id,
				},
				dest: propDests,
			})
		}
	}

	err := propagateLookAhead(lr0, props)
	if err != nil {
		return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err)
	}

	return &lalr1Automaton{
		lr0Automaton: lr0,
	}, nil
}
// genLALR1Closure computes the LALR(1) closure of a single kernel item.
// For each closure item it determines which look-ahead symbols are generated
// spontaneously (recorded in lookAhead.symbols) and whether the item also
// inherits look-aheads from srcItem (recorded as lookAhead.propagation).
// The worklist loop runs until no new (item, look-ahead) pair appears.
func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) {
	items := []*lrItem{}
	// knownItems tracks which (item, look-ahead symbol) pairs were emitted.
	knownItems := map[lrItemID]map[symbol.Symbol]struct{}{}
	// knownItemsProp tracks which items were already emitted as propagating.
	knownItemsProp := map[lrItemID]struct{}{}
	uncheckedItems := []*lrItem{}
	items = append(items, srcItem)
	uncheckedItems = append(uncheckedItems, srcItem)
	for len(uncheckedItems) > 0 {
		nextUncheckedItems := []*lrItem{}
		for _, item := range uncheckedItems {
			// Only a non-terminal dotted symbol expands the closure.
			if item.dottedSymbol.IsTerminal() {
				continue
			}

			p, ok := prods.findByID(item.prod)
			if !ok {
				return nil, fmt.Errorf("production not found: %v", item.prod)
			}

			// FIRST(β) for the tail after the dotted symbol; when it is
			// nullable, the item's own look-aheads also apply.
			var fstSyms []symbol.Symbol
			var isFstNullable bool
			{
				fst, err := first.find(p, item.dot+1)
				if err != nil {
					return nil, err
				}

				fstSyms = make([]symbol.Symbol, len(fst.symbols))
				i := 0
				for s := range fst.symbols {
					fstSyms[i] = s
					i++
				}
				if fst.empty {
					isFstNullable = true
				}
			}

			ps, _ := prods.findByLHS(item.dottedSymbol)
			for _, prod := range ps {
				// Collect the spontaneous look-ahead symbols for this new item.
				var lookAhead []symbol.Symbol
				{
					var lookAheadCount int
					if isFstNullable {
						lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols)
					} else {
						lookAheadCount = len(fstSyms)
					}

					lookAhead = make([]symbol.Symbol, lookAheadCount)
					i := 0
					for _, s := range fstSyms {
						lookAhead[i] = s
						i++
					}
					if isFstNullable {
						for a := range item.lookAhead.symbols {
							lookAhead[i] = a
							i++
						}
					}
				}

				// One closure item instance per (production, look-ahead) pair.
				for _, a := range lookAhead {
					newItem, err := newLR0Item(prod, 0)
					if err != nil {
						return nil, err
					}
					if items, exist := knownItems[newItem.id]; exist {
						if _, exist := items[a]; exist {
							continue
						}
					}

					newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{
						a: {},
					}

					items = append(items, newItem)
					if knownItems[newItem.id] == nil {
						knownItems[newItem.id] = map[symbol.Symbol]struct{}{}
					}
					knownItems[newItem.id][a] = struct{}{}
					nextUncheckedItems = append(nextUncheckedItems, newItem)
				}

				// A nullable tail means look-aheads of srcItem flow through:
				// emit a propagating instance of the item as well.
				if isFstNullable {
					newItem, err := newLR0Item(prod, 0)
					if err != nil {
						return nil, err
					}
					if _, exist := knownItemsProp[newItem.id]; exist {
						continue
					}

					newItem.lookAhead.propagation = true

					items = append(items, newItem)
					knownItemsProp[newItem.id] = struct{}{}
					nextUncheckedItems = append(nextUncheckedItems, newItem)
				}
			}
		}
		uncheckedItems = nextUncheckedItems
	}

	return items, nil
}

// propagateLookAhead repeatedly pushes look-ahead symbols along the recorded
// propagation edges until a pass makes no change (fixed point). Destination
// items are searched in the kernel first, then among the ε-production items.
func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error {
	for {
		changed := false
		for _, prop := range props {
			srcState, ok := lr0.states[prop.src.kernelID]
			if !ok {
				return fmt.Errorf("source state not found: %v", prop.src.kernelID)
			}
			var srcItem *lrItem
			for _, item := range srcState.items {
				if item.id != prop.src.itemID {
					continue
				}
				srcItem = item
				break
			}
			if srcItem == nil {
				return fmt.Errorf("source item not found: %v", prop.src.itemID)
			}

			for _, dest := range prop.dest {
				destState, ok := lr0.states[dest.kernelID]
				if !ok {
					return fmt.Errorf("destination state not found: %v", dest.kernelID)
				}
				var destItem *lrItem
				for _, item := range destState.items {
					if item.id != dest.itemID {
						continue
					}
					destItem = item
					break
				}
				if destItem == nil {
					// Not a kernel item: look among reducible ε-items.
					for _, item := range destState.emptyProdItems {
						if item.id != dest.itemID {
							continue
						}
						destItem = item
						break
					}
					if destItem == nil {
						return fmt.Errorf("destination item not found: %v", dest.itemID)
					}
				}

				for a := range srcItem.lookAhead.symbols {
					if _, ok := destItem.lookAhead.symbols[a]; ok {
						continue
					}

					if destItem.lookAhead.symbols == nil {
						destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
					}

					destItem.lookAhead.symbols[a] = struct{}{}
					changed = true
				}
			}
		}
		if !changed {
			break
		}
	}

	return nil
}

// lr0Automaton is the LR(0) state machine: states keyed by kernel ID plus
// the ID of the initial state.
type lr0Automaton struct {
	initialState kernelID
	states       map[kernelID]*lrState
}
+ { + prods, _ := prods.findByLHS(startSym) + initialItem, err := newLR0Item(prods[0], 0) + if err != nil { + return nil, err + } + + k, err := newKernel([]*lrItem{initialItem}) + if err != nil { + return nil, err + } + + automaton.initialState = k.id + knownKernels[k.id] = struct{}{} + uncheckedKernels = append(uncheckedKernels, k) + } + + for len(uncheckedKernels) > 0 { + nextUncheckedKernels := []*kernel{} + for _, k := range uncheckedKernels { + state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym) + if err != nil { + return nil, err + } + state.num = currentState + currentState = currentState.next() + + automaton.states[state.id] = state + + for _, k := range neighbours { + if _, known := knownKernels[k.id]; known { + continue + } + knownKernels[k.id] = struct{}{} + nextUncheckedKernels = append(nextUncheckedKernels, k) + } + } + uncheckedKernels = nextUncheckedKernels + } + + return automaton, nil +} + +func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) { + items, err := genLR0Closure(k, prods) + if err != nil { + return nil, nil, err + } + neighbours, err := genNeighbourKernels(items, prods) + if err != nil { + return nil, nil, err + } + + next := map[symbol.Symbol]kernelID{} + kernels := []*kernel{} + for _, n := range neighbours { + next[n.symbol] = n.kernel.id + kernels = append(kernels, n.kernel) + } + + reducible := map[productionID]struct{}{} + var emptyProdItems []*lrItem + isErrorTrapper := false + for _, item := range items { + if item.dottedSymbol == errSym { + isErrorTrapper = true + } + + if item.reducible { + reducible[item.prod] = struct{}{} + + prod, ok := prods.findByID(item.prod) + if !ok { + return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod) + } + if prod.isEmpty() { + emptyProdItems = append(emptyProdItems, item) + } + } + } + + return &lrState{ + kernel: k, + next: next, + reducible: reducible, + emptyProdItems: emptyProdItems, + 
isErrorTrapper: isErrorTrapper, + }, kernels, nil +} + +func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { + items := []*lrItem{} + knownItems := map[lrItemID]struct{}{} + uncheckedItems := []*lrItem{} + for _, item := range k.items { + items = append(items, item) + uncheckedItems = append(uncheckedItems, item) + } + for len(uncheckedItems) > 0 { + nextUncheckedItems := []*lrItem{} + for _, item := range uncheckedItems { + if item.dottedSymbol.IsTerminal() { + continue + } + + ps, _ := prods.findByLHS(item.dottedSymbol) + for _, prod := range ps { + item, err := newLR0Item(prod, 0) + if err != nil { + return nil, err + } + if _, exist := knownItems[item.id]; exist { + continue + } + items = append(items, item) + knownItems[item.id] = struct{}{} + nextUncheckedItems = append(nextUncheckedItems, item) + } + } + uncheckedItems = nextUncheckedItems + } + + return items, nil +} + +type neighbourKernel struct { + symbol symbol.Symbol + kernel *kernel +} + +func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { + kItemMap := map[symbol.Symbol][]*lrItem{} + for _, item := range items { + if item.dottedSymbol.IsNil() { + continue + } + prod, ok := prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("a production was not found: %v", item.prod) + } + kItem, err := newLR0Item(prod, item.dot+1) + if err != nil { + return nil, err + } + kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) + } + + nextSyms := []symbol.Symbol{} + for sym := range kItemMap { + nextSyms = append(nextSyms, sym) + } + sort.Slice(nextSyms, func(i, j int) bool { + return nextSyms[i] < nextSyms[j] + }) + + kernels := []*neighbourKernel{} + for _, sym := range nextSyms { + k, err := newKernel(kItemMap[sym]) + if err != nil { + return nil, err + } + kernels = append(kernels, &neighbourKernel{ + symbol: sym, + kernel: k, + }) + } + + return kernels, nil +} + +type ActionType string + +const ( + ActionTypeShift = 
ActionType("shift") + ActionTypeReduce = ActionType("reduce") + ActionTypeError = ActionType("error") +) + +type actionEntry int + +const actionEntryEmpty = actionEntry(0) + +func newShiftActionEntry(state stateNum) actionEntry { + return actionEntry(state * -1) +} + +func newReduceActionEntry(prod productionNum) actionEntry { + return actionEntry(prod) +} + +func (e actionEntry) isEmpty() bool { + return e == actionEntryEmpty +} + +func (e actionEntry) describe() (ActionType, stateNum, productionNum) { + if e == actionEntryEmpty { + return ActionTypeError, stateNumInitial, productionNumNil + } + if e < 0 { + return ActionTypeShift, stateNum(e * -1), productionNumNil + } + return ActionTypeReduce, stateNumInitial, productionNum(e) +} + +type GoToType string + +const ( + GoToTypeRegistered = GoToType("registered") + GoToTypeError = GoToType("error") +) + +type goToEntry uint + +const goToEntryEmpty = goToEntry(0) + +func newGoToEntry(state stateNum) goToEntry { + return goToEntry(state) +} + +func (e goToEntry) describe() (GoToType, stateNum) { + if e == goToEntryEmpty { + return GoToTypeError, stateNumInitial + } + return GoToTypeRegistered, stateNum(e) +} + +type conflictResolutionMethod int + +func (m conflictResolutionMethod) Int() int { + return int(m) +} + +const ( + ResolvedByPrec conflictResolutionMethod = 1 + ResolvedByAssoc conflictResolutionMethod = 2 + ResolvedByShift conflictResolutionMethod = 3 + ResolvedByProdOrder conflictResolutionMethod = 4 +) + +type conflict interface { + conflict() +} + +type shiftReduceConflict struct { + state stateNum + sym symbol.Symbol + nextState stateNum + prodNum productionNum + resolvedBy conflictResolutionMethod +} + +func (c *shiftReduceConflict) conflict() { +} + +type reduceReduceConflict struct { + state stateNum + sym symbol.Symbol + prodNum1 productionNum + prodNum2 productionNum + resolvedBy conflictResolutionMethod +} + +func (c *reduceReduceConflict) conflict() { +} + +var ( + _ conflict = 
// Compile-time checks that both conflict kinds satisfy the conflict interface.
var (
	_ conflict = &shiftReduceConflict{}
	_ conflict = &reduceReduceConflict{}
)

// ParsingTable is the flattened LALR(1) action and goto table.
// Cells are addressed as row*count+col (row = state number).
type ParsingTable struct {
	actionTable      []actionEntry
	goToTable        []goToEntry
	stateCount       int
	terminalCount    int
	nonTerminalCount int

	// errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`,
	// the state has an item having the following form. The `α` and `β` can be empty.
	//
	// A → α・error β
	errorTrapperStates []int

	InitialState stateNum
}

// getAction decodes the action cell for (state, terminal sym).
func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) {
	pos := state.Int()*t.terminalCount + sym.Int()
	return t.actionTable[pos].describe()
}

// getGoTo decodes the goto cell for (state, non-terminal sym).
func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) {
	pos := state.Int()*t.nonTerminalCount + sym.Int()
	return t.goToTable[pos].describe()
}

// readAction returns the raw action cell at (row, col).
func (t *ParsingTable) readAction(row int, col int) actionEntry {
	return t.actionTable[row*t.terminalCount+col]
}

// writeAction stores act into the action cell at (row, col).
func (t *ParsingTable) writeAction(row int, col int, act actionEntry) {
	t.actionTable[row*t.terminalCount+col] = act
}

// writeGoTo stores a transition to nextState into the goto cell for
// (state, non-terminal sym).
func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) {
	pos := state.Int()*t.nonTerminalCount + sym.Num().Int()
	t.goToTable[pos] = newGoToEntry(nextState)
}

// lrTableBuilder turns an LALR(1) automaton into a ParsingTable, recording
// every conflict it resolves along the way.
type lrTableBuilder struct {
	automaton    *lr0Automaton
	prods        *productionSet
	termCount    int
	nonTermCount int
	symTab       *symbol.SymbolTableReader
	precAndAssoc *precAndAssoc

	conflicts []conflict
}

// build walks every state of the automaton and fills the action and goto
// tables: shifts/gotos from the state's transitions, reduces from the
// look-ahead sets of its reducible items.
func (b *lrTableBuilder) build() (*ParsingTable, error) {
	var ptab *ParsingTable
	{
		initialState := b.automaton.states[b.automaton.initialState]
		ptab = &ParsingTable{
			actionTable:        make([]actionEntry, len(b.automaton.states)*b.termCount),
			goToTable:          make([]goToEntry, len(b.automaton.states)*b.nonTermCount),
			stateCount:         len(b.automaton.states),
			terminalCount:      b.termCount,
			nonTerminalCount:   b.nonTermCount,
			errorTrapperStates: make([]int, len(b.automaton.states)),
			InitialState:       initialState.num,
		}
	}

	for _, state := range b.automaton.states {
		if state.isErrorTrapper {
			ptab.errorTrapperStates[state.num] = 1
		}

		// Transitions: terminals become shift actions, non-terminals gotos.
		for sym, kID := range state.next {
			nextState := b.automaton.states[kID]
			if sym.IsTerminal() {
				b.writeShiftAction(ptab, state.num, sym, nextState.num)
			} else {
				ptab.writeGoTo(state.num, sym, nextState.num)
			}
		}

		// Reductions: find the item carrying the look-ahead set for each
		// reducible production — in the kernel first, then among ε-items.
		for prodID := range state.reducible {
			reducibleProd, ok := b.prods.findByID(prodID)
			if !ok {
				return nil, fmt.Errorf("reducible production not found: %v", prodID)
			}

			var reducibleItem *lrItem
			for _, item := range state.items {
				if item.prod != reducibleProd.id {
					continue
				}

				reducibleItem = item
				break
			}
			if reducibleItem == nil {
				for _, item := range state.emptyProdItems {
					if item.prod != reducibleProd.id {
						continue
					}

					reducibleItem = item
					break
				}
				if reducibleItem == nil {
					return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num)
				}
			}

			for a := range reducibleItem.lookAhead.symbols {
				b.writeReduceAction(ptab, state.num, a, reducibleProd.num)
			}
		}
	}

	return ptab, nil
}

// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred,
// we prioritize the shift action.
func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) {
	act := tab.readAction(state.Int(), sym.Num().Int())
	if !act.isEmpty() {
		ty, _, p := act.describe()
		if ty == ActionTypeReduce {
			// Resolve the shift/reduce conflict by precedence/associativity
			// and record it for reporting.
			act, method := b.resolveSRConflict(sym.Num(), p)
			b.conflicts = append(b.conflicts, &shiftReduceConflict{
				state:      state,
				sym:        sym,
				nextState:  nextState,
				prodNum:    p,
				resolvedBy: method,
			})
			if act == ActionTypeShift {
				tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
			}
			return
		}
	}
	tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
}

// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred,
// we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces
// the production with higher priority. Productions defined earlier in the grammar file have a higher priority.
func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) {
	act := tab.readAction(state.Int(), sym.Num().Int())
	if !act.isEmpty() {
		ty, s, p := act.describe()
		switch ty {
		case ActionTypeReduce:
			if p == prod {
				return
			}

			// Reduce/reduce: keep the production with the smaller number
			// (defined earlier in the grammar file).
			b.conflicts = append(b.conflicts, &reduceReduceConflict{
				state:      state,
				sym:        sym,
				prodNum1:   p,
				prodNum2:   prod,
				resolvedBy: ResolvedByProdOrder,
			})
			if p < prod {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p))
			} else {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
			}
		case ActionTypeShift:
			act, method := b.resolveSRConflict(sym.Num(), prod)
			b.conflicts = append(b.conflicts, &shiftReduceConflict{
				state:      state,
				sym:        sym,
				nextState:  s,
				prodNum:    prod,
				resolvedBy: method,
			})
			if act == ActionTypeReduce {
				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
			}
		}
		return
	}
	tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
}
// resolveSRConflict decides a shift/reduce conflict between the terminal sym
// and the production prod using precedence and associativity. When either
// side has no precedence defined the default is to shift; equal precedence
// falls back to associativity (left → reduce, otherwise shift); otherwise
// the side with the stronger precedence wins. NOTE(review): the comparison
// `symPrec < prodPrec` → shift implies a smaller value means higher
// precedence here — confirm against precAndAssoc's encoding.
func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) {
	symPrec := b.precAndAssoc.terminalPrecedence(sym)
	prodPrec := b.precAndAssoc.productionPredence(prod)
	if symPrec == 0 || prodPrec == 0 {
		return ActionTypeShift, ResolvedByShift
	}
	if symPrec == prodPrec {
		assoc := b.precAndAssoc.productionAssociativity(prod)
		if assoc != assocTypeLeft {
			return ActionTypeShift, ResolvedByAssoc
		}
		return ActionTypeReduce, ResolvedByAssoc
	}
	if symPrec < prodPrec {
		return ActionTypeShift, ResolvedByPrec
	}
	return ActionTypeReduce, ResolvedByPrec
}

// genReport renders the finished parsing table, the symbol tables, and all
// recorded conflicts into a spec.Report for human consumption.
func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) {
	// Terminals, indexed by symbol number (index 0 is reserved, hence +1).
	var terms []*spec.Terminal
	{
		termSyms := b.symTab.TerminalSymbols()
		terms = make([]*spec.Terminal, len(termSyms)+1)

		for _, sym := range termSyms {
			name, ok := b.symTab.ToText(sym)
			if !ok {
				return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym)
			}

			term := &spec.Terminal{
				Number: sym.Num().Int(),
				Name:   name,
			}

			prec := b.precAndAssoc.terminalPrecedence(sym.Num())
			if prec != precNil {
				term.Precedence = prec
			}

			assoc := b.precAndAssoc.terminalAssociativity(sym.Num())
			switch assoc {
			case assocTypeLeft:
				term.Associativity = "l"
			case assocTypeRight:
				term.Associativity = "r"
			}

			terms[sym.Num()] = term
		}
	}

	// Non-terminals, indexed the same way.
	var nonTerms []*spec.NonTerminal
	{
		nonTermSyms := b.symTab.NonTerminalSymbols()
		nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1)
		for _, sym := range nonTermSyms {
			name, ok := b.symTab.ToText(sym)
			if !ok {
				return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym)
			}

			nonTerms[sym.Num()] = &spec.NonTerminal{
				Number: sym.Num().Int(),
				Name:   name,
			}
		}
	}

	// Productions: RHS terminals are encoded positive, non-terminals negative.
	var prods []*spec.Production
	{
		ps := gram.productionSet.getAllProductions()
		prods = make([]*spec.Production, len(ps)+1)
		for _, p := range ps {
			rhs := make([]int, len(p.rhs))
			for i, e := range p.rhs {
				if e.IsTerminal() {
					rhs[i] = e.Num().Int()
				} else {
					rhs[i] = e.Num().Int() * -1
				}
			}

			prod := &spec.Production{
				Number: p.num.Int(),
				LHS:    p.lhs.Num().Int(),
				RHS:    rhs,
			}

			prec := b.precAndAssoc.productionPredence(p.num)
			if prec != precNil {
				prod.Precedence = prec
			}

			assoc := b.precAndAssoc.productionAssociativity(p.num)
			switch assoc {
			case assocTypeLeft:
				prod.Associativity = "l"
			case assocTypeRight:
				prod.Associativity = "r"
			}

			prods[p.num.Int()] = prod
		}
	}

	// States: kernel items, shift/reduce/goto entries, and the conflicts
	// recorded for each state, all sorted for deterministic output.
	var states []*spec.State
	{
		srConflicts := map[stateNum][]*shiftReduceConflict{}
		rrConflicts := map[stateNum][]*reduceReduceConflict{}
		for _, con := range b.conflicts {
			switch c := con.(type) {
			case *shiftReduceConflict:
				srConflicts[c.state] = append(srConflicts[c.state], c)
			case *reduceReduceConflict:
				rrConflicts[c.state] = append(rrConflicts[c.state], c)
			}
		}

		states = make([]*spec.State, len(b.automaton.states))
		for _, s := range b.automaton.states {
			kernel := make([]*spec.Item, len(s.items))
			for i, item := range s.items {
				p, ok := b.prods.findByID(item.prod)
				if !ok {
					return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod)
				}

				kernel[i] = &spec.Item{
					Production: p.num.Int(),
					Dot:        item.dot,
				}
			}

			sort.Slice(kernel, func(i, j int) bool {
				if kernel[i].Production < kernel[j].Production {
					return true
				}
				if kernel[i].Production > kernel[j].Production {
					return false
				}
				return kernel[i].Dot < kernel[j].Dot
			})

			var shift []*spec.Transition
			var reduce []*spec.Reduce
			var goTo []*spec.Transition
			{
				// Group reduce entries per production, accumulating their
				// look-ahead terminals.
			TERMINALS_LOOP:
				for _, t := range b.symTab.TerminalSymbols() {
					act, next, prod := tab.getAction(s.num, t.Num())
					switch act {
					case ActionTypeShift:
						shift = append(shift, &spec.Transition{
							Symbol: t.Num().Int(),
							State:  next.Int(),
						})
					case ActionTypeReduce:
						for _, r := range reduce {
							if r.Production == prod.Int() {
								r.LookAhead = append(r.LookAhead, t.Num().Int())
								continue TERMINALS_LOOP
							}
						}
						reduce = append(reduce, &spec.Reduce{
							LookAhead:  []int{t.Num().Int()},
							Production: prod.Int(),
						})
					}
				}

				for _, n := range b.symTab.NonTerminalSymbols() {
					ty, next := tab.getGoTo(s.num, n.Num())
					if ty == GoToTypeRegistered {
						goTo = append(goTo, &spec.Transition{
							Symbol: n.Num().Int(),
							State:  next.Int(),
						})
					}
				}

				sort.Slice(shift, func(i, j int) bool {
					return shift[i].State < shift[j].State
				})
				sort.Slice(reduce, func(i, j int) bool {
					return reduce[i].Production < reduce[j].Production
				})
				sort.Slice(goTo, func(i, j int) bool {
					return goTo[i].State < goTo[j].State
				})
			}

			sr := []*spec.SRConflict{}
			rr := []*spec.RRConflict{}
			{
				for _, c := range srConflicts[s.num] {
					conflict := &spec.SRConflict{
						Symbol:     c.sym.Num().Int(),
						State:      c.nextState.Int(),
						Production: c.prodNum.Int(),
						ResolvedBy: c.resolvedBy.Int(),
					}

					// The adopted side is read back from the final table.
					ty, s, p := tab.getAction(s.num, c.sym.Num())
					switch ty {
					case ActionTypeShift:
						n := s.Int()
						conflict.AdoptedState = &n
					case ActionTypeReduce:
						n := p.Int()
						conflict.AdoptedProduction = &n
					}

					sr = append(sr, conflict)
				}

				sort.Slice(sr, func(i, j int) bool {
					return sr[i].Symbol < sr[j].Symbol
				})

				for _, c := range rrConflicts[s.num] {
					conflict := &spec.RRConflict{
						Symbol:      c.sym.Num().Int(),
						Production1: c.prodNum1.Int(),
						Production2: c.prodNum2.Int(),
						ResolvedBy:  c.resolvedBy.Int(),
					}

					_, _, p := tab.getAction(s.num, c.sym.Num())
					conflict.AdoptedProduction = p.Int()

					rr = append(rr, conflict)
				}

				sort.Slice(rr, func(i, j int) bool {
					return rr[i].Symbol < rr[j].Symbol
				})
			}

			states[s.num.Int()] = &spec.State{
				Number:     s.num.Int(),
				Kernel:     kernel,
				Shift:      shift,
				Reduce:     reduce,
				GoTo:       goTo,
				SRConflict: sr,
				RRConflict: rr,
			}
		}
	}

	return &spec.Report{
		Terminals:    terms,
		NonTerminals: nonTerms,
		Productions:  prods,
		States:       states,
	}, nil
}
SRConflict: sr, + RRConflict: rr, + } + } + } + + return &spec.Report{ + Terminals: terms, + NonTerminals: nonTerms, + Productions: prods, + States: states, + }, nil +} + +type productionID [32]byte + +func (id productionID) String() string { + return hex.EncodeToString(id[:]) +} + +func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { + seq := lhs.Byte() + for _, sym := range rhs { + seq = append(seq, sym.Byte()...) + } + return productionID(sha256.Sum256(seq)) +} + +type productionNum uint16 + +const ( + productionNumNil = productionNum(0) + productionNumStart = productionNum(1) + productionNumMin = productionNum(2) +) + +func (n productionNum) Int() int { + return int(n) +} + +type production struct { + id productionID + num productionNum + lhs symbol.Symbol + rhs []symbol.Symbol + rhsLen int +} + +func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { + if lhs.IsNil() { + return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) + } + for _, sym := range rhs { + if sym.IsNil() { + return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) + } + } + + return &production{ + id: genProductionID(lhs, rhs), + lhs: lhs, + rhs: rhs, + rhsLen: len(rhs), + }, nil +} + +func (p *production) isEmpty() bool { + return p.rhsLen == 0 +} + +type productionSet struct { + lhs2Prods map[symbol.Symbol][]*production + id2Prod map[productionID]*production + num productionNum +} + +func newProductionSet() *productionSet { + return &productionSet{ + lhs2Prods: map[symbol.Symbol][]*production{}, + id2Prod: map[productionID]*production{}, + num: productionNumMin, + } +} + +func (ps *productionSet) append(prod *production) { + if _, ok := ps.id2Prod[prod.id]; ok { + return + } + + if prod.lhs.IsStart() { + prod.num = productionNumStart + } else { + prod.num = ps.num + ps.num++ + } + + if prods, ok := ps.lhs2Prods[prod.lhs]; ok { + ps.lhs2Prods[prod.lhs] = append(prods, prod) + } 
else { + ps.lhs2Prods[prod.lhs] = []*production{prod} + } + ps.id2Prod[prod.id] = prod +} + +func (ps *productionSet) findByID(id productionID) (*production, bool) { + prod, ok := ps.id2Prod[id] + return prod, ok +} + +func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { + if lhs.IsNil() { + return nil, false + } + + prods, ok := ps.lhs2Prods[lhs] + return prods, ok +} + +func (ps *productionSet) getAllProductions() map[productionID]*production { + return ps.id2Prod +} + +var ( + semErrNoGrammarName = errors.New("name is missing") + semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling") + semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol") + semErrUndefinedPrec = errors.New("symbol must has precedence") + semErrUndefinedOrdSym = errors.New("undefined ordered symbol") + semErrUnusedProduction = errors.New("unused production") + semErrUnusedTerminal = errors.New("unused terminal") + semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped") + semErrNoProduction = errors.New("a grammar needs at least one production") + semErrUndefinedSym = errors.New("undefined symbol") + semErrDuplicateProduction = errors.New("duplicate production") + semErrDuplicateTerminal = errors.New("duplicate terminal") + semErrDuplicateFragment = errors.New("duplicate fragment") + semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals") + semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol") + semErrDuplicateLabel = errors.New("a label must be unique in an alternative") + semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols") + semErrDirInvalidName = errors.New("invalid directive name") + semErrDirInvalidParam = errors.New("invalid parameter") + semErrDuplicateDir = errors.New("a directive must 
not be duplicated") + semErrDuplicateElem = errors.New("duplicate element") + semErrAmbiguousElem = errors.New("ambiguous element") + semErrInvalidProdDir = errors.New("invalid production directive") + semErrInvalidAltDir = errors.New("invalid alternative directive") +) diff --git a/src/urubu/grammar/first.go b/src/urubu/grammar/first.go deleted file mode 100644 index 6443bcf..0000000 --- a/src/urubu/grammar/first.go +++ /dev/null @@ -1,148 +0,0 @@ -package grammar - -import ( - "fmt" - - "urubu/grammar/symbol" -) - -type firstEntry struct { - symbols map[symbol.Symbol]struct{} - empty bool -} - -func newFirstEntry() *firstEntry { - return &firstEntry{ - symbols: map[symbol.Symbol]struct{}{}, - empty: false, - } -} - -func (e *firstEntry) add(sym symbol.Symbol) bool { - if _, ok := e.symbols[sym]; ok { - return false - } - e.symbols[sym] = struct{}{} - return true -} - -func (e *firstEntry) addEmpty() bool { - if !e.empty { - e.empty = true - return true - } - return false -} - -func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { - if target == nil { - return false - } - changed := false - for sym := range target.symbols { - added := e.add(sym) - if added { - changed = true - } - } - return changed -} - -type firstSet struct { - set map[symbol.Symbol]*firstEntry -} - -func newFirstSet(prods *productionSet) *firstSet { - fst := &firstSet{ - set: map[symbol.Symbol]*firstEntry{}, - } - for _, prod := range prods.getAllProductions() { - if _, ok := fst.set[prod.lhs]; ok { - continue - } - fst.set[prod.lhs] = newFirstEntry() - } - - return fst -} - -func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { - entry := newFirstEntry() - if prod.rhsLen <= head { - entry.addEmpty() - return entry, nil - } - for _, sym := range prod.rhs[head:] { - if sym.IsTerminal() { - entry.add(sym) - return entry, nil - } - - e := fst.findBySymbol(sym) - if e == nil { - return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym) - } - for s 
:= range e.symbols { - entry.add(s) - } - if !e.empty { - return entry, nil - } - } - entry.addEmpty() - return entry, nil -} - -func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { - return fst.set[sym] -} - -type firstComContext struct { - first *firstSet -} - -func newFirstComContext(prods *productionSet) *firstComContext { - return &firstComContext{ - first: newFirstSet(prods), - } -} - -func genFirstSet(prods *productionSet) (*firstSet, error) { - cc := newFirstComContext(prods) - for { - more := false - for _, prod := range prods.getAllProductions() { - e := cc.first.findBySymbol(prod.lhs) - changed, err := genProdFirstEntry(cc, e, prod) - if err != nil { - return nil, err - } - if changed { - more = true - } - } - if !more { - break - } - } - return cc.first, nil -} - -func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) { - if prod.isEmpty() { - return acc.addEmpty(), nil - } - - for _, sym := range prod.rhs { - if sym.IsTerminal() { - return acc.add(sym), nil - } - - e := cc.first.findBySymbol(sym) - changed := acc.mergeExceptEmpty(e) - if !e.empty { - return changed, nil - } - } - return acc.addEmpty(), nil -} diff --git a/src/urubu/grammar/grammar.go b/src/urubu/grammar/grammar.go deleted file mode 100644 index bfa53c6..0000000 --- a/src/urubu/grammar/grammar.go +++ /dev/null @@ -1,1390 +0,0 @@ -package grammar - -import ( - "fmt" - "io" - "strings" - - verr "urubu/error" - "urubu/grammar/lexical" - "urubu/grammar/symbol" - spec "urubu/spec/grammar" - "urubu/spec/grammar/parser" -) - -type astActionEntry struct { - position int - expansion bool -} - -type assocType string - -const ( - assocTypeNil = assocType("") - assocTypeLeft = assocType("left") - assocTypeRight = assocType("right") -) - -const ( - precNil = 0 - precMin = 1 -) - -// precAndAssoc represents precedence and associativities of terminal symbols and productions. -// We use the priority of the production to resolve shift/reduce conflicts. 
-type precAndAssoc struct { - // termPrec and termAssoc represent the precedence of the terminal symbols. - termPrec map[symbol.SymbolNum]int - termAssoc map[symbol.SymbolNum]assocType - - // prodPrec and prodAssoc represent the precedence and the associativities of the production. - // These values are inherited from the right-most terminal symbols in the RHS of the productions. - prodPrec map[productionNum]int - prodAssoc map[productionNum]assocType -} - -func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { - prec, ok := pa.termPrec[sym] - if !ok { - return precNil - } - - return prec -} - -func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { - assoc, ok := pa.termAssoc[sym] - if !ok { - return assocTypeNil - } - - return assoc -} - -func (pa *precAndAssoc) productionPredence(prod productionNum) int { - prec, ok := pa.prodPrec[prod] - if !ok { - return precNil - } - - return prec -} - -func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType { - assoc, ok := pa.prodAssoc[prod] - if !ok { - return assocTypeNil - } - - return assoc -} - -const reservedSymbolNameError = "error" - -type Grammar struct { - name string - lexSpec *lexical.LexSpec - skipSymbols []symbol.Symbol - productionSet *productionSet - augmentedStartSymbol symbol.Symbol - errorSymbol symbol.Symbol - symbolTable *symbol.SymbolTableReader - astActions map[productionID][]*astActionEntry - precAndAssoc *precAndAssoc - - // recoverProductions is a set of productions having the recover directive. 
- recoverProductions map[productionID]struct{} -} - -type buildConfig struct { - isReportingEnabled bool -} - -type BuildOption func(config *buildConfig) - -func EnableReporting() BuildOption { - return func(config *buildConfig) { - config.isReportingEnabled = true - } -} - -type GrammarBuilder struct { - AST *parser.RootNode - - errs verr.SpecErrors -} - -func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { - gram, err := b.build() - if err != nil { - return nil, nil, err - } - - return compile(gram, opts...) -} - -func (b *GrammarBuilder) build() (*Grammar, error) { - var specName string - { - errOccurred := false - for _, dir := range b.AST.Directives { - if dir.Name != "name" { - continue - } - - if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'name' takes just one ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - - errOccurred = true - break - } - - specName = dir.Parameters[0].ID - break - } - - if specName == "" && !errOccurred { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrNoGrammarName, - }) - } - } - - b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST) - if len(b.errs) > 0 { - return nil, b.errs - } - - symTab, ss, err := b.genSymbolTable(b.AST) - if err != nil { - return nil, err - } - - lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) - if err != nil { - return nil, err - } - - prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) - if err != nil { - return nil, err - } - if prodsAndActs == nil && len(b.errs) > 0 { - return nil, b.errs - } - - pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) - if err != nil { - return nil, err - } - if pa == nil && len(b.errs) > 0 { - return nil, b.errs - } - - syms := findUsedAndUnusedSymbols(b.AST) - if syms == nil && len(b.errs) > 0 { - return 
nil, b.errs - } - - // When a terminal symbol that cannot be reached from the start symbol has the skip directive, - // the compiler treats its terminal as a used symbol, not unused. - { - r := symTab.Reader() - for _, sym := range skip { - s, _ := r.ToText(sym) - if _, ok := syms.unusedTerminals[s]; !ok { - prod := syms.usedTerminals[s] - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrTermCannotBeSkipped, - Detail: s, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - continue - } - - delete(syms.unusedTerminals, s) - } - } - - for sym, prod := range syms.unusedProductions { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUnusedProduction, - Detail: sym, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - for sym, prod := range syms.unusedTerminals { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUnusedTerminal, - Detail: sym, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - if len(b.errs) > 0 { - return nil, b.errs - } - - return &Grammar{ - name: specName, - lexSpec: lexSpec, - skipSymbols: skip, - productionSet: prodsAndActs.prods, - augmentedStartSymbol: prodsAndActs.augStartSym, - errorSymbol: ss.errSym, - symbolTable: symTab.Reader(), - astActions: prodsAndActs.astActs, - recoverProductions: prodsAndActs.recoverProds, - precAndAssoc: pa, - }, nil -} - -type usedAndUnusedSymbols struct { - unusedProductions map[string]*parser.ProductionNode - unusedTerminals map[string]*parser.ProductionNode - usedTerminals map[string]*parser.ProductionNode -} - -func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { - prods := map[string]*parser.ProductionNode{} - lexProds := map[string]*parser.ProductionNode{} - mark := map[string]bool{} - { - for _, p := range root.Productions { - prods[p.LHS] = p - mark[p.LHS] = false - for _, alt := range p.RHS { - for _, e := range alt.Elements { - if e.ID == "" { - continue - } - mark[e.ID] = false - } - } - } - - for _, p := range root.LexProductions { - lexProds[p.LHS] = p - 
mark[p.LHS] = false - } - - start := root.Productions[0] - mark[start.LHS] = true - markUsedSymbols(mark, map[string]bool{}, prods, start) - - // We don't have to check the error symbol because the error symbol doesn't have a production. - delete(mark, reservedSymbolNameError) - } - - usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) - unusedProds := map[string]*parser.ProductionNode{} - unusedTerms := map[string]*parser.ProductionNode{} - for sym, used := range mark { - if p, ok := prods[sym]; ok { - if used { - continue - } - unusedProds[sym] = p - continue - } - if p, ok := lexProds[sym]; ok { - if used { - usedTerms[sym] = p - } else { - unusedTerms[sym] = p - } - continue - } - - // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error - // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can - // ignore it here. - } - - return &usedAndUnusedSymbols{ - usedTerminals: usedTerms, - unusedProductions: unusedProds, - unusedTerminals: unusedTerms, - } -} - -func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { - if marked[prod.LHS] { - return - } - - for _, alt := range prod.RHS { - for _, e := range alt.Elements { - if e.ID == "" { - continue - } - - mark[e.ID] = true - - p, ok := prods[e.ID] - if !ok { - continue - } - - // Remove a production to avoid inifinite recursion. 
- marked[prod.LHS] = true - - markUsedSymbols(mark, marked, prods, p) - } - } -} - -func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { - var ids []string - { - for _, prod := range root.Productions { - ids = append(ids, prod.LHS) - for _, alt := range prod.RHS { - for _, elem := range alt.Elements { - if elem.Label != nil { - ids = append(ids, elem.Label.Name) - } - } - } - } - for _, prod := range root.LexProductions { - ids = append(ids, prod.LHS) - } - for _, dir := range root.Directives { - dirIDs := collectUserDefinedIDsFromDirective(dir) - if len(dirIDs) > 0 { - ids = append(ids, dirIDs...) - } - } - } - - duplicated := lexical.FindSpellingInconsistencies(ids) - if len(duplicated) == 0 { - return - } - - for _, dup := range duplicated { - var s string - { - var b strings.Builder - fmt.Fprintf(&b, "%+v", dup[0]) - for _, id := range dup[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - s = b.String() - } - - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrSpellingInconsistency, - Detail: s, - }) - } -} - -func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { - var ids []string - for _, param := range dir.Parameters { - if param.Group != nil { - for _, d := range param.Group { - dIDs := collectUserDefinedIDsFromDirective(d) - if len(dIDs) > 0 { - ids = append(ids, dIDs...) - } - } - } - if param.OrderedSymbol != "" { - ids = append(ids, param.OrderedSymbol) - } - } - return ids -} - -type symbols struct { - errSym symbol.Symbol - augStartSym symbol.Symbol - startSym symbol.Symbol -} - -func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { - symTab := symbol.NewSymbolTable() - w := symTab.Writer() - r := symTab.Reader() - - // We need to register the reserved symbol before registering others. 
- var errSym symbol.Symbol - { - sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) - if err != nil { - return nil, nil, err - } - errSym = sym - } - - for _, prod := range root.LexProductions { - if sym, exist := r.ToSymbol(prod.LHS); exist { - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateTerminal, - Detail: prod.LHS, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - continue - } - - _, err := w.RegisterTerminalSymbol(prod.LHS) - if err != nil { - return nil, nil, err - } - } - - startProd := root.Productions[0] - augStartText := fmt.Sprintf("%s'", startProd.LHS) - var err error - augStartSym, err := w.RegisterStartSymbol(augStartText) - if err != nil { - return nil, nil, err - } - if augStartSym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: startProd.Pos.Row, - Col: startProd.Pos.Col, - }) - } - - startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) - if err != nil { - return nil, nil, err - } - if startSym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: startProd.Pos.Row, - Col: startProd.Pos.Col, - }) - } - - for _, prod := range root.Productions { - sym, err := w.RegisterNonTerminalSymbol(prod.LHS) - if err != nil { - return nil, nil, err - } - if sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateName, - Detail: prod.LHS, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - } - - return symTab, &symbols{ - errSym: errSym, - augStartSym: augStartSym, - startSym: startSym, - }, nil -} - -func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) 
(*lexical.LexSpec, []symbol.Symbol, error) { - entries := []*lexical.LexEntry{} - skipSyms := []symbol.Symbol{} - for _, prod := range root.LexProductions { - entry, skip, specErr, err := genLexEntry(prod) - if err != nil { - return nil, nil, err - } - if specErr != nil { - b.errs = append(b.errs, specErr) - continue - } - if skip { - sym, _ := symTab.ToSymbol(prod.LHS) - skipSyms = append(skipSyms, sym) - } - entries = append(entries, entry) - } - - checkedFragments := map[string]struct{}{} - for _, fragment := range root.Fragments { - if _, exist := checkedFragments[fragment.LHS]; exist { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateFragment, - Detail: fragment.LHS, - Row: fragment.Pos.Row, - Col: fragment.Pos.Col, - }) - continue - } - checkedFragments[fragment.LHS] = struct{}{} - - entries = append(entries, &lexical.LexEntry{ - Fragment: true, - Kind: spec.LexKindName(fragment.LHS), - Pattern: fragment.RHS, - }) - } - - return &lexical.LexSpec{ - Entries: entries, - }, skipSyms, nil -} - -func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { - alt := prod.RHS[0] - elem := alt.Elements[0] - - var pattern string - if elem.Literally { - pattern = spec.EscapePattern(elem.Pattern) - } else { - pattern = elem.Pattern - } - - var modes []spec.LexModeName - var skip bool - var push spec.LexModeName - var pop bool - dirConsumed := map[string]struct{}{} - for _, dir := range prod.Directives { - if _, consumed := dirConsumed[dir.Name]; consumed { - return nil, false, &verr.SpecError{ - Cause: semErrDuplicateDir, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - dirConsumed[dir.Name] = struct{}{} - - switch dir.Name { - case "mode": - if len(dir.Parameters) == 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'mode' directive needs an ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - for _, param := range dir.Parameters { - if 
param.ID == "" { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'mode' directive needs an ID parameter", - Row: param.Pos.Row, - Col: param.Pos.Col, - }, nil - } - modes = append(modes, spec.LexModeName(param.ID)) - } - case "skip": - if len(dir.Parameters) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'skip' directive needs no parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - skip = true - case "push": - if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'push' directive needs an ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - push = spec.LexModeName(dir.Parameters[0].ID) - case "pop": - if len(dir.Parameters) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'pop' directive needs no parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - pop = true - default: - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - } - - if len(alt.Directives) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrInvalidAltDir, - Detail: "a lexical production cannot have alternative directives", - Row: alt.Directives[0].Pos.Row, - Col: alt.Directives[0].Pos.Col, - }, nil - } - - return &lexical.LexEntry{ - Modes: modes, - Kind: spec.LexKindName(prod.LHS), - Pattern: pattern, - Push: push, - Pop: pop, - }, skip, nil, nil -} - -type productionsAndActions struct { - prods *productionSet - augStartSym symbol.Symbol - astActs map[productionID][]*astActionEntry - prodPrecsTerm map[productionID]symbol.Symbol - prodPrecsOrdSym map[productionID]string - prodPrecPoss map[productionID]*parser.Position - recoverProds map[productionID]struct{} -} - -func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, 
errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { - if len(root.Productions) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrNoProduction, - }) - return nil, nil - } - - prods := newProductionSet() - astActs := map[productionID][]*astActionEntry{} - prodPrecsTerm := map[productionID]symbol.Symbol{} - prodPrecsOrdSym := map[productionID]string{} - prodPrecPoss := map[productionID]*parser.Position{} - recoverProds := map[productionID]struct{}{} - - p, err := newProduction(augStartSym, []symbol.Symbol{ - startSym, - }) - if err != nil { - return nil, err - } - - prods.append(p) - - for _, prod := range root.Productions { - lhsSym, ok := symTab.ToSymbol(prod.LHS) - if !ok { - // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. - return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) - } - - if len(prod.Directives) > 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrInvalidProdDir, - Detail: "a production cannot have production directives", - Row: prod.Directives[0].Pos.Row, - Col: prod.Directives[0].Pos.Col, - }) - continue - } - - LOOP_RHS: - for _, alt := range prod.RHS { - altSyms := make([]symbol.Symbol, len(alt.Elements)) - offsets := map[string]int{} - ambiguousIDOffsets := map[string]struct{}{} - for i, elem := range alt.Elements { - sym, ok := symTab.ToSymbol(elem.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedSym, - Detail: elem.ID, - Row: elem.Pos.Row, - Col: elem.Pos.Col, - }) - continue LOOP_RHS - } - altSyms[i] = sym - - if elem.Label != nil { - if _, added := offsets[elem.Label.Name]; added { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateLabel, - Detail: elem.Label.Name, - Row: elem.Label.Pos.Row, - Col: elem.Label.Pos.Col, - }) - continue LOOP_RHS - } - if _, found := symTab.ToSymbol(elem.Label.Name); found { - b.errs = append(b.errs, &verr.SpecError{ - Cause: 
semErrInvalidLabel, - Detail: elem.Label.Name, - Row: elem.Label.Pos.Row, - Col: elem.Label.Pos.Col, - }) - continue LOOP_RHS - } - offsets[elem.Label.Name] = i - } - // A symbol having a label can be specified by both the label and the symbol name. - // So record the symbol's position, whether or not it has a label. - if elem.ID != "" { - if _, exist := offsets[elem.ID]; exist { - // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need - // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify - // a label to resolve the ambiguity. - delete(offsets, elem.ID) - ambiguousIDOffsets[elem.ID] = struct{}{} - } else { - offsets[elem.ID] = i - } - } - } - - p, err := newProduction(lhsSym, altSyms) - if err != nil { - return nil, err - } - if _, exist := prods.findByID(p.id); exist { - // Report the line number of a duplicate alternative. - // When the alternative is empty, we report the position of its LHS. 
- var row int - var col int - if len(alt.Elements) > 0 { - row = alt.Elements[0].Pos.Row - col = alt.Elements[0].Pos.Col - } else { - row = prod.Pos.Row - col = prod.Pos.Col - } - - var detail string - { - var b strings.Builder - fmt.Fprintf(&b, "%v →", prod.LHS) - for _, elem := range alt.Elements { - switch { - case elem.ID != "": - fmt.Fprintf(&b, " %v", elem.ID) - case elem.Pattern != "": - fmt.Fprintf(&b, ` "%v"`, elem.Pattern) - } - } - if len(alt.Elements) == 0 { - fmt.Fprintf(&b, " ε") - } - - detail = b.String() - } - - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateProduction, - Detail: detail, - Row: row, - Col: col, - }) - continue LOOP_RHS - } - prods.append(p) - - dirConsumed := map[string]struct{}{} - for _, dir := range alt.Directives { - if _, consumed := dirConsumed[dir.Name]; consumed { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateDir, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - } - dirConsumed[dir.Name] = struct{}{} - - switch dir.Name { - case "ast": - if len(dir.Parameters) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'ast' directive needs at least one parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - astAct := make([]*astActionEntry, len(dir.Parameters)) - consumedOffsets := map[int]struct{}{} - for i, param := range dir.Parameters { - if param.ID == "" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'ast' directive can take only ID parameters", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - - if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrAmbiguousElem, - Detail: fmt.Sprintf("'%v' is ambiguous", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - - offset, ok := offsets[param.ID] - if !ok { - b.errs = append(b.errs, &verr.SpecError{ 
- Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - if _, consumed := consumedOffsets[offset]; consumed { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateElem, - Detail: param.ID, - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - consumedOffsets[offset] = struct{}{} - - if param.Expansion { - elem := alt.Elements[offset] - if elem.Pattern != "" { - // Currently, it is a bug to reach here because it is - // forbidden to have anything other than ID appear in - // production rules. - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - elemSym, ok := symTab.ToSymbol(elem.ID) - if !ok { - // If the symbol was not found, it's a bug. 
- return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) - } - if elemSym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - } - - astAct[i] = &astActionEntry{ - position: offset + 1, - expansion: param.Expansion, - } - } - astActs[p.id] = astAct - case "prec": - if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'prec' directive needs just one ID parameter or ordered symbol", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - param := dir.Parameters[0] - switch { - case param.ID != "": - sym, ok := symTab.ToSymbol(param.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - } - if !sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - prodPrecsTerm[p.id] = sym - prodPrecPoss[p.id] = ¶m.Pos - case param.OrderedSymbol != "": - prodPrecsOrdSym[p.id] = param.OrderedSymbol - prodPrecPoss[p.id] = ¶m.Pos - } - case "recover": - if len(dir.Parameters) > 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'recover' directive needs no parameter", 
- Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - recoverProds[p.id] = struct{}{} - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name), - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - } - } - } - - return &productionsAndActions{ - prods: prods, - augStartSym: augStartSym, - astActs: astActs, - prodPrecsTerm: prodPrecsTerm, - prodPrecsOrdSym: prodPrecsOrdSym, - prodPrecPoss: prodPrecPoss, - recoverProds: recoverProds, - }, nil -} - -func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { - termPrec := map[symbol.SymbolNum]int{} - termAssoc := map[symbol.SymbolNum]assocType{} - ordSymPrec := map[string]int{} - { - var precGroup []*parser.DirectiveNode - for _, dir := range b.AST.Directives { - if dir.Name == "prec" { - if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'prec' needs just one directive group", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue - } - precGroup = dir.Parameters[0].Group - continue - } - - if dir.Name != "name" && dir.Name != "prec" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue - } - } - - precN := precMin - for _, dir := range precGroup { - var assocTy assocType - switch dir.Name { - case "left": - assocTy = assocTypeLeft - case "right": - assocTy = assocTypeRight - case "assign": - assocTy = assocTypeNil - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - return nil, nil - } - - if len(dir.Parameters) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: 
semErrDirInvalidParam, - Detail: "associativity needs at least one symbol", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - return nil, nil - } - ASSOC_PARAM_LOOP: - for _, p := range dir.Parameters { - switch { - case p.ID != "": - sym, ok := symTab.ToSymbol(p.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' is undefined", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if !sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if prec, alreadySet := termPrec[sym.Num()]; alreadySet { - if prec == precN { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } - break ASSOC_PARAM_LOOP - } - - termPrec[sym.Num()] = precN - termAssoc[sym.Num()] = assocTy - case p.OrderedSymbol != "": - if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { - if prec == precN { - b.errs = append(b.errs, &verr.SpecError{ - Cause: 
semErrDuplicateAssoc, - Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } - break ASSOC_PARAM_LOOP - } - - ordSymPrec[p.OrderedSymbol] = precN - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "a parameter must be an ID or an ordered symbol", - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - } - - precN++ - } - } - if len(b.errs) > 0 { - return nil, nil - } - - prodPrec := map[productionNum]int{} - prodAssoc := map[productionNum]assocType{} - for _, prod := range prodsAndActs.prods.getAllProductions() { - // A #prec directive changes only precedence, not associativity. - if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { - if prec, ok := termPrec[term.Num()]; ok { - prodPrec[prod.num] = prec - prodAssoc[prod.num] = assocTypeNil - } else { - text, _ := symTab.ToText(term) - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedPrec, - Detail: text, - Row: prodsAndActs.prodPrecPoss[prod.id].Row, - Col: prodsAndActs.prodPrecPoss[prod.id].Col, - }) - } - } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok { - if prec, ok := ordSymPrec[ordSym]; ok { - prodPrec[prod.num] = prec - prodAssoc[prod.num] = assocTypeNil - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedOrdSym, - Detail: fmt.Sprintf("$%v", ordSym), - Row: prodsAndActs.prodPrecPoss[prod.id].Row, - Col: prodsAndActs.prodPrecPoss[prod.id].Col, - }) - } - } else { - // A production inherits precedence and associativity from the right-most terminal symbol. 
- mostrightTerm := symbol.SymbolNil - for _, sym := range prod.rhs { - if !sym.IsTerminal() { - continue - } - mostrightTerm = sym - } - if !mostrightTerm.IsNil() { - prodPrec[prod.num] = termPrec[mostrightTerm.Num()] - prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] - } - } - } - if len(b.errs) > 0 { - return nil, nil - } - - return &precAndAssoc{ - termPrec: termPrec, - termAssoc: termAssoc, - prodPrec: prodPrec, - prodAssoc: prodAssoc, - }, nil -} - -func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { - config := &buildConfig{} - for _, opt := range opts { - opt(config) - } - - lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) - if err != nil { - if len(cErrs) > 0 { - var b strings.Builder - writeCompileError(&b, cErrs[0]) - for _, cerr := range cErrs[1:] { - fmt.Fprintf(&b, "\n") - writeCompileError(&b, cerr) - } - return nil, nil, fmt.Errorf(b.String()) - } - return nil, nil, err - } - - kind2Term := make([]int, len(lexSpec.KindNames)) - for i, k := range lexSpec.KindNames { - if k == spec.LexKindNameNil { - kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() - continue - } - - sym, ok := gram.symbolTable.ToSymbol(k.String()) - if !ok { - return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) - } - kind2Term[i] = sym.Num().Int() - } - - termTexts, err := gram.symbolTable.TerminalTexts() - if err != nil { - return nil, nil, err - } - - var termSkip []int - { - r := gram.symbolTable.Reader() - // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), - // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts - // because it does not contain a predefined symbol, like EOF. - // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. 
- termSkip = make([]int, len(termTexts)) - for _, t := range termTexts { - s, _ := r.ToSymbol(t) - for _, sk := range gram.skipSymbols { - if s != sk { - continue - } - termSkip[s.Num()] = 1 - break - } - } - } - - nonTerms, err := gram.symbolTable.NonTerminalTexts() - if err != nil { - return nil, nil, err - } - - firstSet, err := genFirstSet(gram.productionSet) - if err != nil { - return nil, nil, err - } - - lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol) - if err != nil { - return nil, nil, err - } - - var tab *ParsingTable - var report *spec.Report - { - lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet) - if err != nil { - return nil, nil, err - } - - b := &lrTableBuilder{ - automaton: lalr1.lr0Automaton, - prods: gram.productionSet, - termCount: len(termTexts), - nonTermCount: len(nonTerms), - symTab: gram.symbolTable, - precAndAssoc: gram.precAndAssoc, - } - tab, err = b.build() - if err != nil { - return nil, nil, err - } - - if config.isReportingEnabled { - report, err = b.genReport(tab, gram) - if err != nil { - return nil, nil, err - } - } - } - - action := make([]int, len(tab.actionTable)) - for i, e := range tab.actionTable { - action[i] = int(e) - } - goTo := make([]int, len(tab.goToTable)) - for i, e := range tab.goToTable { - goTo[i] = int(e) - } - - lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1) - altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1) - recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) - astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) - for _, p := range gram.productionSet.getAllProductions() { - lhsSyms[p.num] = p.lhs.Num().Int() - altSymCounts[p.num] = p.rhsLen - - if _, ok := gram.recoverProductions[p.id]; ok { - recoverProds[p.num] = 1 - } - - astAct, ok := gram.astActions[p.id] - if !ok { - continue - } - astActEntry := make([]int, len(astAct)) - for i, e := range astAct 
{ - if e.expansion { - astActEntry[i] = e.position * -1 - } else { - astActEntry[i] = e.position - } - } - astActEnties[p.num] = astActEntry - } - - return &spec.CompiledGrammar{ - Name: gram.name, - Lexical: lexSpec, - Syntactic: &spec.SyntacticSpec{ - Action: action, - GoTo: goTo, - StateCount: tab.stateCount, - InitialState: tab.InitialState.Int(), - StartProduction: productionNumStart.Int(), - LHSSymbols: lhsSyms, - AlternativeSymbolCounts: altSymCounts, - Terminals: termTexts, - TerminalCount: tab.terminalCount, - TerminalSkip: termSkip, - KindToTerminal: kind2Term, - NonTerminals: nonTerms, - NonTerminalCount: tab.nonTerminalCount, - EOFSymbol: symbol.SymbolEOF.Num().Int(), - ErrorSymbol: gram.errorSymbol.Num().Int(), - ErrorTrapperStates: tab.errorTrapperStates, - RecoverProductions: recoverProds, - }, - ASTAction: &spec.ASTAction{ - Entries: astActEnties, - }, - }, report, nil -} - -func writeCompileError(w io.Writer, cErr *lexical.CompileError) { - if cErr.Fragment { - fmt.Fprintf(w, "fragment ") - } - fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause) - if cErr.Detail != "" { - fmt.Fprintf(w, ": %v", cErr.Detail) - } -} diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go deleted file mode 100644 index 6c5fe42..0000000 --- a/src/urubu/grammar/item.go +++ /dev/null @@ -1,206 +0,0 @@ -package grammar - -import ( - "crypto/sha256" - "encoding/binary" - "fmt" - "sort" - "strconv" - - "urubu/grammar/symbol" -) - -type lrItemID [32]byte - -func (id lrItemID) String() string { - return fmt.Sprintf("%x", id.num()) -} - -func (id lrItemID) num() uint32 { - return binary.LittleEndian.Uint32(id[:]) -} - -type lookAhead struct { - symbols map[symbol.Symbol]struct{} - - // When propagation is true, an item propagates look-ahead symbols to other items. 
- propagation bool -} - -type lrItem struct { - id lrItemID - prod productionID - - // E → E + T - // - // Dot | Dotted Symbol | Item - // ----+---------------+------------ - // 0 | E | E →・E + T - // 1 | + | E → E・+ T - // 2 | T | E → E +・T - // 3 | Nil | E → E + T・ - dot int - dottedSymbol symbol.Symbol - - // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. - // It looks like S' →・S. - initial bool - - // When reducible is true, the item looks like E → E + T・. - reducible bool - - // When kernel is true, the item is kernel item. - kernel bool - - // lookAhead stores look-ahead symbols, and they are terminal symbols. - // The item is reducible only when the look-ahead symbols appear as the next input symbol. - lookAhead lookAhead -} - -func newLR0Item(prod *production, dot int) (*lrItem, error) { - if prod == nil { - return nil, fmt.Errorf("production must be non-nil") - } - - if dot < 0 || dot > prod.rhsLen { - return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) - } - - var id lrItemID - { - b := []byte{} - b = append(b, prod.id[:]...) - bDot := make([]byte, 8) - binary.LittleEndian.PutUint64(bDot, uint64(dot)) - b = append(b, bDot...) 
- id = sha256.Sum256(b) - } - - dottedSymbol := symbol.SymbolNil - if dot < prod.rhsLen { - dottedSymbol = prod.rhs[dot] - } - - initial := false - if prod.lhs.IsStart() && dot == 0 { - initial = true - } - - reducible := false - if dot == prod.rhsLen { - reducible = true - } - - kernel := false - if initial || dot > 0 { - kernel = true - } - - item := &lrItem{ - id: id, - prod: prod.id, - dot: dot, - dottedSymbol: dottedSymbol, - initial: initial, - reducible: reducible, - kernel: kernel, - } - - return item, nil -} - -type kernelID [32]byte - -func (id kernelID) String() string { - return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) -} - -type kernel struct { - id kernelID - items []*lrItem -} - -func newKernel(items []*lrItem) (*kernel, error) { - if len(items) == 0 { - return nil, fmt.Errorf("a kernel need at least one item") - } - - // Remove duplicates from items. - var sortedItems []*lrItem - { - m := map[lrItemID]*lrItem{} - for _, item := range items { - if !item.kernel { - return nil, fmt.Errorf("not a kernel item: %v", item) - } - m[item.id] = item - } - sortedItems = []*lrItem{} - for _, item := range m { - sortedItems = append(sortedItems, item) - } - sort.Slice(sortedItems, func(i, j int) bool { - return sortedItems[i].id.num() < sortedItems[j].id.num() - }) - } - - var id kernelID - { - b := []byte{} - for _, item := range sortedItems { - b = append(b, item.id[:]...) - } - id = sha256.Sum256(b) - } - - return &kernel{ - id: id, - items: sortedItems, - }, nil -} - -type stateNum int - -const stateNumInitial = stateNum(0) - -func (n stateNum) Int() int { - return int(n) -} - -func (n stateNum) String() string { - return strconv.Itoa(int(n)) -} - -func (n stateNum) next() stateNum { - return stateNum(n + 1) -} - -type lrState struct { - *kernel - num stateNum - next map[symbol.Symbol]kernelID - reducible map[productionID]struct{} - - // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. 
- // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store - // look-ahead symbols because the kernel items don't include these items. - // - // For instance, we have the following productions, and A is a terminal symbol. - // - // s' → s - // s → A | ε - // - // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't - // include `s → ・ε`. - // - // s' → ・s - // s → ・A - // s → ・ε - emptyProdItems []*lrItem - - // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. - // The `α` and `β` can be empty. - // - // A → α・error β - isErrorTrapper bool -} diff --git a/src/urubu/grammar/lalr1.go b/src/urubu/grammar/lalr1.go deleted file mode 100644 index 8373568..0000000 --- a/src/urubu/grammar/lalr1.go +++ /dev/null @@ -1,318 +0,0 @@ -package grammar - -import ( - "fmt" - - "urubu/grammar/symbol" -) - -type stateAndLRItem struct { - kernelID kernelID - itemID lrItemID -} - -type propagation struct { - src *stateAndLRItem - dest []*stateAndLRItem -} - -type lalr1Automaton struct { - *lr0Automaton -} - -func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) { - // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $] - iniState := lr0.states[lr0.initialState] - iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{ - symbol.SymbolEOF: {}, - } - - var props []*propagation - for _, state := range lr0.states { - for _, kItem := range state.items { - items, err := genLALR1Closure(kItem, prods, first) - if err != nil { - return nil, err - } - - kItem.lookAhead.propagation = true - - var propDests []*stateAndLRItem - for _, item := range items { - if item.reducible { - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - - if p.isEmpty() { - var reducibleItem *lrItem - for _, it := range state.emptyProdItems { - if it.id != 
item.id { - continue - } - - reducibleItem = it - break - } - if reducibleItem == nil { - return nil, fmt.Errorf("reducible item not found: %v", item.id) - } - if reducibleItem.lookAhead.symbols == nil { - reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - for a := range item.lookAhead.symbols { - reducibleItem.lookAhead.symbols[a] = struct{}{} - } - - propDests = append(propDests, &stateAndLRItem{ - kernelID: state.id, - itemID: item.id, - }) - } - - continue - } - - nextKID := state.next[item.dottedSymbol] - var nextItemID lrItemID - { - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - it, err := newLR0Item(p, item.dot+1) - if err != nil { - return nil, fmt.Errorf("failed to generate an item ID: %v", err) - } - nextItemID = it.id - } - - if item.lookAhead.propagation { - propDests = append(propDests, &stateAndLRItem{ - kernelID: nextKID, - itemID: nextItemID, - }) - } else { - nextState := lr0.states[nextKID] - var nextItem *lrItem - for _, it := range nextState.items { - if it.id != nextItemID { - continue - } - nextItem = it - break - } - if nextItem == nil { - return nil, fmt.Errorf("item not found: %v", nextItemID) - } - - if nextItem.lookAhead.symbols == nil { - nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - - for a := range item.lookAhead.symbols { - nextItem.lookAhead.symbols[a] = struct{}{} - } - } - } - if len(propDests) == 0 { - continue - } - - props = append(props, &propagation{ - src: &stateAndLRItem{ - kernelID: state.id, - itemID: kItem.id, - }, - dest: propDests, - }) - } - } - - err := propagateLookAhead(lr0, props) - if err != nil { - return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err) - } - - return &lalr1Automaton{ - lr0Automaton: lr0, - }, nil -} - -func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) { - items := []*lrItem{} - knownItems := 
map[lrItemID]map[symbol.Symbol]struct{}{} - knownItemsProp := map[lrItemID]struct{}{} - uncheckedItems := []*lrItem{} - items = append(items, srcItem) - uncheckedItems = append(uncheckedItems, srcItem) - for len(uncheckedItems) > 0 { - nextUncheckedItems := []*lrItem{} - for _, item := range uncheckedItems { - if item.dottedSymbol.IsTerminal() { - continue - } - - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - - var fstSyms []symbol.Symbol - var isFstNullable bool - { - fst, err := first.find(p, item.dot+1) - if err != nil { - return nil, err - } - - fstSyms = make([]symbol.Symbol, len(fst.symbols)) - i := 0 - for s := range fst.symbols { - fstSyms[i] = s - i++ - } - if fst.empty { - isFstNullable = true - } - } - - ps, _ := prods.findByLHS(item.dottedSymbol) - for _, prod := range ps { - var lookAhead []symbol.Symbol - { - var lookAheadCount int - if isFstNullable { - lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols) - } else { - lookAheadCount = len(fstSyms) - } - - lookAhead = make([]symbol.Symbol, lookAheadCount) - i := 0 - for _, s := range fstSyms { - lookAhead[i] = s - i++ - } - if isFstNullable { - for a := range item.lookAhead.symbols { - lookAhead[i] = a - i++ - } - } - } - - for _, a := range lookAhead { - newItem, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if items, exist := knownItems[newItem.id]; exist { - if _, exist := items[a]; exist { - continue - } - } - - newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{ - a: {}, - } - - items = append(items, newItem) - if knownItems[newItem.id] == nil { - knownItems[newItem.id] = map[symbol.Symbol]struct{}{} - } - knownItems[newItem.id][a] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, newItem) - } - - if isFstNullable { - newItem, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if _, exist := knownItemsProp[newItem.id]; exist { - continue - } - - 
newItem.lookAhead.propagation = true - - items = append(items, newItem) - knownItemsProp[newItem.id] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, newItem) - } - } - } - uncheckedItems = nextUncheckedItems - } - - return items, nil -} - -func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error { - for { - changed := false - for _, prop := range props { - srcState, ok := lr0.states[prop.src.kernelID] - if !ok { - return fmt.Errorf("source state not found: %v", prop.src.kernelID) - } - var srcItem *lrItem - for _, item := range srcState.items { - if item.id != prop.src.itemID { - continue - } - srcItem = item - break - } - if srcItem == nil { - return fmt.Errorf("source item not found: %v", prop.src.itemID) - } - - for _, dest := range prop.dest { - destState, ok := lr0.states[dest.kernelID] - if !ok { - return fmt.Errorf("destination state not found: %v", dest.kernelID) - } - var destItem *lrItem - for _, item := range destState.items { - if item.id != dest.itemID { - continue - } - destItem = item - break - } - if destItem == nil { - for _, item := range destState.emptyProdItems { - if item.id != dest.itemID { - continue - } - destItem = item - break - } - if destItem == nil { - return fmt.Errorf("destination item not found: %v", dest.itemID) - } - } - - for a := range srcItem.lookAhead.symbols { - if _, ok := destItem.lookAhead.symbols[a]; ok { - continue - } - - if destItem.lookAhead.symbols == nil { - destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - - destItem.lookAhead.symbols[a] = struct{}{} - changed = true - } - } - } - if !changed { - break - } - } - - return nil -} diff --git a/src/urubu/grammar/lexical/compiler.go b/src/urubu/grammar/lexical.go index 637018a..515e491 100644 --- a/src/urubu/grammar/lexical/compiler.go +++ b/src/urubu/grammar/lexical.go @@ -3,6 +3,8 @@ package lexical import ( "bytes" "fmt" + "sort" + "strings" "urubu/compressor" "urubu/grammar/lexical/dfa" @@ -411,3 +413,165 @@ func 
convertIntSliceToStateIDSlice(s []int) []spec.StateID { } return ss } + +type LexEntry struct { + Kind spec.LexKindName + Pattern string + Modes []spec.LexModeName + Push spec.LexModeName + Pop bool + Fragment bool +} + +type LexSpec struct { + Entries []*LexEntry +} + +func (s *LexSpec) Validate() error { + if len(s.Entries) <= 0 { + return fmt.Errorf("the lexical specification must have at least one entry") + } + { + ks := map[string]struct{}{} + fks := map[string]struct{}{} + for _, e := range s.Entries { + // Allow duplicate names between fragments and non-fragments. + if e.Fragment { + if _, exist := fks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + fks[e.Kind.String()] = struct{}{} + } else { + if _, exist := ks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + ks[e.Kind.String()] = struct{}{} + } + } + } + { + kinds := []string{} + modes := []string{ + spec.LexModeNameDefault.String(), // This is a predefined mode. + } + for _, e := range s.Entries { + if e.Fragment { + continue + } + + kinds = append(kinds, e.Kind.String()) + + for _, m := range e.Modes { + modes = append(modes, m.String()) + } + } + + kindErrs := findSpellingInconsistenciesErrors(kinds, nil) + modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { + if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { + var b strings.Builder + fmt.Fprintf(&b, "%+v", ids[0]) + for _, id := range ids[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) + } + return nil + }) + errs := append(kindErrs, modeErrs...) 
+ if len(errs) > 0 { + var b strings.Builder + fmt.Fprintf(&b, "%v", errs[0]) + for _, err := range errs[1:] { + fmt.Fprintf(&b, "\n%v", err) + } + return fmt.Errorf(b.String()) + } + } + + return nil +} + +func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { + duplicated := FindSpellingInconsistencies(ids) + if len(duplicated) == 0 { + return nil + } + + var errs []error + for _, dup := range duplicated { + if hook != nil { + err := hook(dup) + if err != nil { + errs = append(errs, err) + continue + } + } + + var b strings.Builder + fmt.Fprintf(&b, "%+v", dup[0]) + for _, id := range dup[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) + errs = append(errs, err) + } + + return errs +} + +// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same +// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same +// in UpperCamelCase. Thus they are considere to be spelling inconsistency. 
+func FindSpellingInconsistencies(ids []string) [][]string { + m := map[string][]string{} + for _, id := range removeDuplicates(ids) { + c := SnakeCaseToUpperCamelCase(id) + m[c] = append(m[c], id) + } + + var duplicated [][]string + for _, camels := range m { + if len(camels) == 1 { + continue + } + duplicated = append(duplicated, camels) + } + + for _, dup := range duplicated { + sort.Slice(dup, func(i, j int) bool { + return dup[i] < dup[j] + }) + } + sort.Slice(duplicated, func(i, j int) bool { + return duplicated[i][0] < duplicated[j][0] + }) + + return duplicated +} + +func removeDuplicates(s []string) []string { + m := map[string]struct{}{} + for _, v := range s { + m[v] = struct{}{} + } + + var unique []string + for v := range m { + unique = append(unique, v) + } + + return unique +} + +func SnakeCaseToUpperCamelCase(snake string) string { + elems := strings.Split(snake, "_") + for i, e := range elems { + if len(e) == 0 { + continue + } + elems[i] = strings.ToUpper(string(e[0])) + e[1:] + } + + return strings.Join(elems, "") +} diff --git a/src/urubu/grammar/lexical/dfa/tree.go b/src/urubu/grammar/lexical/dfa.go index 8a11aee..982420d 100644 --- a/src/urubu/grammar/lexical/dfa/tree.go +++ b/src/urubu/grammar/lexical/dfa.go @@ -1,15 +1,358 @@ package dfa import ( + "encoding/binary" "fmt" "io" "sort" + "strings" "urubu/grammar/lexical/parser" spec "urubu/spec/grammar" "urubu/utf8" ) +type symbolTable struct { + symPos2Byte map[symbolPosition]byteRange + endPos2ID map[symbolPosition]spec.LexModeKindID +} + +func genSymbolTable(root byteTree) *symbolTable { + symTab := &symbolTable{ + symPos2Byte: map[symbolPosition]byteRange{}, + endPos2ID: map[symbolPosition]spec.LexModeKindID{}, + } + return genSymTab(symTab, root) +} + +func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { + if node == nil { + return symTab + } + + switch n := node.(type) { + case *symbolNode: + symTab.symPos2Byte[n.pos] = byteRange{ + from: n.from, + to: n.to, + } + case 
*endMarkerNode: + symTab.endPos2ID[n.pos] = n.id + default: + left, right := node.children() + genSymTab(symTab, left) + genSymTab(symTab, right) + } + return symTab +} + +type DFA struct { + States []string + InitialState string + AcceptingStatesTable map[string]spec.LexModeKindID + TransitionTable map[string][256]string +} + +func GenDFA(root byteTree, symTab *symbolTable) *DFA { + initialState := root.first() + initialStateHash := initialState.hash() + stateMap := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + tranTab := map[string][256]string{} + { + follow := genFollowTable(root) + unmarkedStates := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + for len(unmarkedStates) > 0 { + nextUnmarkedStates := map[string]*symbolPositionSet{} + for hash, state := range unmarkedStates { + tranTabOfState := [256]*symbolPositionSet{} + for _, pos := range state.set() { + if pos.isEndMark() { + continue + } + valRange := symTab.symPos2Byte[pos] + for symVal := valRange.from; symVal <= valRange.to; symVal++ { + if tranTabOfState[symVal] == nil { + tranTabOfState[symVal] = newSymbolPositionSet() + } + tranTabOfState[symVal].merge(follow[pos]) + } + } + for _, t := range tranTabOfState { + if t == nil { + continue + } + h := t.hash() + if _, ok := stateMap[h]; ok { + continue + } + stateMap[h] = t + nextUnmarkedStates[h] = t + } + tabOfState := [256]string{} + for v, t := range tranTabOfState { + if t == nil { + continue + } + tabOfState[v] = t.hash() + } + tranTab[hash] = tabOfState + } + unmarkedStates = nextUnmarkedStates + } + } + + accTab := map[string]spec.LexModeKindID{} + { + for h, s := range stateMap { + for _, pos := range s.set() { + if !pos.isEndMark() { + continue + } + priorID, ok := accTab[h] + if !ok { + accTab[h] = symTab.endPos2ID[pos] + } else { + id := symTab.endPos2ID[pos] + if id < priorID { + accTab[h] = id + } + } + } + } + } + + var states []string + { + for s := range stateMap { + states = append(states, 
s) + } + sort.Slice(states, func(i, j int) bool { + return states[i] < states[j] + }) + } + + return &DFA{ + States: states, + InitialState: initialStateHash, + AcceptingStatesTable: accTab, + TransitionTable: tranTab, + } +} + +func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { + stateHash2ID := map[string]spec.StateID{} + for i, s := range dfa.States { + // Since 0 represents an invalid value in a transition table, + // assign a number greater than or equal to 1 to states. + stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) + } + + acc := make([]spec.LexModeKindID, len(dfa.States)+1) + for _, s := range dfa.States { + id, ok := dfa.AcceptingStatesTable[s] + if !ok { + continue + } + acc[stateHash2ID[s]] = id + } + + rowCount := len(dfa.States) + 1 + colCount := 256 + tran := make([]spec.StateID, rowCount*colCount) + for s, tab := range dfa.TransitionTable { + for v, to := range tab { + tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] + } + } + + return &spec.TransitionTable{ + InitialStateID: stateHash2ID[dfa.InitialState], + AcceptingStates: acc, + UncompressedTransition: tran, + RowCount: rowCount, + ColCount: colCount, + }, nil +} + +type symbolPosition uint16 + +const ( + symbolPositionNil symbolPosition = 0x0000 + + symbolPositionMin uint16 = 0x0001 + symbolPositionMax uint16 = 0x7fff + + symbolPositionMaskSymbol uint16 = 0x0000 + symbolPositionMaskEndMark uint16 = 0x8000 + + symbolPositionMaskValue uint16 = 0x7fff +) + +func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { + if n < symbolPositionMin || n > symbolPositionMax { + return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) + } + if endMark { + return symbolPosition(n | symbolPositionMaskEndMark), nil + } + return symbolPosition(n | symbolPositionMaskSymbol), nil +} + +func (p symbolPosition) String() string { + if p.isEndMark() { + return 
fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) + } + return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) +} + +func (p symbolPosition) isEndMark() bool { + return uint16(p)&symbolPositionMaskEndMark > 1 +} + +func (p symbolPosition) describe() (uint16, bool) { + v := uint16(p) & symbolPositionMaskValue + if p.isEndMark() { + return v, true + } + return v, false +} + +type symbolPositionSet struct { + // `s` represents a set of symbol positions. + // However, immediately after adding a symbol position, the elements may be duplicated. + // When you need an aligned set with no duplicates, you can get such value via the set function. + s []symbolPosition + sorted bool +} + +func newSymbolPositionSet() *symbolPositionSet { + return &symbolPositionSet{ + s: []symbolPosition{}, + sorted: false, + } +} + +func (s *symbolPositionSet) String() string { + if len(s.s) <= 0 { + return "{}" + } + ps := s.sortAndRemoveDuplicates() + var b strings.Builder + fmt.Fprintf(&b, "{") + for i, p := range ps { + if i <= 0 { + fmt.Fprintf(&b, "%v", p) + continue + } + fmt.Fprintf(&b, ", %v", p) + } + fmt.Fprintf(&b, "}") + return b.String() +} + +func (s *symbolPositionSet) set() []symbolPosition { + s.sortAndRemoveDuplicates() + return s.s +} + +func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { + s.s = append(s.s, pos) + s.sorted = false + return s +} + +func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { + s.s = append(s.s, t.s...) + s.sorted = false + return s +} + +func (s *symbolPositionSet) hash() string { + if len(s.s) <= 0 { + return "" + } + sorted := s.sortAndRemoveDuplicates() + var buf []byte + for _, p := range sorted { + b := make([]byte, 8) + binary.PutUvarint(b, uint64(p)) + buf = append(buf, b...) + } + // Convert to a string to be able to use it as a key of a map. + // But note this byte sequence is made from values of symbol positions, + // so this is not a well-formed UTF-8 sequence. 
+ return string(buf) +} + +func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { + if s.sorted { + return s.s + } + + sortSymbolPositions(s.s, 0, len(s.s)-1) + + // Remove duplicates. + lastV := s.s[0] + nextIdx := 1 + for _, v := range s.s[1:] { + if v == lastV { + continue + } + s.s[nextIdx] = v + nextIdx++ + lastV = v + } + s.s = s.s[:nextIdx] + s.sorted = true + + return s.s +} + +// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. +func sortSymbolPositions(ps []symbolPosition, left, right int) { + if left >= right { + return + } + var pivot symbolPosition + { + // Use a median as a pivot. + p1 := ps[left] + p2 := ps[(left+right)/2] + p3 := ps[right] + if p1 > p2 { + p1, p2 = p2, p1 + } + if p2 > p3 { + p2 = p3 + if p1 > p2 { + p2 = p1 + } + } + pivot = p2 + } + i := left + j := right + for i <= j { + for ps[i] < pivot { + i++ + } + for ps[j] > pivot { + j-- + } + if i <= j { + ps[i], ps[j] = ps[j], ps[i] + i++ + j-- + } + } + sortSymbolPositions(ps, left, j) + sortSymbolPositions(ps, i, right) +} + type byteTree interface { fmt.Stringer children() (byteTree, byteTree) diff --git a/src/urubu/grammar/lexical/dfa/dfa.go b/src/urubu/grammar/lexical/dfa/dfa.go deleted file mode 100644 index 48bd8b4..0000000 --- a/src/urubu/grammar/lexical/dfa/dfa.go +++ /dev/null @@ -1,173 +0,0 @@ -package dfa - -import ( - "sort" - - spec "urubu/spec/grammar" -) - -type symbolTable struct { - symPos2Byte map[symbolPosition]byteRange - endPos2ID map[symbolPosition]spec.LexModeKindID -} - -func genSymbolTable(root byteTree) *symbolTable { - symTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{}, - endPos2ID: map[symbolPosition]spec.LexModeKindID{}, - } - return genSymTab(symTab, root) -} - -func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { - if node == nil { - return symTab - } - - switch n := node.(type) { - case *symbolNode: - symTab.symPos2Byte[n.pos] = byteRange{ - from: n.from, - to: n.to, - } - case 
*endMarkerNode: - symTab.endPos2ID[n.pos] = n.id - default: - left, right := node.children() - genSymTab(symTab, left) - genSymTab(symTab, right) - } - return symTab -} - -type DFA struct { - States []string - InitialState string - AcceptingStatesTable map[string]spec.LexModeKindID - TransitionTable map[string][256]string -} - -func GenDFA(root byteTree, symTab *symbolTable) *DFA { - initialState := root.first() - initialStateHash := initialState.hash() - stateMap := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - tranTab := map[string][256]string{} - { - follow := genFollowTable(root) - unmarkedStates := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - for len(unmarkedStates) > 0 { - nextUnmarkedStates := map[string]*symbolPositionSet{} - for hash, state := range unmarkedStates { - tranTabOfState := [256]*symbolPositionSet{} - for _, pos := range state.set() { - if pos.isEndMark() { - continue - } - valRange := symTab.symPos2Byte[pos] - for symVal := valRange.from; symVal <= valRange.to; symVal++ { - if tranTabOfState[symVal] == nil { - tranTabOfState[symVal] = newSymbolPositionSet() - } - tranTabOfState[symVal].merge(follow[pos]) - } - } - for _, t := range tranTabOfState { - if t == nil { - continue - } - h := t.hash() - if _, ok := stateMap[h]; ok { - continue - } - stateMap[h] = t - nextUnmarkedStates[h] = t - } - tabOfState := [256]string{} - for v, t := range tranTabOfState { - if t == nil { - continue - } - tabOfState[v] = t.hash() - } - tranTab[hash] = tabOfState - } - unmarkedStates = nextUnmarkedStates - } - } - - accTab := map[string]spec.LexModeKindID{} - { - for h, s := range stateMap { - for _, pos := range s.set() { - if !pos.isEndMark() { - continue - } - priorID, ok := accTab[h] - if !ok { - accTab[h] = symTab.endPos2ID[pos] - } else { - id := symTab.endPos2ID[pos] - if id < priorID { - accTab[h] = id - } - } - } - } - } - - var states []string - { - for s := range stateMap { - states = append(states, 
s) - } - sort.Slice(states, func(i, j int) bool { - return states[i] < states[j] - }) - } - - return &DFA{ - States: states, - InitialState: initialStateHash, - AcceptingStatesTable: accTab, - TransitionTable: tranTab, - } -} - -func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { - stateHash2ID := map[string]spec.StateID{} - for i, s := range dfa.States { - // Since 0 represents an invalid value in a transition table, - // assign a number greater than or equal to 1 to states. - stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) - } - - acc := make([]spec.LexModeKindID, len(dfa.States)+1) - for _, s := range dfa.States { - id, ok := dfa.AcceptingStatesTable[s] - if !ok { - continue - } - acc[stateHash2ID[s]] = id - } - - rowCount := len(dfa.States) + 1 - colCount := 256 - tran := make([]spec.StateID, rowCount*colCount) - for s, tab := range dfa.TransitionTable { - for v, to := range tab { - tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] - } - } - - return &spec.TransitionTable{ - InitialStateID: stateHash2ID[dfa.InitialState], - AcceptingStates: acc, - UncompressedTransition: tran, - RowCount: rowCount, - ColCount: colCount, - }, nil -} diff --git a/src/urubu/grammar/lexical/dfa/symbol_position.go b/src/urubu/grammar/lexical/dfa/symbol_position.go deleted file mode 100644 index f154251..0000000 --- a/src/urubu/grammar/lexical/dfa/symbol_position.go +++ /dev/null @@ -1,182 +0,0 @@ -package dfa - -import ( - "encoding/binary" - "fmt" - "strings" -) - -type symbolPosition uint16 - -const ( - symbolPositionNil symbolPosition = 0x0000 - - symbolPositionMin uint16 = 0x0001 - symbolPositionMax uint16 = 0x7fff - - symbolPositionMaskSymbol uint16 = 0x0000 - symbolPositionMaskEndMark uint16 = 0x8000 - - symbolPositionMaskValue uint16 = 0x7fff -) - -func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { - if n < symbolPositionMin || n > symbolPositionMax { - return symbolPositionNil, fmt.Errorf("symbol position must be within %v to 
%v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) - } - if endMark { - return symbolPosition(n | symbolPositionMaskEndMark), nil - } - return symbolPosition(n | symbolPositionMaskSymbol), nil -} - -func (p symbolPosition) String() string { - if p.isEndMark() { - return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) - } - return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) -} - -func (p symbolPosition) isEndMark() bool { - return uint16(p)&symbolPositionMaskEndMark > 1 -} - -func (p symbolPosition) describe() (uint16, bool) { - v := uint16(p) & symbolPositionMaskValue - if p.isEndMark() { - return v, true - } - return v, false -} - -type symbolPositionSet struct { - // `s` represents a set of symbol positions. - // However, immediately after adding a symbol position, the elements may be duplicated. - // When you need an aligned set with no duplicates, you can get such value via the set function. - s []symbolPosition - sorted bool -} - -func newSymbolPositionSet() *symbolPositionSet { - return &symbolPositionSet{ - s: []symbolPosition{}, - sorted: false, - } -} - -func (s *symbolPositionSet) String() string { - if len(s.s) <= 0 { - return "{}" - } - ps := s.sortAndRemoveDuplicates() - var b strings.Builder - fmt.Fprintf(&b, "{") - for i, p := range ps { - if i <= 0 { - fmt.Fprintf(&b, "%v", p) - continue - } - fmt.Fprintf(&b, ", %v", p) - } - fmt.Fprintf(&b, "}") - return b.String() -} - -func (s *symbolPositionSet) set() []symbolPosition { - s.sortAndRemoveDuplicates() - return s.s -} - -func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { - s.s = append(s.s, pos) - s.sorted = false - return s -} - -func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { - s.s = append(s.s, t.s...) 
- s.sorted = false - return s -} - -func (s *symbolPositionSet) hash() string { - if len(s.s) <= 0 { - return "" - } - sorted := s.sortAndRemoveDuplicates() - var buf []byte - for _, p := range sorted { - b := make([]byte, 8) - binary.PutUvarint(b, uint64(p)) - buf = append(buf, b...) - } - // Convert to a string to be able to use it as a key of a map. - // But note this byte sequence is made from values of symbol positions, - // so this is not a well-formed UTF-8 sequence. - return string(buf) -} - -func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { - if s.sorted { - return s.s - } - - sortSymbolPositions(s.s, 0, len(s.s)-1) - - // Remove duplicates. - lastV := s.s[0] - nextIdx := 1 - for _, v := range s.s[1:] { - if v == lastV { - continue - } - s.s[nextIdx] = v - nextIdx++ - lastV = v - } - s.s = s.s[:nextIdx] - s.sorted = true - - return s.s -} - -// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. -func sortSymbolPositions(ps []symbolPosition, left, right int) { - if left >= right { - return - } - var pivot symbolPosition - { - // Use a median as a pivot. 
- p1 := ps[left] - p2 := ps[(left+right)/2] - p3 := ps[right] - if p1 > p2 { - p1, p2 = p2, p1 - } - if p2 > p3 { - p2 = p3 - if p1 > p2 { - p2 = p1 - } - } - pivot = p2 - } - i := left - j := right - for i <= j { - for ps[i] < pivot { - i++ - } - for ps[j] > pivot { - j-- - } - if i <= j { - ps[i], ps[j] = ps[j], ps[i] - i++ - j-- - } - } - sortSymbolPositions(ps, left, j) - sortSymbolPositions(ps, i, right) -} diff --git a/src/urubu/grammar/lexical/entry.go b/src/urubu/grammar/lexical/entry.go deleted file mode 100644 index 44af8ea..0000000 --- a/src/urubu/grammar/lexical/entry.go +++ /dev/null @@ -1,171 +0,0 @@ -package lexical - -import ( - "fmt" - "sort" - "strings" - - spec "urubu/spec/grammar" -) - -type LexEntry struct { - Kind spec.LexKindName - Pattern string - Modes []spec.LexModeName - Push spec.LexModeName - Pop bool - Fragment bool -} - -type LexSpec struct { - Entries []*LexEntry -} - -func (s *LexSpec) Validate() error { - if len(s.Entries) <= 0 { - return fmt.Errorf("the lexical specification must have at least one entry") - } - { - ks := map[string]struct{}{} - fks := map[string]struct{}{} - for _, e := range s.Entries { - // Allow duplicate names between fragments and non-fragments. - if e.Fragment { - if _, exist := fks[e.Kind.String()]; exist { - return fmt.Errorf("kinds `%v` are duplicates", e.Kind) - } - fks[e.Kind.String()] = struct{}{} - } else { - if _, exist := ks[e.Kind.String()]; exist { - return fmt.Errorf("kinds `%v` are duplicates", e.Kind) - } - ks[e.Kind.String()] = struct{}{} - } - } - } - { - kinds := []string{} - modes := []string{ - spec.LexModeNameDefault.String(), // This is a predefined mode. 
- } - for _, e := range s.Entries { - if e.Fragment { - continue - } - - kinds = append(kinds, e.Kind.String()) - - for _, m := range e.Modes { - modes = append(modes, m.String()) - } - } - - kindErrs := findSpellingInconsistenciesErrors(kinds, nil) - modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { - if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { - var b strings.Builder - fmt.Fprintf(&b, "%+v", ids[0]) - for _, id := range ids[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) - } - return nil - }) - errs := append(kindErrs, modeErrs...) - if len(errs) > 0 { - var b strings.Builder - fmt.Fprintf(&b, "%v", errs[0]) - for _, err := range errs[1:] { - fmt.Fprintf(&b, "\n%v", err) - } - return fmt.Errorf(b.String()) - } - } - - return nil -} - -func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { - duplicated := FindSpellingInconsistencies(ids) - if len(duplicated) == 0 { - return nil - } - - var errs []error - for _, dup := range duplicated { - if hook != nil { - err := hook(dup) - if err != nil { - errs = append(errs, err) - continue - } - } - - var b strings.Builder - fmt.Fprintf(&b, "%+v", dup[0]) - for _, id := range dup[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) - errs = append(errs, err) - } - - return errs -} - -// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same -// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same -// in UpperCamelCase. Thus they are considere to be spelling inconsistency. 
-func FindSpellingInconsistencies(ids []string) [][]string { - m := map[string][]string{} - for _, id := range removeDuplicates(ids) { - c := SnakeCaseToUpperCamelCase(id) - m[c] = append(m[c], id) - } - - var duplicated [][]string - for _, camels := range m { - if len(camels) == 1 { - continue - } - duplicated = append(duplicated, camels) - } - - for _, dup := range duplicated { - sort.Slice(dup, func(i, j int) bool { - return dup[i] < dup[j] - }) - } - sort.Slice(duplicated, func(i, j int) bool { - return duplicated[i][0] < duplicated[j][0] - }) - - return duplicated -} - -func removeDuplicates(s []string) []string { - m := map[string]struct{}{} - for _, v := range s { - m[v] = struct{}{} - } - - var unique []string - for v := range m { - unique = append(unique, v) - } - - return unique -} - -func SnakeCaseToUpperCamelCase(snake string) string { - elems := strings.Split(snake, "_") - for i, e := range elems { - if len(e) == 0 { - continue - } - elems[i] = strings.ToUpper(string(e[0])) + e[1:] - } - - return strings.Join(elems, "") -} diff --git a/src/urubu/grammar/lexical/parser.go b/src/urubu/grammar/lexical/parser.go new file mode 100644 index 0000000..748e8fe --- /dev/null +++ b/src/urubu/grammar/lexical/parser.go @@ -0,0 +1,1668 @@ +package parser + +import ( + "bufio" + "bytes" + "fmt" + "io" + "sort" + "strconv" + "strings" + + spec "urubu/spec/grammar" + "urubu/ucd" +) + +var ( + ParseErr = fmt.Errorf("parse error") + + // lexical errors + synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") + synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") + synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") + synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") + SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") + + // syntax errors + synErrUnexpectedToken = fmt.Errorf("unexpected token") + synErrNullPattern = fmt.Errorf("a pattern 
must be a non-empty byte sequence") + synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") + synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") + synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") + synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") + synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") + synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") + synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") + synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") + synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") + synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") + synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") + synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") + synErrRangeInvalidForm = fmt.Errorf("invalid range expression") + synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") + synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") + synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") + synErrCharPropUnsupported = fmt.Errorf("unsupported character property") + synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") +) + +type incompleteFragment struct { + kind spec.LexKindName + root *rootNode +} + +func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { + if len(fragments) == 0 { + return nil + } + + completeFragments := map[spec.LexKindName]CPTree{} + incompleteFragments := []*incompleteFragment{} + for kind, tree := range fragments { + root, ok := tree.(*rootNode) + if !ok { + return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) + } + if root.incomplete() { + incompleteFragments = append(incompleteFragments, 
&incompleteFragment{ + kind: kind, + root: root, + }) + } else { + completeFragments[kind] = root + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + complete, err := ApplyFragments(e.root, completeFragments) + if err != nil { + return err + } + if !complete { + remainingFragments = append(remainingFragments, e) + } else { + completeFragments[e.kind] = e.root + } + } + incompleteFragments = remainingFragments + if len(incompleteFragments) == lastIncompCount { + return ParseErr + } + } + + return nil +} + +func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { + root, ok := t.(*rootNode) + if !ok { + return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) + } + + for name, frag := range fragments { + err := root.applyFragment(name, frag) + if err != nil { + return false, err + } + } + + return !root.incomplete(), nil +} + +type tokenKind string + +const ( + tokenKindChar tokenKind = "char" + tokenKindAnyChar tokenKind = "." + tokenKindRepeat tokenKind = "*" + tokenKindRepeatOneOrMore tokenKind = "+" + tokenKindOption tokenKind = "?" 
+ tokenKindAlt tokenKind = "|" + tokenKindGroupOpen tokenKind = "(" + tokenKindGroupClose tokenKind = ")" + tokenKindBExpOpen tokenKind = "[" + tokenKindInverseBExpOpen tokenKind = "[^" + tokenKindBExpClose tokenKind = "]" + tokenKindCharRange tokenKind = "-" + tokenKindCodePointLeader tokenKind = "\\u" + tokenKindCharPropLeader tokenKind = "\\p" + tokenKindFragmentLeader tokenKind = "\\f" + tokenKindLBrace tokenKind = "{" + tokenKindRBrace tokenKind = "}" + tokenKindEqual tokenKind = "=" + tokenKindCodePoint tokenKind = "code point" + tokenKindCharPropSymbol tokenKind = "character property symbol" + tokenKindFragmentSymbol tokenKind = "fragment symbol" + tokenKindEOF tokenKind = "eof" +) + +type token struct { + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + +type lexerMode string + +const ( + lexerModeDefault lexerMode = "default" + lexerModeBExp lexerMode = "bracket expression" + lexerModeCPExp lexerMode = "code point expression" + lexerModeCharPropExp lexerMode = "character property expression" + lexerModeFragmentExp lexerMode = "fragment expression" +) + +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + 
s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + +type rangeState string + +// [a-z] +// ^^^^ +// |||`-- ready +// ||`-- expect range terminator +// |`-- read range initiator +// `-- ready +const ( + rangeStateReady rangeState = "ready" + rangeStateReadRangeInitiator rangeState = "read range initiator" + rangeStateExpectRangeTerminator rangeState = "expect range terminator" +) + +type lexer struct { + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + modeStack *lexerModeStack + rangeState rangeState + + errCause error + errDetail string +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + peekChar2: nullChar, + peekEOF2: false, + peekChar1: nullChar, + peekEOF1: false, + lastChar: nullChar, + reachedEOF: false, + prevChar1: nullChar, + prevEOF1: false, + prevChar2: nullChar, + pervEOF2: false, + modeStack: newLexerModeStack(), + rangeState: rangeStateReady, + } +} + +func (l *lexer) error() (string, error) { + return l.errDetail, l.errCause +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch l.modeStack.top() { + case lexerModeBExp: + tok, err := l.nextInBExp(c) + if err != nil { + return nil, err + } + if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { + switch l.rangeState { + case rangeStateReady: + l.rangeState = rangeStateReadRangeInitiator + case rangeStateExpectRangeTerminator: + l.rangeState = rangeStateReady + } + } + switch tok.kind { + case tokenKindBExpClose: + l.modeStack.pop() + case tokenKindCharRange: + l.rangeState = rangeStateExpectRangeTerminator + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case 
tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + default: + tok, err := l.nextInDefault(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindInverseBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) + } + return tok, nil + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '+': + return newToken(tokenKindRepeatOneOrMore, nullChar), nil + case '?': + return newToken(tokenKindOption, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '[': + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + if c1 != '^' { + err := l.restore() + if err != nil { + return nil, err + } + 
return newToken(tokenKindBExpOpen, nullChar), nil + } + c2, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + if c2 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' 
|| c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case '-': + if l.rangeState != rangeStateReadRangeInitiator { + return newToken(tokenKindChar, c), nil + } + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + } + if c1 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindCharRange, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == '\\' || c == '^' || c == '-' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + 
if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + +func (l *lexer) nextInCharProp(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + case '=': + return newToken(tokenKindEqual, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' || c == '=' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = synErrCharPropInvalidSymbol + return nil, ParseErr + } + return newCharPropSymbolToken(sym), nil + } +} + +func (l *lexer) nextInFragment(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + 
} + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = SynErrFragmentInvalidSymbol + return nil, ParseErr + } + return newFragmentSymbolToken(sym), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + if l.reachedEOF { + return l.lastChar, l.reachedEOF, nil + } + if l.peekChar1 != nullChar || l.peekEOF1 { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = l.peekChar1 + l.reachedEOF = l.peekEOF1 + l.peekChar1 = l.peekChar2 + l.peekEOF1 = l.peekEOF2 + l.peekChar2 = nullChar + l.peekEOF2 = false + return l.lastChar, l.reachedEOF, nil + } + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = nullChar + l.reachedEOF = true + return l.lastChar, l.reachedEOF, nil + } + return nullChar, false, err + } + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = c + l.reachedEOF = false + return l.lastChar, l.reachedEOF, nil +} + +func (l *lexer) restore() error { + if l.lastChar == nullChar && !l.reachedEOF { + return fmt.Errorf("failed to call restore() because the last character is null") + } + l.peekChar2 = l.peekChar1 + l.peekEOF2 = l.peekEOF1 + l.peekChar1 = l.lastChar + l.peekEOF1 = l.reachedEOF + l.lastChar = l.prevChar1 + l.reachedEOF = l.prevEOF1 + l.prevChar1 = l.prevChar2 + l.prevEOF1 = l.pervEOF2 + l.prevChar2 = nullChar + l.pervEOF2 = false + return nil +} + +type PatternEntry struct { + ID spec.LexModeKindID + Pattern []byte +} + +type parser struct { + kind spec.LexKindName + lex *lexer + peekedTok *token + lastTok *token + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. 
+ // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. + // > * Contributory properties are not recommended for public APIs. + // > ... + // https://unicode.org/reports/tr44/#Property_APIs + isContributoryPropertyExposed bool + + errCause error + errDetail string +} + +func NewParser(kind spec.LexKindName, src io.Reader) *parser { + return &parser{ + kind: kind, + lex: newLexer(src), + isContributoryPropertyExposed: false, + } +} + +func (p *parser) exposeContributoryProperty() { + p.isContributoryPropertyExposed = true +} + +func (p *parser) Error() (string, error) { + return p.errDetail, p.errCause +} + +func (p *parser) Parse() (root CPTree, retErr error) { + defer func() { + err := recover() + if err != nil { + var ok bool + retErr, ok = err.(error) + if !ok { + panic(err) + } + return + } + }() + + return newRootNode(p.kind, p.parseRegexp()), nil +} + +func (p *parser) parseRegexp() CPTree { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.raiseParseError(synErrNullPattern, "") + } + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.expect(tokenKindEOF) + return alt +} + +func (p *parser) parseAlt() 
CPTree { + left := p.parseConcat() + if left == nil { + if p.consume(tokenKindAlt) { + p.raiseParseError(synErrAltLackOfOperand, "") + } + return nil + } + for { + if !p.consume(tokenKindAlt) { + break + } + right := p.parseConcat() + if right == nil { + p.raiseParseError(synErrAltLackOfOperand, "") + } + left = newAltNode(left, right) + } + return left +} + +func (p *parser) parseConcat() CPTree { + left := p.parseRepeat() + for { + right := p.parseRepeat() + if right == nil { + break + } + left = newConcatNode(left, right) + } + return left +} + +func (p *parser) parseRepeat() CPTree { + group := p.parseGroup() + if group == nil { + if p.consume(tokenKindRepeat) { + p.raiseParseError(synErrRepNoTarget, "* needs an operand") + } + if p.consume(tokenKindRepeatOneOrMore) { + p.raiseParseError(synErrRepNoTarget, "+ needs an operand") + } + if p.consume(tokenKindOption) { + p.raiseParseError(synErrRepNoTarget, "? needs an operand") + } + return nil + } + if p.consume(tokenKindRepeat) { + return newRepeatNode(group) + } + if p.consume(tokenKindRepeatOneOrMore) { + return newRepeatOneOrMoreNode(group) + } + if p.consume(tokenKindOption) { + return newOptionNode(group) + } + return group +} + +func (p *parser) parseGroup() CPTree { + if p.consume(tokenKindGroupOpen) { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + p.raiseParseError(synErrGroupNoElem, "") + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + if !p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupInvalidForm, "") + } + return alt + } + return p.parseSingleChar() +} + +func (p *parser) parseSingleChar() CPTree { + if p.consume(tokenKindAnyChar) { + return genAnyCharAST() + } + if p.consume(tokenKindBExpOpen) { + left := p.parseBExpElem() + if left == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + for { + 
right := p.parseBExpElem() + if right == nil { + break + } + left = newAltNode(left, right) + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return left + } + if p.consume(tokenKindInverseBExpOpen) { + elem := p.parseBExpElem() + if elem == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + inverse := exclude(elem, genAnyCharAST()) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for { + elem := p.parseBExpElem() + if elem == nil { + break + } + inverse = exclude(elem, inverse) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return inverse + } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } + if p.consume(tokenKindFragmentLeader) { + return p.parseFragment() + } + c := p.parseNormalChar() + if c == nil { + if p.consume(tokenKindBExpClose) { + p.raiseParseError(synErrBExpInvalidForm, "") + } + return nil + } + return c +} + +func (p *parser) parseBExpElem() CPTree { + var left CPTree + switch { + case p.consume(tokenKindCodePointLeader): + left = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + left = p.parseCharProp() + if p.consume(tokenKindCharRange) { + p.raiseParseError(synErrRangePropIsUnavailable, "") + } + default: + left = p.parseNormalChar() + } + if left == nil { + return nil + } + if !p.consume(tokenKindCharRange) { + return left + } + var right CPTree + switch { + case p.consume(tokenKindCodePointLeader): + right = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + p.raiseParseError(synErrRangePropIsUnavailable, "") + default: + right = p.parseNormalChar() + } + if right == nil { + 
p.raiseParseError(synErrRangeInvalidForm, "") + } + from, _, _ := left.Range() + _, to, _ := right.Range() + if !isValidOrder(from, to) { + p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) + } + return newRangeSymbolNode(from, to) +} + +func (p *parser) parseCodePoint() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + if !p.consume(tokenKindCodePoint) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) + } + if n < 0x0000 || n > 0x10FFFF { + p.raiseParseError(synErrCPExpOutOfRange, "") + } + + sym := newSymbolNode(rune(n)) + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + return sym +} + +func (p *parser) parseCharProp() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym2 = p.lastTok.propSymbol + } + + var alt CPTree + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "" + propVal = sym1 + } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.raiseParseError(synErrCharPropUnsupported, propName) + } + pat, err := ucd.NormalizeCharacterProperty(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if pat != "" { + p := NewParser(p.kind, bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() + ast, err := p.Parse() + if err != nil { + panic(err) + } + alt = ast + } else { + cpRanges, inverse, 
err := ucd.FindCodePointRanges(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if inverse { + r := cpRanges[0] + alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for _, r := range cpRanges[1:] { + alt = exclude(newRangeSymbolNode(r.From, r.To), alt) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + } else { + for _, r := range cpRanges { + alt = genAltNode( + alt, + newRangeSymbolNode(r.From, r.To), + ) + } + } + } + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + + return alt +} + +func (p *parser) parseFragment() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + if !p.consume(tokenKindFragmentSymbol) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + sym := p.lastTok.fragmentSymbol + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + + return newFragmentNode(spec.LexKindName(sym), nil) +} + +func (p *parser) parseNormalChar() CPTree { + if !p.consume(tokenKindChar) { + return nil + } + return newSymbolNode(p.lastTok.char) +} + +func exclude(symbol, base CPTree) CPTree { + if left, right, ok := symbol.Alternatives(); ok { + return exclude(right, exclude(left, base)) + } + + if left, right, ok := base.Alternatives(); ok { + return genAltNode( + exclude(symbol, left), + exclude(symbol, right), + ) + } + + if bFrom, bTo, ok := base.Range(); ok { + sFrom, sTo, ok := symbol.Range() + if !ok { + panic(fmt.Errorf("invalid symbol tree: %T", symbol)) + } + + switch { + case sFrom > bFrom && sTo < bTo: + return genAltNode( + newRangeSymbolNode(bFrom, sFrom-1), + newRangeSymbolNode(sTo+1, bTo), + ) + case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: + return newRangeSymbolNode(sTo+1, bTo) + case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: + return 
newRangeSymbolNode(bFrom, sFrom-1) + case sFrom <= bFrom && sTo >= bTo: + return nil + default: + return base + } + } + + panic(fmt.Errorf("invalid base tree: %T", base)) +} + +func genAnyCharAST() CPTree { + return newRangeSymbolNode(0x0, 0x10FFFF) +} + +func isValidOrder(from, to rune) bool { + return from <= to +} + +func genConcatNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + concat = newConcatNode(concat, c) + } + return concat +} + +func genAltNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + alt = newAltNode(alt, c) + } + return alt +} + +func (p *parser) expect(expected tokenKind) { + if !p.consume(expected) { + tok := p.peekedTok + p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) + } +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + if err == ParseErr { + detail, cause := p.lex.error() + p.raiseParseError(cause, detail) + } + panic(err) + } + } + p.lastTok = tok + if tok.kind == expected { + return true + } + p.peekedTok = tok + p.lastTok = nil + + return false +} + +func (p *parser) raiseParseError(err error, detail string) { + p.errCause = err + p.errDetail = detail + panic(ParseErr) +} + +type CPRange struct { + From rune + To rune +} + +type 
CPTree interface { + fmt.Stringer + Range() (rune, rune, bool) + Optional() (CPTree, bool) + Repeatable() (CPTree, bool) + Concatenation() (CPTree, CPTree, bool) + Alternatives() (CPTree, CPTree, bool) + Describe() (spec.LexKindName, []spec.LexKindName, error) + + children() (CPTree, CPTree) + clone() CPTree +} + +var ( + _ CPTree = &rootNode{} + _ CPTree = &symbolNode{} + _ CPTree = &concatNode{} + _ CPTree = &altNode{} + _ CPTree = &quantifierNode{} + _ CPTree = &fragmentNode{} +) + +type rootNode struct { + kind spec.LexKindName + tree CPTree + fragments map[spec.LexKindName][]*fragmentNode +} + +func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { + fragments := map[spec.LexKindName][]*fragmentNode{} + collectFragments(t, fragments) + + return &rootNode{ + kind: kind, + tree: t, + fragments: fragments, + } +} + +func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { + if n == nil { + return + } + + if f, ok := n.(*fragmentNode); ok { + fragments[f.kind] = append(fragments[f.kind], f) + return + } + + l, r := n.children() + collectFragments(l, fragments) + collectFragments(r, fragments) +} + +func (n *rootNode) String() string { + return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) +} + +func (n *rootNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *rootNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *rootNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + var frags []spec.LexKindName + for f := range n.fragments { + frags = append(frags, spec.LexKindName(f)) + } + sort.Slice(frags, func(i, j int) bool { + return frags[i] < frags[j] + }) + + return n.kind, frags, nil +} 
+ +func (n *rootNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *rootNode) clone() CPTree { + return n.tree.clone() +} + +func (n *rootNode) incomplete() bool { + return len(n.fragments) > 0 +} + +func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { + root, ok := fragment.(*rootNode) + if !ok { + return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) + } + if root.incomplete() { + return fmt.Errorf("fragment is incomplete") + } + + fs, ok := n.fragments[kind] + if !ok { + return nil + } + for _, f := range fs { + f.tree = root.clone() + } + delete(n.fragments, kind) + + return nil +} + +type symbolNode struct { + CPRange +} + +func newSymbolNode(cp rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: cp, + To: cp, + }, + } +} + +func newRangeSymbolNode(from, to rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: from, + To: to, + }, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: %X..%X", n.From, n.To) +} + +func (n *symbolNode) Range() (rune, rune, bool) { + return n.From, n.To, true +} + +func (n *symbolNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *symbolNode) children() (CPTree, CPTree) { + return nil, nil +} + +func (n *symbolNode) clone() CPTree { + return newRangeSymbolNode(n.From, n.To) +} + +type concatNode struct { + left CPTree + right CPTree +} + +func newConcatNode(left, right CPTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) 
String() string { + return "concat" +} + +func (n *concatNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *concatNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *concatNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *concatNode) clone() CPTree { + if n == nil { + return nil + } + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left CPTree + right CPTree +} + +func newAltNode(left, right CPTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *altNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *altNode) Alternatives() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *altNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *altNode) clone() CPTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type quantifierNode struct { + optional bool + repeatable bool + tree CPTree +} + +func (n *quantifierNode) String() string { + switch { + case n.repeatable: + return "repeatable (>= 0 times)" + case n.optional: + 
return "optional (0 or 1 times)" + default: + return "invalid quantifier" + } +} + +func newRepeatNode(t CPTree) *quantifierNode { + return &quantifierNode{ + repeatable: true, + tree: t, + } +} + +func newRepeatOneOrMoreNode(t CPTree) *concatNode { + return newConcatNode( + t, + &quantifierNode{ + repeatable: true, + tree: t.clone(), + }) +} + +func newOptionNode(t CPTree) *quantifierNode { + return &quantifierNode{ + optional: true, + tree: t, + } +} + +func (n *quantifierNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *quantifierNode) Optional() (CPTree, bool) { + return n.tree, n.optional +} + +func (n *quantifierNode) Repeatable() (CPTree, bool) { + return n.tree, n.repeatable +} + +func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *quantifierNode) children() (CPTree, CPTree) { + return n.tree, nil +} + +func (n *quantifierNode) clone() CPTree { + if n.repeatable { + return newRepeatNode(n.tree.clone()) + } + return newOptionNode(n.tree.clone()) +} + +type fragmentNode struct { + kind spec.LexKindName + tree CPTree +} + +func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { + return &fragmentNode{ + kind: kind, + tree: t, + } +} + +func (n *fragmentNode) String() string { + return fmt.Sprintf("fragment: %v", n.kind) +} + +func (n *fragmentNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *fragmentNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *fragmentNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *fragmentNode) Alternatives() (CPTree, 
CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *fragmentNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *fragmentNode) clone() CPTree { + if n.tree == nil { + return newFragmentNode(n.kind, nil) + } + return newFragmentNode(n.kind, n.tree.clone()) +} + +//nolint:unused +func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v\n", ruledLine, t) + children := []CPTree{} + switch n := t.(type) { + case *rootNode: + children = append(children, n.tree) + case *fragmentNode: + children = append(children, n.tree) + default: + left, right := t.children() + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } +} diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go deleted file mode 100644 index be81da4..0000000 --- a/src/urubu/grammar/lexical/parser/error.go +++ /dev/null @@ -1,36 +0,0 @@ -package parser - -import "fmt" - -var ( - ParseErr = fmt.Errorf("parse error") - - // lexical errors - synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") - synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") - synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") - synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") - SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") 
- - // syntax errors - synErrUnexpectedToken = fmt.Errorf("unexpected token") - synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") - synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") - synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") - synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") - synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") - synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") - synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") - synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") - synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") - synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") - synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") - synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") - synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") - synErrRangeInvalidForm = fmt.Errorf("invalid range expression") - synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") - synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") - synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") - synErrCharPropUnsupported = fmt.Errorf("unsupported character property") - synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") -) diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go deleted file mode 100644 index 196c00b..0000000 --- a/src/urubu/grammar/lexical/parser/fragment.go +++ /dev/null @@ -1,72 +0,0 @@ -package parser - -import ( - "fmt" - - spec "urubu/spec/grammar" -) - -type incompleteFragment struct { - kind spec.LexKindName - root *rootNode -} - -func CompleteFragments(fragments 
map[spec.LexKindName]CPTree) error { - if len(fragments) == 0 { - return nil - } - - completeFragments := map[spec.LexKindName]CPTree{} - incompleteFragments := []*incompleteFragment{} - for kind, tree := range fragments { - root, ok := tree.(*rootNode) - if !ok { - return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) - } - if root.incomplete() { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - root: root, - }) - } else { - completeFragments[kind] = root - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - complete, err := ApplyFragments(e.root, completeFragments) - if err != nil { - return err - } - if !complete { - remainingFragments = append(remainingFragments, e) - } else { - completeFragments[e.kind] = e.root - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - return ParseErr - } - } - - return nil -} - -func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { - root, ok := t.(*rootNode) - if !ok { - return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) - } - - for name, frag := range fragments { - err := root.applyFragment(name, frag) - if err != nil { - return false, err - } - } - - return !root.incomplete(), nil -} diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go deleted file mode 100644 index 3861825..0000000 --- a/src/urubu/grammar/lexical/parser/lexer.go +++ /dev/null @@ -1,594 +0,0 @@ -package parser - -import ( - "bufio" - "fmt" - "io" - "strings" -) - -type tokenKind string - -const ( - tokenKindChar tokenKind = "char" - tokenKindAnyChar tokenKind = "." - tokenKindRepeat tokenKind = "*" - tokenKindRepeatOneOrMore tokenKind = "+" - tokenKindOption tokenKind = "?" 
- tokenKindAlt tokenKind = "|" - tokenKindGroupOpen tokenKind = "(" - tokenKindGroupClose tokenKind = ")" - tokenKindBExpOpen tokenKind = "[" - tokenKindInverseBExpOpen tokenKind = "[^" - tokenKindBExpClose tokenKind = "]" - tokenKindCharRange tokenKind = "-" - tokenKindCodePointLeader tokenKind = "\\u" - tokenKindCharPropLeader tokenKind = "\\p" - tokenKindFragmentLeader tokenKind = "\\f" - tokenKindLBrace tokenKind = "{" - tokenKindRBrace tokenKind = "}" - tokenKindEqual tokenKind = "=" - tokenKindCodePoint tokenKind = "code point" - tokenKindCharPropSymbol tokenKind = "character property symbol" - tokenKindFragmentSymbol tokenKind = "fragment symbol" - tokenKindEOF tokenKind = "eof" -) - -type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string - fragmentSymbol string -} - -const nullChar = '\u0000' - -func newToken(kind tokenKind, char rune) *token { - return &token{ - kind: kind, - char: char, - } -} - -func newCodePointToken(codePoint string) *token { - return &token{ - kind: tokenKindCodePoint, - codePoint: codePoint, - } -} - -func newCharPropSymbolToken(propSymbol string) *token { - return &token{ - kind: tokenKindCharPropSymbol, - propSymbol: propSymbol, - } -} - -func newFragmentSymbolToken(fragmentSymbol string) *token { - return &token{ - kind: tokenKindFragmentSymbol, - fragmentSymbol: fragmentSymbol, - } -} - -type lexerMode string - -const ( - lexerModeDefault lexerMode = "default" - lexerModeBExp lexerMode = "bracket expression" - lexerModeCPExp lexerMode = "code point expression" - lexerModeCharPropExp lexerMode = "character property expression" - lexerModeFragmentExp lexerMode = "fragment expression" -) - -type lexerModeStack struct { - stack []lexerMode -} - -func newLexerModeStack() *lexerModeStack { - return &lexerModeStack{ - stack: []lexerMode{ - lexerModeDefault, - }, - } -} - -func (s *lexerModeStack) top() lexerMode { - return s.stack[len(s.stack)-1] -} - -func (s *lexerModeStack) push(m lexerMode) { - 
s.stack = append(s.stack, m) -} - -func (s *lexerModeStack) pop() { - s.stack = s.stack[:len(s.stack)-1] -} - -type rangeState string - -// [a-z] -// ^^^^ -// |||`-- ready -// ||`-- expect range terminator -// |`-- read range initiator -// `-- ready -const ( - rangeStateReady rangeState = "ready" - rangeStateReadRangeInitiator rangeState = "read range initiator" - rangeStateExpectRangeTerminator rangeState = "expect range terminator" -) - -type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - modeStack *lexerModeStack - rangeState rangeState - - errCause error - errDetail string -} - -func newLexer(src io.Reader) *lexer { - return &lexer{ - src: bufio.NewReader(src), - peekChar2: nullChar, - peekEOF2: false, - peekChar1: nullChar, - peekEOF1: false, - lastChar: nullChar, - reachedEOF: false, - prevChar1: nullChar, - prevEOF1: false, - prevChar2: nullChar, - pervEOF2: false, - modeStack: newLexerModeStack(), - rangeState: rangeStateReady, - } -} - -func (l *lexer) error() (string, error) { - return l.errDetail, l.errCause -} - -func (l *lexer) next() (*token, error) { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return newToken(tokenKindEOF, nullChar), nil - } - - switch l.modeStack.top() { - case lexerModeBExp: - tok, err := l.nextInBExp(c) - if err != nil { - return nil, err - } - if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { - switch l.rangeState { - case rangeStateReady: - l.rangeState = rangeStateReadRangeInitiator - case rangeStateExpectRangeTerminator: - l.rangeState = rangeStateReady - } - } - switch tok.kind { - case tokenKindBExpClose: - l.modeStack.pop() - case tokenKindCharRange: - l.rangeState = rangeStateExpectRangeTerminator - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case 
tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - } - return tok, nil - case lexerModeCPExp: - tok, err := l.nextInCodePoint(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeCharPropExp: - tok, err := l.nextInCharProp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeFragmentExp: - tok, err := l.nextInFragment(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - default: - tok, err := l.nextInDefault(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindInverseBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - case tokenKindFragmentLeader: - l.modeStack.push(lexerModeFragmentExp) - } - return tok, nil - } -} - -func (l *lexer) nextInDefault(c rune) (*token, error) { - switch c { - case '*': - return newToken(tokenKindRepeat, nullChar), nil - case '+': - return newToken(tokenKindRepeatOneOrMore, nullChar), nil - case '?': - return newToken(tokenKindOption, nullChar), nil - case '.': - return newToken(tokenKindAnyChar, nullChar), nil - case '|': - return newToken(tokenKindAlt, nullChar), nil - case '(': - return newToken(tokenKindGroupOpen, nullChar), nil - case ')': - return newToken(tokenKindGroupClose, nullChar), nil - case '[': - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - if c1 != '^' { - err := l.restore() - if err != nil { - return nil, err - } - 
return newToken(tokenKindBExpOpen, nullChar), nil - } - c2, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - if c2 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == 'f' { - return newToken(tokenKindFragmentLeader, nullChar), nil - } - if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' 
|| c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInBExp(c rune) (*token, error) { - switch c { - case '-': - if l.rangeState != rangeStateReadRangeInitiator { - return newToken(tokenKindChar, c), nil - } - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - } - if c1 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindCharRange, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == '\\' || c == '^' || c == '-' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInCodePoint(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - if !isHexDigit(c) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - 
if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if !isHexDigit(c) || n >= 6 { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - fmt.Fprint(&b, string(c)) - n++ - } - cp := b.String() - cpLen := len(cp) - if !(cpLen == 4 || cpLen == 6) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - return newCodePointToken(b.String()), nil - } -} - -func isHexDigit(c rune) bool { - if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { - return true - } - return false -} - -func (l *lexer) nextInCharProp(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - case '=': - return newToken(tokenKindEqual, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' || c == '=' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = synErrCharPropInvalidSymbol - return nil, ParseErr - } - return newCharPropSymbolToken(sym), nil - } -} - -func (l *lexer) nextInFragment(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - 
} - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = SynErrFragmentInvalidSymbol - return nil, ParseErr - } - return newFragmentSymbolToken(sym), nil - } -} - -func (l *lexer) read() (rune, bool, error) { - if l.reachedEOF { - return l.lastChar, l.reachedEOF, nil - } - if l.peekChar1 != nullChar || l.peekEOF1 { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = l.peekChar1 - l.reachedEOF = l.peekEOF1 - l.peekChar1 = l.peekChar2 - l.peekEOF1 = l.peekEOF2 - l.peekChar2 = nullChar - l.peekEOF2 = false - return l.lastChar, l.reachedEOF, nil - } - c, _, err := l.src.ReadRune() - if err != nil { - if err == io.EOF { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = nullChar - l.reachedEOF = true - return l.lastChar, l.reachedEOF, nil - } - return nullChar, false, err - } - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = c - l.reachedEOF = false - return l.lastChar, l.reachedEOF, nil -} - -func (l *lexer) restore() error { - if l.lastChar == nullChar && !l.reachedEOF { - return fmt.Errorf("failed to call restore() because the last character is null") - } - l.peekChar2 = l.peekChar1 - l.peekEOF2 = l.peekEOF1 - l.peekChar1 = l.lastChar - l.peekEOF1 = l.reachedEOF - l.lastChar = l.prevChar1 - l.reachedEOF = l.prevEOF1 - l.prevChar1 = l.prevChar2 - l.prevEOF1 = l.pervEOF2 - l.prevChar2 = nullChar - l.pervEOF2 = false - return nil -} diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go deleted file mode 100644 index 425b553..0000000 --- a/src/urubu/grammar/lexical/parser/parser.go +++ /dev/null @@ -1,531 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "strconv" - - spec "urubu/spec/grammar" - "urubu/ucd" -) - -type PatternEntry struct { - ID spec.LexModeKindID - Pattern 
[]byte -} - -type parser struct { - kind spec.LexKindName - lex *lexer - peekedTok *token - lastTok *token - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... 
- // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool - - errCause error - errDetail string -} - -func NewParser(kind spec.LexKindName, src io.Reader) *parser { - return &parser{ - kind: kind, - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) Error() (string, error) { - return p.errDetail, p.errCause -} - -func (p *parser) Parse() (root CPTree, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - panic(err) - } - return - } - }() - - return newRootNode(p.kind, p.parseRegexp()), nil -} - -func (p *parser) parseRegexp() CPTree { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.raiseParseError(synErrNullPattern, "") - } - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.expect(tokenKindEOF) - return alt -} - -func (p *parser) parseAlt() CPTree { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - p.raiseParseError(synErrAltLackOfOperand, "") - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - p.raiseParseError(synErrAltLackOfOperand, "") - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() CPTree { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() CPTree { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.raiseParseError(synErrRepNoTarget, "* needs an operand") - } - if p.consume(tokenKindRepeatOneOrMore) { - p.raiseParseError(synErrRepNoTarget, "+ needs an operand") - } - if p.consume(tokenKindOption) { - 
p.raiseParseError(synErrRepNoTarget, "? needs an operand") - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() CPTree { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - p.raiseParseError(synErrGroupNoElem, "") - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - if !p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupInvalidForm, "") - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() CPTree { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return inverse - } - if 
p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - p.raiseParseError(synErrBExpInvalidForm, "") - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() CPTree { - var left CPTree - switch { - case p.consume(tokenKindCodePointLeader): - left = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - left = p.parseCharProp() - if p.consume(tokenKindCharRange) { - p.raiseParseError(synErrRangePropIsUnavailable, "") - } - default: - left = p.parseNormalChar() - } - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - var right CPTree - switch { - case p.consume(tokenKindCodePointLeader): - right = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - p.raiseParseError(synErrRangePropIsUnavailable, "") - default: - right = p.parseNormalChar() - } - if right == nil { - p.raiseParseError(synErrRangeInvalidForm, "") - } - from, _, _ := left.Range() - _, to, _ := right.Range() - if !isValidOrder(from, to) { - p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) - } - return newRangeSymbolNode(from, to) -} - -func (p *parser) parseCodePoint() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - if !p.consume(tokenKindCodePoint) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) - } - if n < 0x0000 || n > 0x10FFFF { - p.raiseParseError(synErrCPExpOutOfRange, "") - } - - sym := newSymbolNode(rune(n)) - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - return sym -} - -func (p 
*parser) parseCharProp() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym2 = p.lastTok.propSymbol - } - - var alt CPTree - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.raiseParseError(synErrCharPropUnsupported, propName) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if pat != "" { - p := NewParser(p.kind, bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.Parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if inverse { - r := cpRanges[0] - alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for _, r := range cpRanges[1:] { - alt = exclude(newRangeSymbolNode(r.From, r.To), alt) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - } else { - for _, r := range cpRanges { - alt = genAltNode( - alt, - newRangeSymbolNode(r.From, r.To), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - - return alt -} - -func (p *parser) parseFragment() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - if !p.consume(tokenKindFragmentSymbol) { - 
p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - - return newFragmentNode(spec.LexKindName(sym), nil) -} - -func (p *parser) parseNormalChar() CPTree { - if !p.consume(tokenKindChar) { - return nil - } - return newSymbolNode(p.lastTok.char) -} - -func exclude(symbol, base CPTree) CPTree { - if left, right, ok := symbol.Alternatives(); ok { - return exclude(right, exclude(left, base)) - } - - if left, right, ok := base.Alternatives(); ok { - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - } - - if bFrom, bTo, ok := base.Range(); ok { - sFrom, sTo, ok := symbol.Range() - if !ok { - panic(fmt.Errorf("invalid symbol tree: %T", symbol)) - } - - switch { - case sFrom > bFrom && sTo < bTo: - return genAltNode( - newRangeSymbolNode(bFrom, sFrom-1), - newRangeSymbolNode(sTo+1, bTo), - ) - case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: - return newRangeSymbolNode(sTo+1, bTo) - case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: - return newRangeSymbolNode(bFrom, sFrom-1) - case sFrom <= bFrom && sTo >= bTo: - return nil - default: - return base - } - } - - panic(fmt.Errorf("invalid base tree: %T", base)) -} - -func genAnyCharAST() CPTree { - return newRangeSymbolNode(0x0, 0x10FFFF) -} - -func isValidOrder(from, to rune) bool { - return from <= to -} - -func genConcatNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = 
append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - if err == ParseErr { - detail, cause := p.lex.error() - p.raiseParseError(cause, detail) - } - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} - -func (p *parser) raiseParseError(err error, detail string) { - p.errCause = err - p.errDetail = detail - panic(ParseErr) -} diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go deleted file mode 100644 index df03d37..0000000 --- a/src/urubu/grammar/lexical/parser/tree.go +++ /dev/null @@ -1,459 +0,0 @@ -package parser - -import ( - "fmt" - "io" - "sort" - - spec "urubu/spec/grammar" -) - -type CPRange struct { - From rune - To rune -} - -type CPTree interface { - fmt.Stringer - Range() (rune, rune, bool) - Optional() (CPTree, bool) - Repeatable() (CPTree, bool) - Concatenation() (CPTree, CPTree, bool) - Alternatives() (CPTree, CPTree, bool) - Describe() (spec.LexKindName, []spec.LexKindName, error) - - children() (CPTree, CPTree) - clone() CPTree -} - -var ( - _ CPTree = &rootNode{} - _ CPTree = &symbolNode{} - _ CPTree = &concatNode{} - _ CPTree = &altNode{} - _ CPTree = &quantifierNode{} - _ CPTree = &fragmentNode{} -) - -type rootNode struct { - kind spec.LexKindName - tree CPTree - fragments 
map[spec.LexKindName][]*fragmentNode -} - -func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { - fragments := map[spec.LexKindName][]*fragmentNode{} - collectFragments(t, fragments) - - return &rootNode{ - kind: kind, - tree: t, - fragments: fragments, - } -} - -func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { - if n == nil { - return - } - - if f, ok := n.(*fragmentNode); ok { - fragments[f.kind] = append(fragments[f.kind], f) - return - } - - l, r := n.children() - collectFragments(l, fragments) - collectFragments(r, fragments) -} - -func (n *rootNode) String() string { - return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) -} - -func (n *rootNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *rootNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *rootNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - var frags []spec.LexKindName - for f := range n.fragments { - frags = append(frags, spec.LexKindName(f)) - } - sort.Slice(frags, func(i, j int) bool { - return frags[i] < frags[j] - }) - - return n.kind, frags, nil -} - -func (n *rootNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *rootNode) clone() CPTree { - return n.tree.clone() -} - -func (n *rootNode) incomplete() bool { - return len(n.fragments) > 0 -} - -func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { - root, ok := fragment.(*rootNode) - if !ok { - return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) - } - if root.incomplete() { - return fmt.Errorf("fragment is incomplete") - } - - fs, ok := n.fragments[kind] - if !ok { - return nil - 
} - for _, f := range fs { - f.tree = root.clone() - } - delete(n.fragments, kind) - - return nil -} - -type symbolNode struct { - CPRange -} - -func newSymbolNode(cp rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: cp, - To: cp, - }, - } -} - -func newRangeSymbolNode(from, to rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: from, - To: to, - }, - } -} - -func (n *symbolNode) String() string { - return fmt.Sprintf("symbol: %X..%X", n.From, n.To) -} - -func (n *symbolNode) Range() (rune, rune, bool) { - return n.From, n.To, true -} - -func (n *symbolNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *symbolNode) children() (CPTree, CPTree) { - return nil, nil -} - -func (n *symbolNode) clone() CPTree { - return newRangeSymbolNode(n.From, n.To) -} - -type concatNode struct { - left CPTree - right CPTree -} - -func newConcatNode(left, right CPTree) *concatNode { - return &concatNode{ - left: left, - right: right, - } -} - -func (n *concatNode) String() string { - return "concat" -} - -func (n *concatNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *concatNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return 
spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *concatNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *concatNode) clone() CPTree { - if n == nil { - return nil - } - return newConcatNode(n.left.clone(), n.right.clone()) -} - -type altNode struct { - left CPTree - right CPTree -} - -func newAltNode(left, right CPTree) *altNode { - return &altNode{ - left: left, - right: right, - } -} - -func (n *altNode) String() string { - return "alt" -} - -func (n *altNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *altNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *altNode) Alternatives() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *altNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *altNode) clone() CPTree { - return newAltNode(n.left.clone(), n.right.clone()) -} - -type quantifierNode struct { - optional bool - repeatable bool - tree CPTree -} - -func (n *quantifierNode) String() string { - switch { - case n.repeatable: - return "repeatable (>= 0 times)" - case n.optional: - return "optional (0 or 1 times)" - default: - return "invalid quantifier" - } -} - -func newRepeatNode(t CPTree) *quantifierNode { - return &quantifierNode{ - repeatable: true, - tree: t, - } -} - -func newRepeatOneOrMoreNode(t CPTree) *concatNode { - return newConcatNode( - t, - &quantifierNode{ - repeatable: true, - tree: t.clone(), - }) -} - -func newOptionNode(t CPTree) *quantifierNode { - return &quantifierNode{ - optional: true, - tree: t, - } -} - -func (n *quantifierNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n 
*quantifierNode) Optional() (CPTree, bool) { - return n.tree, n.optional -} - -func (n *quantifierNode) Repeatable() (CPTree, bool) { - return n.tree, n.repeatable -} - -func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *quantifierNode) children() (CPTree, CPTree) { - return n.tree, nil -} - -func (n *quantifierNode) clone() CPTree { - if n.repeatable { - return newRepeatNode(n.tree.clone()) - } - return newOptionNode(n.tree.clone()) -} - -type fragmentNode struct { - kind spec.LexKindName - tree CPTree -} - -func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { - return &fragmentNode{ - kind: kind, - tree: t, - } -} - -func (n *fragmentNode) String() string { - return fmt.Sprintf("fragment: %v", n.kind) -} - -func (n *fragmentNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *fragmentNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *fragmentNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *fragmentNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *fragmentNode) clone() CPTree { - if n.tree == nil { - return newFragmentNode(n.kind, nil) - } - return newFragmentNode(n.kind, n.tree.clone()) -} - -//nolint:unused -func printCPTree(w io.Writer, t CPTree, ruledLine string, 
childRuledLinePrefix string) { - if t == nil { - return - } - fmt.Fprintf(w, "%v%v\n", ruledLine, t) - children := []CPTree{} - switch n := t.(type) { - case *rootNode: - children = append(children, n.tree) - case *fragmentNode: - children = append(children, n.tree) - default: - left, right := t.children() - if left != nil { - children = append(children, left) - } - if right != nil { - children = append(children, right) - } - } - num := len(children) - for i, child := range children { - line := "└─ " - if num > 1 { - if i == 0 { - line = "├─ " - } else if i < num-1 { - line = "│ " - } - } - prefix := "│ " - if i >= num-1 { - prefix = " " - } - printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } -} diff --git a/src/urubu/grammar/lr0.go b/src/urubu/grammar/lr0.go deleted file mode 100644 index 92a2137..0000000 --- a/src/urubu/grammar/lr0.go +++ /dev/null @@ -1,197 +0,0 @@ -package grammar - -import ( - "fmt" - "sort" - - "urubu/grammar/symbol" -) - -type lr0Automaton struct { - initialState kernelID - states map[kernelID]*lrState -} - -func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) { - if !startSym.IsStart() { - return nil, fmt.Errorf("passed symbold is not a start symbol") - } - - automaton := &lr0Automaton{ - states: map[kernelID]*lrState{}, - } - - currentState := stateNumInitial - knownKernels := map[kernelID]struct{}{} - uncheckedKernels := []*kernel{} - - // Generate an initial kernel. 
- { - prods, _ := prods.findByLHS(startSym) - initialItem, err := newLR0Item(prods[0], 0) - if err != nil { - return nil, err - } - - k, err := newKernel([]*lrItem{initialItem}) - if err != nil { - return nil, err - } - - automaton.initialState = k.id - knownKernels[k.id] = struct{}{} - uncheckedKernels = append(uncheckedKernels, k) - } - - for len(uncheckedKernels) > 0 { - nextUncheckedKernels := []*kernel{} - for _, k := range uncheckedKernels { - state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym) - if err != nil { - return nil, err - } - state.num = currentState - currentState = currentState.next() - - automaton.states[state.id] = state - - for _, k := range neighbours { - if _, known := knownKernels[k.id]; known { - continue - } - knownKernels[k.id] = struct{}{} - nextUncheckedKernels = append(nextUncheckedKernels, k) - } - } - uncheckedKernels = nextUncheckedKernels - } - - return automaton, nil -} - -func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) { - items, err := genLR0Closure(k, prods) - if err != nil { - return nil, nil, err - } - neighbours, err := genNeighbourKernels(items, prods) - if err != nil { - return nil, nil, err - } - - next := map[symbol.Symbol]kernelID{} - kernels := []*kernel{} - for _, n := range neighbours { - next[n.symbol] = n.kernel.id - kernels = append(kernels, n.kernel) - } - - reducible := map[productionID]struct{}{} - var emptyProdItems []*lrItem - isErrorTrapper := false - for _, item := range items { - if item.dottedSymbol == errSym { - isErrorTrapper = true - } - - if item.reducible { - reducible[item.prod] = struct{}{} - - prod, ok := prods.findByID(item.prod) - if !ok { - return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod) - } - if prod.isEmpty() { - emptyProdItems = append(emptyProdItems, item) - } - } - } - - return &lrState{ - kernel: k, - next: next, - reducible: reducible, - emptyProdItems: emptyProdItems, - 
isErrorTrapper: isErrorTrapper, - }, kernels, nil -} - -func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { - items := []*lrItem{} - knownItems := map[lrItemID]struct{}{} - uncheckedItems := []*lrItem{} - for _, item := range k.items { - items = append(items, item) - uncheckedItems = append(uncheckedItems, item) - } - for len(uncheckedItems) > 0 { - nextUncheckedItems := []*lrItem{} - for _, item := range uncheckedItems { - if item.dottedSymbol.IsTerminal() { - continue - } - - ps, _ := prods.findByLHS(item.dottedSymbol) - for _, prod := range ps { - item, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if _, exist := knownItems[item.id]; exist { - continue - } - items = append(items, item) - knownItems[item.id] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, item) - } - } - uncheckedItems = nextUncheckedItems - } - - return items, nil -} - -type neighbourKernel struct { - symbol symbol.Symbol - kernel *kernel -} - -func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { - kItemMap := map[symbol.Symbol][]*lrItem{} - for _, item := range items { - if item.dottedSymbol.IsNil() { - continue - } - prod, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("a production was not found: %v", item.prod) - } - kItem, err := newLR0Item(prod, item.dot+1) - if err != nil { - return nil, err - } - kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) - } - - nextSyms := []symbol.Symbol{} - for sym := range kItemMap { - nextSyms = append(nextSyms, sym) - } - sort.Slice(nextSyms, func(i, j int) bool { - return nextSyms[i] < nextSyms[j] - }) - - kernels := []*neighbourKernel{} - for _, sym := range nextSyms { - k, err := newKernel(kItemMap[sym]) - if err != nil { - return nil, err - } - kernels = append(kernels, &neighbourKernel{ - symbol: sym, - kernel: k, - }) - } - - return kernels, nil -} diff --git a/src/urubu/grammar/parsing_table.go 
b/src/urubu/grammar/parsing_table.go deleted file mode 100644 index 48ea9fe..0000000 --- a/src/urubu/grammar/parsing_table.go +++ /dev/null @@ -1,553 +0,0 @@ -package grammar - -import ( - "fmt" - "sort" - - "urubu/grammar/symbol" - spec "urubu/spec/grammar" -) - -type ActionType string - -const ( - ActionTypeShift = ActionType("shift") - ActionTypeReduce = ActionType("reduce") - ActionTypeError = ActionType("error") -) - -type actionEntry int - -const actionEntryEmpty = actionEntry(0) - -func newShiftActionEntry(state stateNum) actionEntry { - return actionEntry(state * -1) -} - -func newReduceActionEntry(prod productionNum) actionEntry { - return actionEntry(prod) -} - -func (e actionEntry) isEmpty() bool { - return e == actionEntryEmpty -} - -func (e actionEntry) describe() (ActionType, stateNum, productionNum) { - if e == actionEntryEmpty { - return ActionTypeError, stateNumInitial, productionNumNil - } - if e < 0 { - return ActionTypeShift, stateNum(e * -1), productionNumNil - } - return ActionTypeReduce, stateNumInitial, productionNum(e) -} - -type GoToType string - -const ( - GoToTypeRegistered = GoToType("registered") - GoToTypeError = GoToType("error") -) - -type goToEntry uint - -const goToEntryEmpty = goToEntry(0) - -func newGoToEntry(state stateNum) goToEntry { - return goToEntry(state) -} - -func (e goToEntry) describe() (GoToType, stateNum) { - if e == goToEntryEmpty { - return GoToTypeError, stateNumInitial - } - return GoToTypeRegistered, stateNum(e) -} - -type conflictResolutionMethod int - -func (m conflictResolutionMethod) Int() int { - return int(m) -} - -const ( - ResolvedByPrec conflictResolutionMethod = 1 - ResolvedByAssoc conflictResolutionMethod = 2 - ResolvedByShift conflictResolutionMethod = 3 - ResolvedByProdOrder conflictResolutionMethod = 4 -) - -type conflict interface { - conflict() -} - -type shiftReduceConflict struct { - state stateNum - sym symbol.Symbol - nextState stateNum - prodNum productionNum - resolvedBy 
conflictResolutionMethod -} - -func (c *shiftReduceConflict) conflict() { -} - -type reduceReduceConflict struct { - state stateNum - sym symbol.Symbol - prodNum1 productionNum - prodNum2 productionNum - resolvedBy conflictResolutionMethod -} - -func (c *reduceReduceConflict) conflict() { -} - -var ( - _ conflict = &shiftReduceConflict{} - _ conflict = &reduceReduceConflict{} -) - -type ParsingTable struct { - actionTable []actionEntry - goToTable []goToEntry - stateCount int - terminalCount int - nonTerminalCount int - - // errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`, - // the state has an item having the following form. The `α` and `β` can be empty. - // - // A → α・error β - errorTrapperStates []int - - InitialState stateNum -} - -func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) { - pos := state.Int()*t.terminalCount + sym.Int() - return t.actionTable[pos].describe() -} - -func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) { - pos := state.Int()*t.nonTerminalCount + sym.Int() - return t.goToTable[pos].describe() -} - -func (t *ParsingTable) readAction(row int, col int) actionEntry { - return t.actionTable[row*t.terminalCount+col] -} - -func (t *ParsingTable) writeAction(row int, col int, act actionEntry) { - t.actionTable[row*t.terminalCount+col] = act -} - -func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) { - pos := state.Int()*t.nonTerminalCount + sym.Num().Int() - t.goToTable[pos] = newGoToEntry(nextState) -} - -type lrTableBuilder struct { - automaton *lr0Automaton - prods *productionSet - termCount int - nonTermCount int - symTab *symbol.SymbolTableReader - precAndAssoc *precAndAssoc - - conflicts []conflict -} - -func (b *lrTableBuilder) build() (*ParsingTable, error) { - var ptab *ParsingTable - { - initialState := b.automaton.states[b.automaton.initialState] - ptab = 
&ParsingTable{ - actionTable: make([]actionEntry, len(b.automaton.states)*b.termCount), - goToTable: make([]goToEntry, len(b.automaton.states)*b.nonTermCount), - stateCount: len(b.automaton.states), - terminalCount: b.termCount, - nonTerminalCount: b.nonTermCount, - errorTrapperStates: make([]int, len(b.automaton.states)), - InitialState: initialState.num, - } - } - - for _, state := range b.automaton.states { - if state.isErrorTrapper { - ptab.errorTrapperStates[state.num] = 1 - } - - for sym, kID := range state.next { - nextState := b.automaton.states[kID] - if sym.IsTerminal() { - b.writeShiftAction(ptab, state.num, sym, nextState.num) - } else { - ptab.writeGoTo(state.num, sym, nextState.num) - } - } - - for prodID := range state.reducible { - reducibleProd, ok := b.prods.findByID(prodID) - if !ok { - return nil, fmt.Errorf("reducible production not found: %v", prodID) - } - - var reducibleItem *lrItem - for _, item := range state.items { - if item.prod != reducibleProd.id { - continue - } - - reducibleItem = item - break - } - if reducibleItem == nil { - for _, item := range state.emptyProdItems { - if item.prod != reducibleProd.id { - continue - } - - reducibleItem = item - break - } - if reducibleItem == nil { - return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num) - } - } - - for a := range reducibleItem.lookAhead.symbols { - b.writeReduceAction(ptab, state.num, a, reducibleProd.num) - } - } - } - - return ptab, nil -} - -// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred, -// we prioritize the shift action. 
-func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) { - act := tab.readAction(state.Int(), sym.Num().Int()) - if !act.isEmpty() { - ty, _, p := act.describe() - if ty == ActionTypeReduce { - act, method := b.resolveSRConflict(sym.Num(), p) - b.conflicts = append(b.conflicts, &shiftReduceConflict{ - state: state, - sym: sym, - nextState: nextState, - prodNum: p, - resolvedBy: method, - }) - if act == ActionTypeShift { - tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) - } - return - } - } - tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) -} - -// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred, -// we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces -// the production with higher priority. Productions defined earlier in the grammar file have a higher priority. -func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) { - act := tab.readAction(state.Int(), sym.Num().Int()) - if !act.isEmpty() { - ty, s, p := act.describe() - switch ty { - case ActionTypeReduce: - if p == prod { - return - } - - b.conflicts = append(b.conflicts, &reduceReduceConflict{ - state: state, - sym: sym, - prodNum1: p, - prodNum2: prod, - resolvedBy: ResolvedByProdOrder, - }) - if p < prod { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p)) - } else { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) - } - case ActionTypeShift: - act, method := b.resolveSRConflict(sym.Num(), prod) - b.conflicts = append(b.conflicts, &shiftReduceConflict{ - state: state, - sym: sym, - nextState: s, - prodNum: prod, - resolvedBy: method, - }) - if act == ActionTypeReduce { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) - } - } - return - } - 
tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) -} - -func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) { - symPrec := b.precAndAssoc.terminalPrecedence(sym) - prodPrec := b.precAndAssoc.productionPredence(prod) - if symPrec == 0 || prodPrec == 0 { - return ActionTypeShift, ResolvedByShift - } - if symPrec == prodPrec { - assoc := b.precAndAssoc.productionAssociativity(prod) - if assoc != assocTypeLeft { - return ActionTypeShift, ResolvedByAssoc - } - return ActionTypeReduce, ResolvedByAssoc - } - if symPrec < prodPrec { - return ActionTypeShift, ResolvedByPrec - } - return ActionTypeReduce, ResolvedByPrec -} - -func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) { - var terms []*spec.Terminal - { - termSyms := b.symTab.TerminalSymbols() - terms = make([]*spec.Terminal, len(termSyms)+1) - - for _, sym := range termSyms { - name, ok := b.symTab.ToText(sym) - if !ok { - return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym) - } - - term := &spec.Terminal{ - Number: sym.Num().Int(), - Name: name, - } - - prec := b.precAndAssoc.terminalPrecedence(sym.Num()) - if prec != precNil { - term.Precedence = prec - } - - assoc := b.precAndAssoc.terminalAssociativity(sym.Num()) - switch assoc { - case assocTypeLeft: - term.Associativity = "l" - case assocTypeRight: - term.Associativity = "r" - } - - terms[sym.Num()] = term - } - } - - var nonTerms []*spec.NonTerminal - { - nonTermSyms := b.symTab.NonTerminalSymbols() - nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1) - for _, sym := range nonTermSyms { - name, ok := b.symTab.ToText(sym) - if !ok { - return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym) - } - - nonTerms[sym.Num()] = &spec.NonTerminal{ - Number: sym.Num().Int(), - Name: name, - } - } - } - - var prods []*spec.Production - { - ps := 
gram.productionSet.getAllProductions() - prods = make([]*spec.Production, len(ps)+1) - for _, p := range ps { - rhs := make([]int, len(p.rhs)) - for i, e := range p.rhs { - if e.IsTerminal() { - rhs[i] = e.Num().Int() - } else { - rhs[i] = e.Num().Int() * -1 - } - } - - prod := &spec.Production{ - Number: p.num.Int(), - LHS: p.lhs.Num().Int(), - RHS: rhs, - } - - prec := b.precAndAssoc.productionPredence(p.num) - if prec != precNil { - prod.Precedence = prec - } - - assoc := b.precAndAssoc.productionAssociativity(p.num) - switch assoc { - case assocTypeLeft: - prod.Associativity = "l" - case assocTypeRight: - prod.Associativity = "r" - } - - prods[p.num.Int()] = prod - } - } - - var states []*spec.State - { - srConflicts := map[stateNum][]*shiftReduceConflict{} - rrConflicts := map[stateNum][]*reduceReduceConflict{} - for _, con := range b.conflicts { - switch c := con.(type) { - case *shiftReduceConflict: - srConflicts[c.state] = append(srConflicts[c.state], c) - case *reduceReduceConflict: - rrConflicts[c.state] = append(rrConflicts[c.state], c) - } - } - - states = make([]*spec.State, len(b.automaton.states)) - for _, s := range b.automaton.states { - kernel := make([]*spec.Item, len(s.items)) - for i, item := range s.items { - p, ok := b.prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod) - } - - kernel[i] = &spec.Item{ - Production: p.num.Int(), - Dot: item.dot, - } - } - - sort.Slice(kernel, func(i, j int) bool { - if kernel[i].Production < kernel[j].Production { - return true - } - if kernel[i].Production > kernel[j].Production { - return false - } - return kernel[i].Dot < kernel[j].Dot - }) - - var shift []*spec.Transition - var reduce []*spec.Reduce - var goTo []*spec.Transition - { - TERMINALS_LOOP: - for _, t := range b.symTab.TerminalSymbols() { - act, next, prod := tab.getAction(s.num, t.Num()) - switch act { - case ActionTypeShift: - shift = append(shift, 
&spec.Transition{ - Symbol: t.Num().Int(), - State: next.Int(), - }) - case ActionTypeReduce: - for _, r := range reduce { - if r.Production == prod.Int() { - r.LookAhead = append(r.LookAhead, t.Num().Int()) - continue TERMINALS_LOOP - } - } - reduce = append(reduce, &spec.Reduce{ - LookAhead: []int{t.Num().Int()}, - Production: prod.Int(), - }) - } - } - - for _, n := range b.symTab.NonTerminalSymbols() { - ty, next := tab.getGoTo(s.num, n.Num()) - if ty == GoToTypeRegistered { - goTo = append(goTo, &spec.Transition{ - Symbol: n.Num().Int(), - State: next.Int(), - }) - } - } - - sort.Slice(shift, func(i, j int) bool { - return shift[i].State < shift[j].State - }) - sort.Slice(reduce, func(i, j int) bool { - return reduce[i].Production < reduce[j].Production - }) - sort.Slice(goTo, func(i, j int) bool { - return goTo[i].State < goTo[j].State - }) - } - - sr := []*spec.SRConflict{} - rr := []*spec.RRConflict{} - { - for _, c := range srConflicts[s.num] { - conflict := &spec.SRConflict{ - Symbol: c.sym.Num().Int(), - State: c.nextState.Int(), - Production: c.prodNum.Int(), - ResolvedBy: c.resolvedBy.Int(), - } - - ty, s, p := tab.getAction(s.num, c.sym.Num()) - switch ty { - case ActionTypeShift: - n := s.Int() - conflict.AdoptedState = &n - case ActionTypeReduce: - n := p.Int() - conflict.AdoptedProduction = &n - } - - sr = append(sr, conflict) - } - - sort.Slice(sr, func(i, j int) bool { - return sr[i].Symbol < sr[j].Symbol - }) - - for _, c := range rrConflicts[s.num] { - conflict := &spec.RRConflict{ - Symbol: c.sym.Num().Int(), - Production1: c.prodNum1.Int(), - Production2: c.prodNum2.Int(), - ResolvedBy: c.resolvedBy.Int(), - } - - _, _, p := tab.getAction(s.num, c.sym.Num()) - conflict.AdoptedProduction = p.Int() - - rr = append(rr, conflict) - } - - sort.Slice(rr, func(i, j int) bool { - return rr[i].Symbol < rr[j].Symbol - }) - } - - states[s.num.Int()] = &spec.State{ - Number: s.num.Int(), - Kernel: kernel, - Shift: shift, - Reduce: reduce, - GoTo: goTo, - 
SRConflict: sr, - RRConflict: rr, - } - } - } - - return &spec.Report{ - Terminals: terms, - NonTerminals: nonTerms, - Productions: prods, - States: states, - }, nil -} diff --git a/src/urubu/grammar/production.go b/src/urubu/grammar/production.go deleted file mode 100644 index 8f6c103..0000000 --- a/src/urubu/grammar/production.go +++ /dev/null @@ -1,117 +0,0 @@ -package grammar - -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - - "urubu/grammar/symbol" -) - -type productionID [32]byte - -func (id productionID) String() string { - return hex.EncodeToString(id[:]) -} - -func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { - seq := lhs.Byte() - for _, sym := range rhs { - seq = append(seq, sym.Byte()...) - } - return productionID(sha256.Sum256(seq)) -} - -type productionNum uint16 - -const ( - productionNumNil = productionNum(0) - productionNumStart = productionNum(1) - productionNumMin = productionNum(2) -) - -func (n productionNum) Int() int { - return int(n) -} - -type production struct { - id productionID - num productionNum - lhs symbol.Symbol - rhs []symbol.Symbol - rhsLen int -} - -func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { - if lhs.IsNil() { - return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) - } - for _, sym := range rhs { - if sym.IsNil() { - return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) - } - } - - return &production{ - id: genProductionID(lhs, rhs), - lhs: lhs, - rhs: rhs, - rhsLen: len(rhs), - }, nil -} - -func (p *production) isEmpty() bool { - return p.rhsLen == 0 -} - -type productionSet struct { - lhs2Prods map[symbol.Symbol][]*production - id2Prod map[productionID]*production - num productionNum -} - -func newProductionSet() *productionSet { - return &productionSet{ - lhs2Prods: map[symbol.Symbol][]*production{}, - id2Prod: map[productionID]*production{}, - num: productionNumMin, - } -} - -func (ps 
*productionSet) append(prod *production) { - if _, ok := ps.id2Prod[prod.id]; ok { - return - } - - if prod.lhs.IsStart() { - prod.num = productionNumStart - } else { - prod.num = ps.num - ps.num++ - } - - if prods, ok := ps.lhs2Prods[prod.lhs]; ok { - ps.lhs2Prods[prod.lhs] = append(prods, prod) - } else { - ps.lhs2Prods[prod.lhs] = []*production{prod} - } - ps.id2Prod[prod.id] = prod -} - -func (ps *productionSet) findByID(id productionID) (*production, bool) { - prod, ok := ps.id2Prod[id] - return prod, ok -} - -func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { - if lhs.IsNil() { - return nil, false - } - - prods, ok := ps.lhs2Prods[lhs] - return prods, ok -} - -func (ps *productionSet) getAllProductions() map[productionID]*production { - return ps.id2Prod -} diff --git a/src/urubu/grammar/semantic_error.go b/src/urubu/grammar/semantic_error.go deleted file mode 100644 index 88a6b17..0000000 --- a/src/urubu/grammar/semantic_error.go +++ /dev/null @@ -1,30 +0,0 @@ -package grammar - -import "errors" - -var ( - semErrNoGrammarName = errors.New("name is missing") - semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. 
please use the same spelling") - semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol") - semErrUndefinedPrec = errors.New("symbol must has precedence") - semErrUndefinedOrdSym = errors.New("undefined ordered symbol") - semErrUnusedProduction = errors.New("unused production") - semErrUnusedTerminal = errors.New("unused terminal") - semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped") - semErrNoProduction = errors.New("a grammar needs at least one production") - semErrUndefinedSym = errors.New("undefined symbol") - semErrDuplicateProduction = errors.New("duplicate production") - semErrDuplicateTerminal = errors.New("duplicate terminal") - semErrDuplicateFragment = errors.New("duplicate fragment") - semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals") - semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol") - semErrDuplicateLabel = errors.New("a label must be unique in an alternative") - semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols") - semErrDirInvalidName = errors.New("invalid directive name") - semErrDirInvalidParam = errors.New("invalid parameter") - semErrDuplicateDir = errors.New("a directive must not be duplicated") - semErrDuplicateElem = errors.New("duplicate element") - semErrAmbiguousElem = errors.New("ambiguous element") - semErrInvalidProdDir = errors.New("invalid production directive") - semErrInvalidAltDir = errors.New("invalid alternative directive") -) diff --git a/src/urubu/grammar/symbol/symbol.go b/src/urubu/grammar/symbol.go index f9e6a93..f9e6a93 100644 --- a/src/urubu/grammar/symbol/symbol.go +++ b/src/urubu/grammar/symbol.go diff --git a/src/urubu/spec/grammar/grammar.go b/src/urubu/spec/grammar.go index bf1ea89..c2708d8 100644 --- a/src/urubu/spec/grammar/grammar.go +++ b/src/urubu/spec/grammar.go @@ -1,6 +1,79 @@ 
package grammar -import "strconv" +import ( + "strconv" + "strings" +) + +type Terminal struct { + Number int `json:"number"` + Name string `json:"name"` + Pattern string `json:"pattern"` + Precedence int `json:"prec"` + Associativity string `json:"assoc"` +} + +type NonTerminal struct { + Number int `json:"number"` + Name string `json:"name"` +} + +type Production struct { + Number int `json:"number"` + LHS int `json:"lhs"` + RHS []int `json:"rhs"` + Precedence int `json:"prec"` + Associativity string `json:"assoc"` +} + +type Item struct { + Production int `json:"production"` + Dot int `json:"dot"` +} + +type Transition struct { + Symbol int `json:"symbol"` + State int `json:"state"` +} + +type Reduce struct { + LookAhead []int `json:"look_ahead"` + Production int `json:"production"` +} + +type SRConflict struct { + Symbol int `json:"symbol"` + State int `json:"state"` + Production int `json:"production"` + AdoptedState *int `json:"adopted_state"` + AdoptedProduction *int `json:"adopted_production"` + ResolvedBy int `json:"resolved_by"` +} + +type RRConflict struct { + Symbol int `json:"symbol"` + Production1 int `json:"production_1"` + Production2 int `json:"production_2"` + AdoptedProduction int `json:"adopted_production"` + ResolvedBy int `json:"resolved_by"` +} + +type State struct { + Number int `json:"number"` + Kernel []*Item `json:"kernel"` + Shift []*Transition `json:"shift"` + Reduce []*Reduce `json:"reduce"` + GoTo []*Transition `json:"goto"` + SRConflict []*SRConflict `json:"sr_conflict"` + RRConflict []*RRConflict `json:"rr_conflict"` +} + +type Report struct { + Terminals []*Terminal `json:"terminals"` + NonTerminals []*NonTerminal `json:"non_terminals"` + Productions []*Production `json:"productions"` + States []*State `json:"states"` +} type CompiledGrammar struct { Name string `json:"name"` @@ -158,3 +231,21 @@ type SyntacticSpec struct { type ASTAction struct { Entries [][]int `json:"entries"` } + +var rep = strings.NewReplacer( + `.`, `\.`, + 
`*`, `\*`, + `+`, `\+`, + `?`, `\?`, + `|`, `\|`, + `(`, `\(`, + `)`, `\)`, + `[`, `\[`, + `\`, `\\`, +) + +// EscapePattern escapes the special characters. +// For example, EscapePattern(`+`) returns `\+`. +func EscapePattern(s string) string { + return rep.Replace(s) +} diff --git a/src/urubu/spec/grammar/parser/clexspec.json b/src/urubu/spec/grammar/clexspec.json index d0ed3d3..d0ed3d3 100644 --- a/src/urubu/spec/grammar/parser/clexspec.json +++ b/src/urubu/spec/grammar/clexspec.json diff --git a/src/urubu/spec/grammar/description.go b/src/urubu/spec/grammar/description.go deleted file mode 100644 index 0d2a0b7..0000000 --- a/src/urubu/spec/grammar/description.go +++ /dev/null @@ -1,71 +0,0 @@ -package grammar - -type Terminal struct { - Number int `json:"number"` - Name string `json:"name"` - Pattern string `json:"pattern"` - Precedence int `json:"prec"` - Associativity string `json:"assoc"` -} - -type NonTerminal struct { - Number int `json:"number"` - Name string `json:"name"` -} - -type Production struct { - Number int `json:"number"` - LHS int `json:"lhs"` - RHS []int `json:"rhs"` - Precedence int `json:"prec"` - Associativity string `json:"assoc"` -} - -type Item struct { - Production int `json:"production"` - Dot int `json:"dot"` -} - -type Transition struct { - Symbol int `json:"symbol"` - State int `json:"state"` -} - -type Reduce struct { - LookAhead []int `json:"look_ahead"` - Production int `json:"production"` -} - -type SRConflict struct { - Symbol int `json:"symbol"` - State int `json:"state"` - Production int `json:"production"` - AdoptedState *int `json:"adopted_state"` - AdoptedProduction *int `json:"adopted_production"` - ResolvedBy int `json:"resolved_by"` -} - -type RRConflict struct { - Symbol int `json:"symbol"` - Production1 int `json:"production_1"` - Production2 int `json:"production_2"` - AdoptedProduction int `json:"adopted_production"` - ResolvedBy int `json:"resolved_by"` -} - -type State struct { - Number int `json:"number"` - 
Kernel []*Item `json:"kernel"` - Shift []*Transition `json:"shift"` - Reduce []*Reduce `json:"reduce"` - GoTo []*Transition `json:"goto"` - SRConflict []*SRConflict `json:"sr_conflict"` - RRConflict []*RRConflict `json:"rr_conflict"` -} - -type Report struct { - Terminals []*Terminal `json:"terminals"` - NonTerminals []*NonTerminal `json:"non_terminals"` - Productions []*Production `json:"productions"` - States []*State `json:"states"` -} diff --git a/src/urubu/spec/grammar/parser/lexspec.json b/src/urubu/spec/grammar/lexspec.json index caf1f0e..caf1f0e 100644 --- a/src/urubu/spec/grammar/parser/lexspec.json +++ b/src/urubu/spec/grammar/lexspec.json diff --git a/src/urubu/spec/grammar/parser/vartan_lexer.go b/src/urubu/spec/grammar/parser.go index 76ddfde..0e5a16b 100644 --- a/src/urubu/spec/grammar/parser/vartan_lexer.go +++ b/src/urubu/spec/grammar/parser.go @@ -1,11 +1,920 @@ -// Code generated by maleeni-go. DO NOT EDIT. +//go:generate maleeni compile lexspec.json -o clexspec.json +//go:generate maleeni-go clexspec.json --package parser + package parser import ( + _ "embed" "fmt" "io" "io/ioutil" + "regexp" + "strings" + + verr "urubu/error" + spec "urubu/spec/grammar" +) + +type tokenKind string + +const ( + tokenKindKWFragment = tokenKind("fragment") + tokenKindID = tokenKind("id") + tokenKindTerminalPattern = tokenKind("terminal pattern") + tokenKindStringLiteral = tokenKind("string") + tokenKindColon = tokenKind(":") + tokenKindOr = tokenKind("|") + tokenKindSemicolon = tokenKind(";") + tokenKindLabelMarker = tokenKind("@") + tokenKindDirectiveMarker = tokenKind("#") + tokenKindExpantion = tokenKind("...") + tokenKindOrderedSymbolMarker = tokenKind("$") + tokenKindLParen = tokenKind("(") + tokenKindRParen = tokenKind(")") + tokenKindNewline = tokenKind("newline") + tokenKindEOF = tokenKind("eof") + tokenKindInvalid = tokenKind("invalid") +) + +var ( + reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`) + reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`) +) + 
+type Position struct { + Row int + Col int +} + +func newPosition(row, col int) Position { + return Position{ + Row: row, + Col: col, + } +} + +type token struct { + kind tokenKind + text string + pos Position +} + +func newSymbolToken(kind tokenKind, pos Position) *token { + return &token{ + kind: kind, + pos: pos, + } +} + +func newIDToken(text string, pos Position) *token { + return &token{ + kind: tokenKindID, + text: text, + pos: pos, + } +} + +func newTerminalPatternToken(text string, pos Position) *token { + return &token{ + kind: tokenKindTerminalPattern, + text: text, + pos: pos, + } +} + +func newStringLiteralToken(text string, pos Position) *token { + return &token{ + kind: tokenKindStringLiteral, + text: text, + pos: pos, + } +} + +func newEOFToken() *token { + return &token{ + kind: tokenKindEOF, + } +} + +func newInvalidToken(text string, pos Position) *token { + return &token{ + kind: tokenKindInvalid, + text: text, + pos: pos, + } +} + +type lexer struct { + d *Lexer + buf *token +} + +func newLexer(src io.Reader) (*lexer, error) { + d, err := NewLexer(NewLexSpec(), src) + if err != nil { + return nil, err + } + return &lexer{ + d: d, + }, nil +} + +func (l *lexer) next() (*token, error) { + if l.buf != nil { + tok := l.buf + l.buf = nil + return tok, nil + } + + var newline *token + for { + tok, err := l.lexAndSkipWSs() + if err != nil { + return nil, err + } + if tok.kind == tokenKindNewline { + newline = tok + continue + } + + if newline != nil { + l.buf = tok + return newline, nil + } + return tok, nil + } +} + +func (l *lexer) lexAndSkipWSs() (*token, error) { + var tok *Token + for { + var err error + tok, err = l.d.Next() + if err != nil { + return nil, err + } + if tok.Invalid { + return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + } + if tok.EOF { + return newEOFToken(), nil + } + switch tok.KindID { + case KindIDWhiteSpace: + continue + case KindIDLineComment: + continue + } + + break + } + + switch 
tok.KindID { + case KindIDNewline: + return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDKwFragment: + return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDIdentifier: + if !reIDChar.Match(tok.Lexeme) { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidChar, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidUnderscorePos, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if strings.Contains(string(tok.Lexeme), "__") { + return nil, &verr.SpecError{ + Cause: synErrIDConsecutiveUnderscores, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if reIDInvalidDigitsPos.Match(tok.Lexeme) { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidDigitsPos, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDTerminalOpen: + var b strings.Builder + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.EOF { + return nil, &verr.SpecError{ + Cause: synErrUnclosedTerminal, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + switch tok.KindID { + case KindIDPattern: + // The escape sequences in a pattern string are interpreted by the lexer, except for the \". + // We must interpret the \" before passing them to the lexer because they are delimiters for + // the pattern strings. 
+ fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`)) + case KindIDEscapeSymbol: + return nil, &verr.SpecError{ + Cause: synErrIncompletedEscSeq, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + case KindIDTerminalClose: + pat := b.String() + if pat == "" { + return nil, &verr.SpecError{ + Cause: synErrEmptyPattern, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil + } + } + case KindIDStringLiteralOpen: + var b strings.Builder + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.EOF { + return nil, &verr.SpecError{ + Cause: synErrUnclosedString, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + switch tok.KindID { + case KindIDCharSeq: + fmt.Fprint(&b, string(tok.Lexeme)) + case KindIDStringLiteralClose: + str := b.String() + if str == "" { + return nil, &verr.SpecError{ + Cause: synErrEmptyString, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil + } + } + case KindIDColon: + return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDOr: + return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDSemicolon: + return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDLabelMarker: + return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDDirectiveMarker: + return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDExpansion: + return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDOrderedSymbolMarker: + return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDLParen: + return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDRParen: + return newSymbolToken(tokenKindRParen, 
newPosition(tok.Row+1, tok.Col+1)), nil + default: + return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + } +} + +type RootNode struct { + Directives []*DirectiveNode + Productions []*ProductionNode + LexProductions []*ProductionNode + Fragments []*FragmentNode +} + +type ProductionNode struct { + Directives []*DirectiveNode + LHS string + RHS []*AlternativeNode + Pos Position +} + +func (n *ProductionNode) isLexical() bool { + if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" { + return true + } + return false +} + +type AlternativeNode struct { + Elements []*ElementNode + Directives []*DirectiveNode + Pos Position +} + +type ElementNode struct { + ID string + Pattern string + Label *LabelNode + Literally bool + Pos Position +} + +type LabelNode struct { + Name string + Pos Position +} + +type DirectiveNode struct { + Name string + Parameters []*ParameterNode + Pos Position +} + +type ParameterNode struct { + ID string + Pattern string + String string + OrderedSymbol string + Group []*DirectiveNode + Expansion bool + Pos Position +} + +type FragmentNode struct { + LHS string + RHS string + Pos Position +} + +func raiseSyntaxError(row int, synErr *SyntaxError) { + panic(&verr.SpecError{ + Cause: synErr, + Row: row, + }) +} + +func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) { + panic(&verr.SpecError{ + Cause: synErr, + Detail: detail, + Row: row, + }) +} + +func Parse(src io.Reader) (*RootNode, error) { + p, err := newParser(src) + if err != nil { + return nil, err + } + + return p.parse() +} + +type parser struct { + lex *lexer + peekedTok *token + lastTok *token + errs verr.SpecErrors + + // A token position that the parser read at last. + // It is used as additional information in error messages. 
+ pos Position +} + +func newParser(src io.Reader) (*parser, error) { + lex, err := newLexer(src) + if err != nil { + return nil, err + } + return &parser{ + lex: lex, + }, nil +} + +func (p *parser) parse() (root *RootNode, retErr error) { + root = p.parseRoot() + if len(p.errs) > 0 { + return nil, p.errs + } + + return root, nil +} + +func (p *parser) parseRoot() *RootNode { + defer func() { + err := recover() + if err != nil { + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(fmt.Errorf("an unexpected error occurred: %v", err)) + } + p.errs = append(p.errs, specErr) + } + }() + + var dirs []*DirectiveNode + var prods []*ProductionNode + var lexProds []*ProductionNode + var fragments []*FragmentNode + for { + dir := p.parseTopLevelDirective() + if dir != nil { + dirs = append(dirs, dir) + continue + } + + fragment := p.parseFragment() + if fragment != nil { + fragments = append(fragments, fragment) + continue + } + + prod := p.parseProduction() + if prod != nil { + if prod.isLexical() { + lexProds = append(lexProds, prod) + } else { + prods = append(prods, prod) + } + continue + } + + if p.consume(tokenKindEOF) { + break + } + } + + return &RootNode{ + Directives: dirs, + Productions: prods, + LexProductions: lexProds, + Fragments: fragments, + } +} + +func (p *parser) parseTopLevelDirective() *DirectiveNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) + p.skipOverTo(tokenKindSemicolon) + }() + + dir := p.parseDirective() + if dir == nil { + return nil + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon) + } + + return dir +} + +func (p *parser) parseFragment() *FragmentNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) 
+ p.skipOverTo(tokenKindSemicolon) + }() + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindKWFragment) { + return nil + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoProductionName) + } + lhs := p.lastTok.text + lhsPos := p.lastTok.pos + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindColon) { + raiseSyntaxError(p.pos.Row, synErrNoColon) + } + + var rhs string + switch { + case p.consume(tokenKindTerminalPattern): + rhs = p.lastTok.text + case p.consume(tokenKindStringLiteral): + rhs = spec.EscapePattern(p.lastTok.text) + default: + raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern) + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrNoSemicolon) + } + + if !p.consume(tokenKindNewline) { + if !p.consume(tokenKindEOF) { + raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) + } + } + + return &FragmentNode{ + LHS: lhs, + RHS: rhs, + Pos: lhsPos, + } +} + +func (p *parser) parseProduction() *ProductionNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) + p.skipOverTo(tokenKindSemicolon) + }() + + p.consume(tokenKindNewline) + + if p.consume(tokenKindEOF) { + return nil + } + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoProductionName) + } + lhs := p.lastTok.text + lhsPos := p.lastTok.pos + + var dirs []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + dirs = append(dirs, dir) + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindColon) { + raiseSyntaxError(p.pos.Row, synErrNoColon) + } + + alt := p.parseAlternative() + rhs := []*AlternativeNode{alt} + for { + p.consume(tokenKindNewline) + + if !p.consume(tokenKindOr) { + break + } + alt := p.parseAlternative() + rhs = append(rhs, alt) + } + + p.consume(tokenKindNewline) + + if 
!p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrNoSemicolon) + } + + if !p.consume(tokenKindNewline) { + if !p.consume(tokenKindEOF) { + raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) + } + } + + prod := &ProductionNode{ + Directives: dirs, + LHS: lhs, + RHS: rhs, + Pos: lhsPos, + } + + // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs. + // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate + // name to the pattern. Therefore, this code prohibits alternatives from containing patterns. + if !prod.isLexical() { + for _, alt := range prod.RHS { + for _, elem := range alt.Elements { + if elem.Pattern != "" { + raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt) + } + } + } + } + + return prod +} + +func (p *parser) parseAlternative() *AlternativeNode { + elems := []*ElementNode{} + for { + elem := p.parseElement() + if elem == nil { + break + } + elems = append(elems, elem) + } + + // When a length of an alternative is zero, we cannot set a position. 
+ var firstElemPos Position + if len(elems) > 0 { + firstElemPos = elems[0].Pos + } + + var dirs []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + dirs = append(dirs, dir) + } + + return &AlternativeNode{ + Elements: elems, + Directives: dirs, + Pos: firstElemPos, + } +} + +func (p *parser) parseElement() *ElementNode { + var elem *ElementNode + switch { + case p.consume(tokenKindID): + elem = &ElementNode{ + ID: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindTerminalPattern): + elem = &ElementNode{ + Pattern: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindStringLiteral): + elem = &ElementNode{ + Pattern: p.lastTok.text, + Literally: true, + Pos: p.lastTok.pos, + } + default: + if p.consume(tokenKindLabelMarker) { + raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol) + } + return nil + } + if p.consume(tokenKindLabelMarker) { + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoLabel) + } + elem.Label = &LabelNode{ + Name: p.lastTok.text, + Pos: p.lastTok.pos, + } + } + return elem +} + +func (p *parser) parseDirective() *DirectiveNode { + p.consume(tokenKindNewline) + + if !p.consume(tokenKindDirectiveMarker) { + return nil + } + dirPos := p.lastTok.pos + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoDirectiveName) + } + name := p.lastTok.text + + var params []*ParameterNode + for { + param := p.parseParameter() + if param == nil { + break + } + params = append(params, param) + } + + return &DirectiveNode{ + Name: name, + Parameters: params, + Pos: dirPos, + } +} + +func (p *parser) parseParameter() *ParameterNode { + var param *ParameterNode + switch { + case p.consume(tokenKindID): + param = &ParameterNode{ + ID: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindTerminalPattern): + param = &ParameterNode{ + Pattern: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindStringLiteral): + param = &ParameterNode{ + 
String: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindOrderedSymbolMarker): + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName) + } + param = &ParameterNode{ + OrderedSymbol: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindLParen): + pos := p.lastTok.pos + var g []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + g = append(g, dir) + } + if !p.consume(tokenKindRParen) { + raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup) + } + if len(g) == 0 { + // Set an empty slice representing an empty directive group to distinguish between the following two cases. + // + // - #prec (); // vartan allows this case. + // - #prec; // This case will raise an error. + g = []*DirectiveNode{} + } + param = &ParameterNode{ + Group: g, + Pos: pos, + } + } + if p.consume(tokenKindExpantion) { + switch { + case param == nil: + raiseSyntaxError(p.pos.Row, synErrStrayExpOp) + case param.ID == "": + raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand) + } + param.Expansion = true + } + return param +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + panic(err) + } + } + p.pos = tok.pos + if tok.kind == tokenKindInvalid { + raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text) + } + if tok.kind == expected { + p.lastTok = tok + return true + } + p.peekedTok = tok + + return false +} + +func (p *parser) skip() { + var tok *token + var err error + for { + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + p.errs = append(p.errs, &verr.SpecError{ + Cause: err, + Row: p.pos.Row, + }) + continue + } + } + + break + } + + p.lastTok = tok + p.pos = tok.pos +} + +func (p *parser) skipOverTo(kind tokenKind) { + for { + if p.consume(kind) || 
p.consume(tokenKindEOF) { + return + } + p.skip() + } +} + +type SyntaxError struct { + message string +} + +func newSyntaxError(message string) *SyntaxError { + return &SyntaxError{ + message: message, + } +} + +func (e *SyntaxError) Error() string { + return e.message +} + +var ( + // lexical errors + synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore") + synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier") + synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively") + synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier") + synErrUnclosedTerminal = newSyntaxError("unclosed terminal") + synErrUnclosedString = newSyntaxError("unclosed string") + synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash") + synErrEmptyPattern = newSyntaxError("a pattern must include at least one character") + synErrEmptyString = newSyntaxError("a string must include at least one character") + + // syntax errors + synErrInvalidToken = newSyntaxError("invalid token") + synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;") + synErrNoProductionName = newSyntaxError("a production name is missing") + synErrNoColon = newSyntaxError("the colon must precede alternatives") + synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative") + synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol") + synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after the label marker @") + synErrNoDirectiveName = newSyntaxError("a directive needs a name") + synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing") + synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )") 
+ synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal") + synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier") + synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier") + synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline") + synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element") ) +// Code generated by maleeni-go. DO NOT EDIT. type ModeID int diff --git a/src/urubu/spec/grammar/parser/lexer.go b/src/urubu/spec/grammar/parser/lexer.go deleted file mode 100644 index bd8a24f..0000000 --- a/src/urubu/spec/grammar/parser/lexer.go +++ /dev/null @@ -1,297 +0,0 @@ -//go:generate maleeni compile lexspec.json -o clexspec.json -//go:generate maleeni-go clexspec.json --package parser - -package parser - -import ( - _ "embed" - "fmt" - "io" - "regexp" - "strings" - - verr "urubu/error" -) - -type tokenKind string - -const ( - tokenKindKWFragment = tokenKind("fragment") - tokenKindID = tokenKind("id") - tokenKindTerminalPattern = tokenKind("terminal pattern") - tokenKindStringLiteral = tokenKind("string") - tokenKindColon = tokenKind(":") - tokenKindOr = tokenKind("|") - tokenKindSemicolon = tokenKind(";") - tokenKindLabelMarker = tokenKind("@") - tokenKindDirectiveMarker = tokenKind("#") - tokenKindExpantion = tokenKind("...") - tokenKindOrderedSymbolMarker = tokenKind("$") - tokenKindLParen = tokenKind("(") - tokenKindRParen = tokenKind(")") - tokenKindNewline = tokenKind("newline") - tokenKindEOF = tokenKind("eof") - tokenKindInvalid = tokenKind("invalid") -) - -var ( - reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`) - reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`) -) - -type Position struct { - Row int - Col int -} - -func newPosition(row, col int) Position { - return Position{ - Row: row, - Col: col, - } -} - 
-type token struct { - kind tokenKind - text string - pos Position -} - -func newSymbolToken(kind tokenKind, pos Position) *token { - return &token{ - kind: kind, - pos: pos, - } -} - -func newIDToken(text string, pos Position) *token { - return &token{ - kind: tokenKindID, - text: text, - pos: pos, - } -} - -func newTerminalPatternToken(text string, pos Position) *token { - return &token{ - kind: tokenKindTerminalPattern, - text: text, - pos: pos, - } -} - -func newStringLiteralToken(text string, pos Position) *token { - return &token{ - kind: tokenKindStringLiteral, - text: text, - pos: pos, - } -} - -func newEOFToken() *token { - return &token{ - kind: tokenKindEOF, - } -} - -func newInvalidToken(text string, pos Position) *token { - return &token{ - kind: tokenKindInvalid, - text: text, - pos: pos, - } -} - -type lexer struct { - d *Lexer - buf *token -} - -func newLexer(src io.Reader) (*lexer, error) { - d, err := NewLexer(NewLexSpec(), src) - if err != nil { - return nil, err - } - return &lexer{ - d: d, - }, nil -} - -func (l *lexer) next() (*token, error) { - if l.buf != nil { - tok := l.buf - l.buf = nil - return tok, nil - } - - var newline *token - for { - tok, err := l.lexAndSkipWSs() - if err != nil { - return nil, err - } - if tok.kind == tokenKindNewline { - newline = tok - continue - } - - if newline != nil { - l.buf = tok - return newline, nil - } - return tok, nil - } -} - -func (l *lexer) lexAndSkipWSs() (*token, error) { - var tok *Token - for { - var err error - tok, err = l.d.Next() - if err != nil { - return nil, err - } - if tok.Invalid { - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } - if tok.EOF { - return newEOFToken(), nil - } - switch tok.KindID { - case KindIDWhiteSpace: - continue - case KindIDLineComment: - continue - } - - break - } - - switch tok.KindID { - case KindIDNewline: - return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDKwFragment: - return 
newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDIdentifier: - if !reIDChar.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidChar, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidUnderscorePos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.Contains(string(tok.Lexeme), "__") { - return nil, &verr.SpecError{ - Cause: synErrIDConsecutiveUnderscores, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if reIDInvalidDigitsPos.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidDigitsPos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDTerminalOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedTerminal, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDPattern: - // The escape sequences in a pattern string are interpreted by the lexer, except for the \". - // We must interpret the \" before passing them to the lexer because they are delimiters for - // the pattern strings. 
- fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`)) - case KindIDEscapeSymbol: - return nil, &verr.SpecError{ - Cause: synErrIncompletedEscSeq, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - case KindIDTerminalClose: - pat := b.String() - if pat == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyPattern, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDStringLiteralOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDCharSeq: - fmt.Fprint(&b, string(tok.Lexeme)) - case KindIDStringLiteralClose: - str := b.String() - if str == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDColon: - return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOr: - return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDSemicolon: - return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLabelMarker: - return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDDirectiveMarker: - return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDExpansion: - return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOrderedSymbolMarker: - return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLParen: - return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDRParen: - return newSymbolToken(tokenKindRParen, 
newPosition(tok.Row+1, tok.Col+1)), nil - default: - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } -} diff --git a/src/urubu/spec/grammar/parser/parser.go b/src/urubu/spec/grammar/parser/parser.go deleted file mode 100644 index b604074..0000000 --- a/src/urubu/spec/grammar/parser/parser.go +++ /dev/null @@ -1,582 +0,0 @@ -package parser - -import ( - "fmt" - "io" - - verr "urubu/error" - spec "urubu/spec/grammar" -) - -type RootNode struct { - Directives []*DirectiveNode - Productions []*ProductionNode - LexProductions []*ProductionNode - Fragments []*FragmentNode -} - -type ProductionNode struct { - Directives []*DirectiveNode - LHS string - RHS []*AlternativeNode - Pos Position -} - -func (n *ProductionNode) isLexical() bool { - if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" { - return true - } - return false -} - -type AlternativeNode struct { - Elements []*ElementNode - Directives []*DirectiveNode - Pos Position -} - -type ElementNode struct { - ID string - Pattern string - Label *LabelNode - Literally bool - Pos Position -} - -type LabelNode struct { - Name string - Pos Position -} - -type DirectiveNode struct { - Name string - Parameters []*ParameterNode - Pos Position -} - -type ParameterNode struct { - ID string - Pattern string - String string - OrderedSymbol string - Group []*DirectiveNode - Expansion bool - Pos Position -} - -type FragmentNode struct { - LHS string - RHS string - Pos Position -} - -func raiseSyntaxError(row int, synErr *SyntaxError) { - panic(&verr.SpecError{ - Cause: synErr, - Row: row, - }) -} - -func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) { - panic(&verr.SpecError{ - Cause: synErr, - Detail: detail, - Row: row, - }) -} - -func Parse(src io.Reader) (*RootNode, error) { - p, err := newParser(src) - if err != nil { - return nil, err - } - - return p.parse() -} - -type parser struct { - lex *lexer - peekedTok *token - lastTok 
*token - errs verr.SpecErrors - - // A token position that the parser read at last. - // It is used as additional information in error messages. - pos Position -} - -func newParser(src io.Reader) (*parser, error) { - lex, err := newLexer(src) - if err != nil { - return nil, err - } - return &parser{ - lex: lex, - }, nil -} - -func (p *parser) parse() (root *RootNode, retErr error) { - root = p.parseRoot() - if len(p.errs) > 0 { - return nil, p.errs - } - - return root, nil -} - -func (p *parser) parseRoot() *RootNode { - defer func() { - err := recover() - if err != nil { - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(fmt.Errorf("an unexpected error occurred: %v", err)) - } - p.errs = append(p.errs, specErr) - } - }() - - var dirs []*DirectiveNode - var prods []*ProductionNode - var lexProds []*ProductionNode - var fragments []*FragmentNode - for { - dir := p.parseTopLevelDirective() - if dir != nil { - dirs = append(dirs, dir) - continue - } - - fragment := p.parseFragment() - if fragment != nil { - fragments = append(fragments, fragment) - continue - } - - prod := p.parseProduction() - if prod != nil { - if prod.isLexical() { - lexProds = append(lexProds, prod) - } else { - prods = append(prods, prod) - } - continue - } - - if p.consume(tokenKindEOF) { - break - } - } - - return &RootNode{ - Directives: dirs, - Productions: prods, - LexProductions: lexProds, - Fragments: fragments, - } -} - -func (p *parser) parseTopLevelDirective() *DirectiveNode { - defer func() { - err := recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - dir := p.parseDirective() - if dir == nil { - return nil - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon) - } - - return dir -} - -func (p *parser) parseFragment() *FragmentNode { - defer func() { - err := 
recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindKWFragment) { - return nil - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoProductionName) - } - lhs := p.lastTok.text - lhsPos := p.lastTok.pos - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindColon) { - raiseSyntaxError(p.pos.Row, synErrNoColon) - } - - var rhs string - switch { - case p.consume(tokenKindTerminalPattern): - rhs = p.lastTok.text - case p.consume(tokenKindStringLiteral): - rhs = spec.EscapePattern(p.lastTok.text) - default: - raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrNoSemicolon) - } - - if !p.consume(tokenKindNewline) { - if !p.consume(tokenKindEOF) { - raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) - } - } - - return &FragmentNode{ - LHS: lhs, - RHS: rhs, - Pos: lhsPos, - } -} - -func (p *parser) parseProduction() *ProductionNode { - defer func() { - err := recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - p.consume(tokenKindNewline) - - if p.consume(tokenKindEOF) { - return nil - } - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoProductionName) - } - lhs := p.lastTok.text - lhsPos := p.lastTok.pos - - var dirs []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - dirs = append(dirs, dir) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindColon) { - raiseSyntaxError(p.pos.Row, synErrNoColon) - } - - alt := p.parseAlternative() - rhs := []*AlternativeNode{alt} - for { - p.consume(tokenKindNewline) - - if 
!p.consume(tokenKindOr) { - break - } - alt := p.parseAlternative() - rhs = append(rhs, alt) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrNoSemicolon) - } - - if !p.consume(tokenKindNewline) { - if !p.consume(tokenKindEOF) { - raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) - } - } - - prod := &ProductionNode{ - Directives: dirs, - LHS: lhs, - RHS: rhs, - Pos: lhsPos, - } - - // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs. - // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate - // name to the pattern. Therefore, this code prohibits alternatives from containing patterns. - if !prod.isLexical() { - for _, alt := range prod.RHS { - for _, elem := range alt.Elements { - if elem.Pattern != "" { - raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt) - } - } - } - } - - return prod -} - -func (p *parser) parseAlternative() *AlternativeNode { - elems := []*ElementNode{} - for { - elem := p.parseElement() - if elem == nil { - break - } - elems = append(elems, elem) - } - - // When a length of an alternative is zero, we cannot set a position. 
- var firstElemPos Position - if len(elems) > 0 { - firstElemPos = elems[0].Pos - } - - var dirs []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - dirs = append(dirs, dir) - } - - return &AlternativeNode{ - Elements: elems, - Directives: dirs, - Pos: firstElemPos, - } -} - -func (p *parser) parseElement() *ElementNode { - var elem *ElementNode - switch { - case p.consume(tokenKindID): - elem = &ElementNode{ - ID: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindTerminalPattern): - elem = &ElementNode{ - Pattern: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindStringLiteral): - elem = &ElementNode{ - Pattern: p.lastTok.text, - Literally: true, - Pos: p.lastTok.pos, - } - default: - if p.consume(tokenKindLabelMarker) { - raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol) - } - return nil - } - if p.consume(tokenKindLabelMarker) { - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoLabel) - } - elem.Label = &LabelNode{ - Name: p.lastTok.text, - Pos: p.lastTok.pos, - } - } - return elem -} - -func (p *parser) parseDirective() *DirectiveNode { - p.consume(tokenKindNewline) - - if !p.consume(tokenKindDirectiveMarker) { - return nil - } - dirPos := p.lastTok.pos - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoDirectiveName) - } - name := p.lastTok.text - - var params []*ParameterNode - for { - param := p.parseParameter() - if param == nil { - break - } - params = append(params, param) - } - - return &DirectiveNode{ - Name: name, - Parameters: params, - Pos: dirPos, - } -} - -func (p *parser) parseParameter() *ParameterNode { - var param *ParameterNode - switch { - case p.consume(tokenKindID): - param = &ParameterNode{ - ID: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindTerminalPattern): - param = &ParameterNode{ - Pattern: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindStringLiteral): - param = &ParameterNode{ - 
String: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindOrderedSymbolMarker): - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName) - } - param = &ParameterNode{ - OrderedSymbol: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindLParen): - pos := p.lastTok.pos - var g []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - g = append(g, dir) - } - if !p.consume(tokenKindRParen) { - raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup) - } - if len(g) == 0 { - // Set an empty slice representing an empty directive group to distinguish between the following two cases. - // - // - #prec (); // vartan allows this case. - // - #prec; // This case will raise an error. - g = []*DirectiveNode{} - } - param = &ParameterNode{ - Group: g, - Pos: pos, - } - } - if p.consume(tokenKindExpantion) { - switch { - case param == nil: - raiseSyntaxError(p.pos.Row, synErrStrayExpOp) - case param.ID == "": - raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand) - } - param.Expansion = true - } - return param -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - panic(err) - } - } - p.pos = tok.pos - if tok.kind == tokenKindInvalid { - raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text) - } - if tok.kind == expected { - p.lastTok = tok - return true - } - p.peekedTok = tok - - return false -} - -func (p *parser) skip() { - var tok *token - var err error - for { - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - p.errs = append(p.errs, &verr.SpecError{ - Cause: err, - Row: p.pos.Row, - }) - continue - } - } - - break - } - - p.lastTok = tok - p.pos = tok.pos -} - -func (p *parser) skipOverTo(kind tokenKind) { - for { - if p.consume(kind) || 
p.consume(tokenKindEOF) { - return - } - p.skip() - } -} diff --git a/src/urubu/spec/grammar/parser/syntax_error.go b/src/urubu/spec/grammar/parser/syntax_error.go deleted file mode 100644 index 719fb94..0000000 --- a/src/urubu/spec/grammar/parser/syntax_error.go +++ /dev/null @@ -1,45 +0,0 @@ -package parser - -type SyntaxError struct { - message string -} - -func newSyntaxError(message string) *SyntaxError { - return &SyntaxError{ - message: message, - } -} - -func (e *SyntaxError) Error() string { - return e.message -} - -var ( - // lexical errors - synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore") - synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier") - synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively") - synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier") - synErrUnclosedTerminal = newSyntaxError("unclosed terminal") - synErrUnclosedString = newSyntaxError("unclosed string") - synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash") - synErrEmptyPattern = newSyntaxError("a pattern must include at least one character") - synErrEmptyString = newSyntaxError("a string must include at least one character") - - // syntax errors - synErrInvalidToken = newSyntaxError("invalid token") - synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;") - synErrNoProductionName = newSyntaxError("a production name is missing") - synErrNoColon = newSyntaxError("the colon must precede alternatives") - synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative") - synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol") - synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after 
the label marker @") - synErrNoDirectiveName = newSyntaxError("a directive needs a name") - synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing") - synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )") - synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal") - synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier") - synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier") - synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline") - synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element") -) diff --git a/src/urubu/spec/grammar/util.go b/src/urubu/spec/grammar/util.go deleted file mode 100644 index bf3f233..0000000 --- a/src/urubu/spec/grammar/util.go +++ /dev/null @@ -1,21 +0,0 @@ -package grammar - -import "strings" - -var rep = strings.NewReplacer( - `.`, `\.`, - `*`, `\*`, - `+`, `\+`, - `?`, `\?`, - `|`, `\|`, - `(`, `\(`, - `)`, `\)`, - `[`, `\[`, - `\`, `\\`, -) - -// EscapePattern escapes the special characters. -// For example, EscapePattern(`+`) returns `\+`. -func EscapePattern(s string) string { - return rep.Replace(s) -} diff --git a/src/urubu/spec/test/tree_lexer.go b/src/urubu/spec/test.go index 8bb1c87..4985e14 100644 --- a/src/urubu/spec/test/tree_lexer.go +++ b/src/urubu/spec/test.go @@ -1,11 +1,342 @@ -// Code generated by vartan-go. DO NOT EDIT. 
+//go:generate vartan compile tree.vartan -o tree.json +//go:generate vartan-go tree.json --package test + package test import ( + "bufio" + "bytes" + "encoding/json" + "errors" "fmt" "io" + "regexp" + "strconv" + "strings" + "unicode/utf8" ) +type TreeDiff struct { + ExpectedPath string + ActualPath string + Message string +} + +func newTreeDiff(expected, actual *Tree, message string) *TreeDiff { + return &TreeDiff{ + ExpectedPath: expected.path(), + ActualPath: actual.path(), + Message: message, + } +} + +type Tree struct { + Parent *Tree + Offset int + Kind string + Children []*Tree + Lexeme string +} + +func NewNonTerminalTree(kind string, children ...*Tree) *Tree { + return &Tree{ + Kind: kind, + Children: children, + } +} + +func NewTerminalNode(kind string, lexeme string) *Tree { + return &Tree{ + Kind: kind, + Lexeme: lexeme, + } +} + +func (t *Tree) Fill() *Tree { + for i, c := range t.Children { + c.Parent = t + c.Offset = i + c.Fill() + } + return t +} + +func (t *Tree) path() string { + if t.Parent == nil { + return t.Kind + } + return fmt.Sprintf("%v.[%v]%v", t.Parent.path(), t.Offset, t.Kind) +} + +func (t *Tree) Format() []byte { + var b bytes.Buffer + t.format(&b, 0) + return b.Bytes() +} + +func (t *Tree) format(buf *bytes.Buffer, depth int) { + for i := 0; i < depth; i++ { + buf.WriteString(" ") + } + buf.WriteString("(") + buf.WriteString(t.Kind) + if len(t.Children) > 0 { + buf.WriteString("\n") + for i, c := range t.Children { + c.format(buf, depth+1) + if i < len(t.Children)-1 { + buf.WriteString("\n") + } + } + } + buf.WriteString(")") +} + +func DiffTree(expected, actual *Tree) []*TreeDiff { + if expected == nil && actual == nil { + return nil + } + if actual.Kind != expected.Kind { + msg := fmt.Sprintf("unexpected kind: expected '%v' but got '%v'", expected.Kind, actual.Kind) + return []*TreeDiff{ + newTreeDiff(expected, actual, msg), + } + } + if expected.Lexeme != actual.Lexeme { + msg := fmt.Sprintf("unexpected lexeme: expected '%v' but 
got '%v'", expected.Lexeme, actual.Lexeme) + return []*TreeDiff{ + newTreeDiff(expected, actual, msg), + } + } + if len(actual.Children) != len(expected.Children) { + msg := fmt.Sprintf("unexpected node count: expected %v but got %v", len(expected.Children), len(actual.Children)) + return []*TreeDiff{ + newTreeDiff(expected, actual, msg), + } + } + var diffs []*TreeDiff + for i, exp := range expected.Children { + if ds := DiffTree(exp, actual.Children[i]); len(ds) > 0 { + diffs = append(diffs, ds...) + } + } + return diffs +} + +type TestCase struct { + Description string + Source []byte + Output *Tree +} + +func ParseTestCase(r io.Reader) (*TestCase, error) { + parts, err := splitIntoParts(r) + if err != nil { + return nil, err + } + if len(parts) != 3 { + return nil, fmt.Errorf("too many or too few part delimiters: a test case consists of just tree parts: %v parts found", len(parts)) + } + + tp := &treeParser{ + lineOffset: parts[0].lineCount + parts[1].lineCount + 2, + } + tree, err := tp.parseTree(bytes.NewReader(parts[2].buf)) + if err != nil { + return nil, err + } + + return &TestCase{ + Description: string(parts[0].buf), + Source: parts[1].buf, + Output: tree, + }, nil +} + +type testCasePart struct { + buf []byte + lineCount int +} + +func splitIntoParts(r io.Reader) ([]*testCasePart, error) { + var bufs []*testCasePart + s := bufio.NewScanner(r) + for { + buf, lineCount, err := readPart(s) + if err != nil { + return nil, err + } + if buf == nil { + break + } + bufs = append(bufs, &testCasePart{ + buf: buf, + lineCount: lineCount, + }) + } + if err := s.Err(); err != nil { + return nil, err + } + return bufs, nil +} + +var reDelim = regexp.MustCompile(`^\s*---+\s*$`) + +func readPart(s *bufio.Scanner) ([]byte, int, error) { + if !s.Scan() { + return nil, 0, s.Err() + } + buf := &bytes.Buffer{} + line := s.Bytes() + if reDelim.Match(line) { + // Return an empty slice because (*bytes.Buffer).Bytes() returns nil if we have never written data. 
+ return []byte{}, 0, nil + } + _, err := buf.Write(line) + if err != nil { + return nil, 0, err + } + lineCount := 1 + for s.Scan() { + line := s.Bytes() + if reDelim.Match(line) { + return buf.Bytes(), lineCount, nil + } + _, err := buf.Write([]byte("\n")) + if err != nil { + return nil, 0, err + } + _, err = buf.Write(line) + if err != nil { + return nil, 0, err + } + lineCount++ + } + if err := s.Err(); err != nil { + return nil, 0, err + } + return buf.Bytes(), lineCount, nil +} + +type treeParser struct { + lineOffset int +} + +func (tp *treeParser) parseTree(src io.Reader) (*Tree, error) { + toks, err := NewTokenStream(src) + if err != nil { + return nil, err + } + gram := NewGrammar() + tb := NewDefaultSyntaxTreeBuilder() + p, err := NewParser(toks, gram, SemanticAction(NewASTActionSet(gram, tb))) + if err != nil { + return nil, err + } + err = p.Parse() + if err != nil { + return nil, err + } + synErrs := p.SyntaxErrors() + if len(synErrs) > 0 { + var b strings.Builder + b.Write(formatSyntaxError(synErrs[0], gram, tp.lineOffset)) + for _, synErr := range synErrs[1:] { + b.WriteRune('\n') + b.Write(formatSyntaxError(synErr, gram, tp.lineOffset)) + } + return nil, errors.New(b.String()) + } + t, err := tp.genTree(tb.Tree()) + if err != nil { + return nil, err + } + return t.Fill(), nil +} + +func formatSyntaxError(synErr *SyntaxError, gram Grammar, lineOffset int) []byte { + var b bytes.Buffer + + b.WriteString(fmt.Sprintf("%v:%v: %v: ", lineOffset+synErr.Row+1, synErr.Col+1, synErr.Message)) + + tok := synErr.Token + switch { + case tok.EOF(): + b.WriteString("<eof>") + case tok.Invalid(): + b.WriteString(fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme()))) + default: + if term := gram.Terminal(tok.TerminalID()); term != "" { + b.WriteString(fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), term)) + } else { + b.WriteString(fmt.Sprintf("'%v'", string(tok.Lexeme()))) + } + } + b.WriteString(fmt.Sprintf(": expected: %v", synErr.ExpectedTerminals[0])) + for _, 
t := range synErr.ExpectedTerminals[1:] { + b.WriteString(fmt.Sprintf(", %v", t)) + } + + return b.Bytes() +} + +func (tp *treeParser) genTree(node *Node) (*Tree, error) { + // A node labeled 'error' cannot have children. It always must be (error). + if sym := node.Children[0]; sym.Text == "error" { + if len(node.Children) > 1 { + return nil, fmt.Errorf("%v:%v: error node cannot take children", tp.lineOffset+sym.Row+1, sym.Col+1) + } + return NewTerminalNode(sym.Text, ""), nil + } + + if len(node.Children) == 2 && node.Children[1].KindName == "string" { + var text string + str := node.Children[1].Children[0] + switch str.KindName { + case "raw_string": + text = str.Children[0].Text + case "interpreted_string": + var b strings.Builder + for _, c := range str.Children { + switch c.KindName { + case "escaped_seq": + b.WriteString(strings.TrimPrefix(`\`, c.Text)) + case "escape_char": + return nil, fmt.Errorf("%v:%v: incomplete escape sequence", tp.lineOffset+c.Row+1, c.Col+1) + case "codepoint_expr": + cp := c.Children[0] + n, err := strconv.ParseInt(cp.Text, 16, 64) + if err != nil { + return nil, fmt.Errorf("%v:%v: %v", tp.lineOffset+cp.Row+1, cp.Col+1, err) + } + if !utf8.ValidRune(rune(n)) { + return nil, fmt.Errorf("%v:%v: invalid code point: %v", tp.lineOffset+cp.Row+1, cp.Col+1, cp.Text) + } + b.WriteRune(rune(n)) + default: + b.WriteString(c.Text) + } + } + text = b.String() + } + return NewTerminalNode(node.Children[0].Text, text), nil + } + + var children []*Tree + if len(node.Children) > 1 { + children = make([]*Tree, len(node.Children)-1) + for i, c := range node.Children[1:] { + var err error + children[i], err = tp.genTree(c) + if err != nil { + return nil, err + } + } + } + return NewNonTerminalTree(node.Children[0].Text, children...), nil +} +// Code generated by vartan-go. DO NOT EDIT. 
+ type ModeID int func (id ModeID) Int() int { @@ -1022,3 +1353,1072 @@ func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, strin id := s.kindIDs[mode][modeKind] return id, s.kindNames[id] } +// Code generated by vartan-go. DO NOT EDIT. + +type Grammar interface { + // InitialState returns the initial state of a parser. + InitialState() int + + // StartProduction returns the start production of grammar. + StartProduction() int + + // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair. + Action(state int, terminal int) int + + // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair. + GoTo(state int, lhs int) int + + // ErrorTrapperState returns true when a state can shift the error symbol. + ErrorTrapperState(state int) bool + + // LHS returns a LHS symbol of a production. + LHS(prod int) int + + // AlternativeSymbolCount returns a symbol count of p production. + AlternativeSymbolCount(prod int) int + + // RecoverProduction returns true when a production has the recover directive. + RecoverProduction(prod int) bool + + // NonTerminal retuns a string representaion of a non-terminal symbol. + NonTerminal(nonTerminal int) string + + // TerminalCount returns a terminal symbol count of grammar. + TerminalCount() int + + // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis. + SkipTerminal(terminal int) bool + + // EOF returns the EOF symbol. + EOF() int + + // Error returns the error symbol. + Error() int + + // Terminal retuns a string representaion of a terminal symbol. + Terminal(terminal int) string + + // ASTAction returns an AST action entries. + ASTAction(prod int) []int +} + +type VToken interface { + // TerminalID returns a terminal ID. + TerminalID() int + + // Lexeme returns a lexeme. + Lexeme() []byte + + // EOF returns true when a token represents EOF. + EOF() bool + + // Invalid returns true when a token is invalid. 
+ Invalid() bool + + // Position returns (row, column) pair. + Position() (int, int) +} + +type TokenStream interface { + Next() (VToken, error) +} + +type SyntaxError struct { + Row int + Col int + Message string + Token VToken + ExpectedTerminals []string +} + +type ParserOption func(p *Parser) error + +// DisableLAC disables LAC (lookahead correction). LAC is enabled by default. +func DisableLAC() ParserOption { + return func(p *Parser) error { + p.disableLAC = true + return nil + } +} + +func SemanticAction(semAct SemanticActionSet) ParserOption { + return func(p *Parser) error { + p.semAct = semAct + return nil + } +} + +type Parser struct { + toks TokenStream + gram Grammar + stateStack *stateStack + semAct SemanticActionSet + disableLAC bool + onError bool + shiftCount int + synErrs []*SyntaxError +} + +func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) { + p := &Parser{ + toks: toks, + gram: gram, + stateStack: &stateStack{}, + } + + for _, opt := range opts { + err := opt(p) + if err != nil { + return nil, err + } + } + + return p, nil +} + +func (p *Parser) Parse() error { + p.stateStack.push(p.gram.InitialState()) + tok, err := p.nextToken() + if err != nil { + return err + } + +ACTION_LOOP: + for { + act := p.lookupAction(tok) + + switch { + case act < 0: // Shift + nextState := act * -1 + + recovered := false + if p.onError { + p.shiftCount++ + + // When the parser performs shift three times, the parser recovers from the error state. 
+ if p.shiftCount >= 3 { + p.onError = false + p.shiftCount = 0 + recovered = true + } + } + + p.shift(nextState) + + if p.semAct != nil { + p.semAct.Shift(tok, recovered) + } + + tok, err = p.nextToken() + if err != nil { + return err + } + case act > 0: // Reduce + prodNum := act + + recovered := false + if p.onError && p.gram.RecoverProduction(prodNum) { + p.onError = false + p.shiftCount = 0 + recovered = true + } + + accepted := p.reduce(prodNum) + if accepted { + if p.semAct != nil { + p.semAct.Accept() + } + + return nil + } + + if p.semAct != nil { + p.semAct.Reduce(prodNum, recovered) + } + default: // Error + if p.onError { + tok, err = p.nextToken() + if err != nil { + return err + } + if tok.EOF() { + if p.semAct != nil { + p.semAct.MissError(tok) + } + + return nil + } + + continue ACTION_LOOP + } + + row, col := tok.Position() + p.synErrs = append(p.synErrs, &SyntaxError{ + Row: row, + Col: col, + Message: "unexpected token", + Token: tok, + ExpectedTerminals: p.searchLookahead(p.stateStack.top()), + }) + + count, ok := p.trapError() + if !ok { + if p.semAct != nil { + p.semAct.MissError(tok) + } + + return nil + } + + p.onError = true + p.shiftCount = 0 + + act, err := p.lookupActionOnError() + if err != nil { + return err + } + + p.shift(act * -1) + + if p.semAct != nil { + p.semAct.TrapAndShiftError(tok, count) + } + } + } +} + +// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid, +// this method returns `true`. 
+func (p *Parser) validateLookahead(term int) bool { + p.stateStack.enableExploratoryMode() + defer p.stateStack.disableExploratoryMode() + + for { + act := p.gram.Action(p.stateStack.topExploratorily(), term) + + switch { + case act < 0: // Shift + return true + case act > 0: // Reduce + prodNum := act + + lhs := p.gram.LHS(prodNum) + if lhs == p.gram.LHS(p.gram.StartProduction()) { + return true + } + n := p.gram.AlternativeSymbolCount(prodNum) + p.stateStack.popExploratorily(n) + state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs) + p.stateStack.pushExploratorily(state) + default: // Error + return false + } + } +} + +func (p *Parser) nextToken() (VToken, error) { + for { + // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0, + // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect + // a syntax error because the parser cannot find an entry corresponding to the invalid token. + tok, err := p.toks.Next() + if err != nil { + return nil, err + } + + if p.gram.SkipTerminal(tok.TerminalID()) { + continue + } + + return tok, nil + } +} + +func (p *Parser) tokenToTerminal(tok VToken) int { + if tok.EOF() { + return p.gram.EOF() + } + + return tok.TerminalID() +} + +func (p *Parser) lookupAction(tok VToken) int { + if !p.disableLAC { + term := p.tokenToTerminal(tok) + if !p.validateLookahead(term) { + return 0 + } + } + + return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok)) +} + +func (p *Parser) lookupActionOnError() (int, error) { + act := p.gram.Action(p.stateStack.top(), p.gram.Error()) + if act >= 0 { + return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error())) + } + + return act, nil +} + +func (p *Parser) shift(nextState int) { + p.stateStack.push(nextState) +} + +func (p *Parser) reduce(prodNum int) bool { + lhs := p.gram.LHS(prodNum) + if lhs == 
p.gram.LHS(p.gram.StartProduction()) { + return true + } + n := p.gram.AlternativeSymbolCount(prodNum) + p.stateStack.pop(n) + nextState := p.gram.GoTo(p.stateStack.top(), lhs) + p.stateStack.push(nextState) + return false +} + +func (p *Parser) trapError() (int, bool) { + count := 0 + for { + if p.gram.ErrorTrapperState(p.stateStack.top()) { + return count, true + } + + if p.stateStack.top() != p.gram.InitialState() { + p.stateStack.pop(1) + count++ + } else { + return 0, false + } + } +} + +func (p *Parser) SyntaxErrors() []*SyntaxError { + return p.synErrs +} + +func (p *Parser) searchLookahead(state int) []string { + kinds := []string{} + termCount := p.gram.TerminalCount() + for term := 0; term < termCount; term++ { + if p.disableLAC { + if p.gram.Action(p.stateStack.top(), term) == 0 { + continue + } + } else { + if !p.validateLookahead(term) { + continue + } + } + + // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol + // intentionally. 
+ if term == p.gram.Error() { + continue + } + + kinds = append(kinds, p.gram.Terminal(term)) + } + + return kinds +} + +type stateStack struct { + items []int + itemsExp []int +} + +func (s *stateStack) enableExploratoryMode() { + s.itemsExp = make([]int, len(s.items)) + copy(s.itemsExp, s.items) +} + +func (s *stateStack) disableExploratoryMode() { + s.itemsExp = nil +} + +func (s *stateStack) top() int { + return s.items[len(s.items)-1] +} + +func (s *stateStack) topExploratorily() int { + return s.itemsExp[len(s.itemsExp)-1] +} + +func (s *stateStack) push(state int) { + s.items = append(s.items, state) +} + +func (s *stateStack) pushExploratorily(state int) { + s.itemsExp = append(s.itemsExp, state) +} + +func (s *stateStack) pop(n int) { + s.items = s.items[:len(s.items)-n] +} + +func (s *stateStack) popExploratorily(n int) { + s.itemsExp = s.itemsExp[:len(s.itemsExp)-n] +} + +type grammarImpl struct { + recoverProductions []int + action []int + goTo []int + alternativeSymbolCounts []int + errorTrapperStates []int + nonTerminals []string + lhsSymbols []int + terminals []string + terminalSkip []int + astActions [][]int +} + +func NewGrammar() *grammarImpl { + return &grammarImpl{ + recoverProductions: []int{ + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + }, + action: []int{ + 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -3, 0, 0, 0, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, + -2, 7, 0, -11, 0, 0, -12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, + 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -14, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 12, 0, 0, 12, 12, 0, 0, -17, 12, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -22, 0, 16, 16, 0, 0, 0, 0, 0, -23, + -24, -25, -26, -27, -28, -29, 16, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -30, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -23, -24, -25, -26, -27, -28, -29, 15, + 0, 18, 0, 0, 18, 18, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 0, + 25, 0, 0, 25, 25, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -33, 0, 19, 0, + 0, 19, 19, 0, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -34, 0, 0, 0, 0, 0, 0, 20, 0, 0, 20, + 20, 0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 20, 20, 20, 0, 21, 0, 0, 21, 21, + 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 21, 21, 0, 22, 0, 0, 22, 22, 0, + 0, 0, 0, 0, 22, 22, 22, 22, 22, 22, 22, 22, 0, 23, 0, 0, 23, 23, 0, 0, + 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23, 0, 24, 0, 0, 24, 24, 0, 0, 0, + 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 0, 10, 0, 0, 10, 10, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 13, 13, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 17, 17, 0, 0, 0, 0, 0, 17, + 17, 17, 17, 17, 17, 17, 17, 0, 14, 0, 0, 14, 14, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -36, + 0, 0, 0, 0, 0, 26, 0, 0, 26, 26, 0, 0, 0, 0, 0, 26, 26, 26, 26, 26, + 26, 26, 26, + }, + goTo: []int{ + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 0, 10, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 19, 20, 21, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 21, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + }, + alternativeSymbolCounts: []int{ + 0, 1, 4, 4, 3, 2, 1, 0, 1, 1, 3, 1, 0, 3, 3, 1, 0, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 4, + }, + errorTrapperStates: []int{ + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + nonTerminals: []string{ + "", + "tree'", + "tree", + "tree_list", + "string", + "raw_string", + "opt_raw_string_body", + "interpreted_string", + "opt_interpreted_string_body", + "interpreted_string_body", + "interpreted_string_elem", + "codepoint_expr", + }, + lhsSymbols: []int{ + 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 10, 10, 10, 10, 10, 11, + }, + terminals: []string{ + "", + "<eof>", + "error", + "ws", + "l_paren", + "r_paren", + "identifier", + "raw_string_open", + "raw_string_body", + "raw_string_close", + "interpreted_string_open", + "interpreted_seq", + 
"codepoint_prefix", + "l_brace", + "r_brace", + "hex_digits", + "escaped_seq", + "escape_char", + "interpreted_string_close", + }, + terminalSkip: []int{ + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + astActions: [][]int{ + nil, + nil, + { + 2, -3, + }, + { + 2, 3, + }, + { + 2, + }, + { + -1, 2, + }, + nil, + nil, + nil, + nil, + { + -2, + }, + nil, + nil, + { + -2, + }, + { + 2, + }, + { + -1, + }, + nil, + { + -1, -2, + }, + { + -1, + }, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + { + 3, + }, + }, + } +} + +func (g *grammarImpl) InitialState() int { + return 0 +} + +func (g *grammarImpl) StartProduction() int { + return 1 +} + +func (g *grammarImpl) RecoverProduction(prod int) bool { + return g.recoverProductions[prod] != 0 +} + +func (g *grammarImpl) Action(state int, terminal int) int { + return g.action[state*19+terminal] +} + +func (g *grammarImpl) GoTo(state int, lhs int) int { + return g.goTo[state*12+lhs] +} + +func (g *grammarImpl) AlternativeSymbolCount(prod int) int { + return g.alternativeSymbolCounts[prod] +} + +func (g *grammarImpl) TerminalCount() int { + return 19 +} + +func (g *grammarImpl) SkipTerminal(terminal int) bool { + return g.terminalSkip[terminal] == 1 +} + +func (g *grammarImpl) ErrorTrapperState(state int) bool { + return g.errorTrapperStates[state] != 0 +} + +func (g *grammarImpl) NonTerminal(nonTerminal int) string { + return g.nonTerminals[nonTerminal] +} + +func (g *grammarImpl) LHS(prod int) int { + return g.lhsSymbols[prod] +} + +func (g *grammarImpl) EOF() int { + return 1 +} + +func (g *grammarImpl) Error() int { + return 2 +} + +func (g *grammarImpl) Terminal(terminal int) string { + return g.terminals[terminal] +} + +func (g *grammarImpl) ASTAction(prod int) []int { + return g.astActions[prod] +} + +type vToken struct { + terminalID int + tok *Token +} + +func (t *vToken) TerminalID() int { + return t.terminalID +} + +func (t *vToken) Lexeme() []byte { + return t.tok.Lexeme +} + +func (t *vToken) 
EOF() bool { + return t.tok.EOF +} + +func (t *vToken) Invalid() bool { + return t.tok.Invalid +} + +func (t *vToken) Position() (int, int) { + return t.tok.Row, t.tok.Col +} + +var kindToTerminal = []int{ + 0, 3, 4, 5, 6, 7, 10, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, +} + +type tokenStream struct { + lex *Lexer + kindToTerminal []int +} + +func NewTokenStream(src io.Reader) (*tokenStream, error) { + lex, err := NewLexer(NewLexSpec(), src) + if err != nil { + return nil, err + } + + return &tokenStream{ + lex: lex, + }, nil +} + +func (t *tokenStream) Next() (VToken, error) { + tok, err := t.lex.Next() + if err != nil { + return nil, err + } + return &vToken{ + terminalID: kindToTerminal[tok.KindID], + tok: tok, + }, nil +} +// Code generated by vartan-go. DO NOT EDIT. + +// SemanticActionSet is a set of semantic actions a parser calls. +type SemanticActionSet interface { + // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol. + // When the parser recovered from an error state by shifting the token, `recovered` is true. + Shift(tok VToken, recovered bool) + + // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production. + // When the parser recovered from an error state by reducing the production, `recovered` is true. + Reduce(prodNum int, recovered bool) + + // Accept runs when the parser accepts an input. + Accept() + + // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack. + // `cause` is a token that caused a syntax error. `popped` is the number of frames that the parser discards + // from the state stack. + // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token + // corresponding to the error symbol doesn't exist. + TrapAndShiftError(cause VToken, popped int) + + // MissError runs when the parser fails to trap a syntax error. 
`cause` is a token that caused a syntax error. + MissError(cause VToken) +} + +var _ SemanticActionSet = &SyntaxTreeActionSet{} + +// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface. +type SyntaxTreeNode interface { + // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast` + // directive with `...` operator. + ChildCount() int + + // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast` + // directive with `...` operator. + ExpandChildren() []SyntaxTreeNode +} + +var _ SyntaxTreeNode = &Node{} + +// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types. +// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface. +type SyntaxTreeBuilder interface { + Shift(kindName string, text string, row, col int) SyntaxTreeNode + ShiftError(kindName string) SyntaxTreeNode + Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode + Accept(f SyntaxTreeNode) +} + +var _ SyntaxTreeBuilder = &DefaulSyntaxTreeBuilder{} + +// DefaulSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder. +type DefaulSyntaxTreeBuilder struct { + tree *Node +} + +// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder. +func NewDefaultSyntaxTreeBuilder() *DefaulSyntaxTreeBuilder { + return &DefaulSyntaxTreeBuilder{} +} + +// Shift is a implementation of SyntaxTreeBuilder.Shift. +func (b *DefaulSyntaxTreeBuilder) Shift(kindName string, text string, row, col int) SyntaxTreeNode { + return &Node{ + Type: NodeTypeTerminal, + KindName: kindName, + Text: text, + Row: row, + Col: col, + } +} + +// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError. 
+func (b *DefaulSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode { + return &Node{ + Type: NodeTypeError, + KindName: kindName, + } +} + +// Reduce is a implementation of SyntaxTreeBuilder.Reduce. +func (b *DefaulSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode { + cNodes := make([]*Node, len(children)) + for i, c := range children { + cNodes[i] = c.(*Node) + } + return &Node{ + Type: NodeTypeNonTerminal, + KindName: kindName, + Children: cNodes, + } +} + +// Accept is a implementation of SyntaxTreeBuilder.Accept. +func (b *DefaulSyntaxTreeBuilder) Accept(f SyntaxTreeNode) { + b.tree = f.(*Node) +} + +// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil. +func (b *DefaulSyntaxTreeBuilder) Tree() *Node { + return b.tree +} + +// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree. +type SyntaxTreeActionSet struct { + gram Grammar + builder SyntaxTreeBuilder + semStack *semanticStack + disableASTAction bool +} + +// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree). +// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them. +func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { + return &SyntaxTreeActionSet{ + gram: gram, + builder: builder, + semStack: newSemanticStack(), + } +} + +// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree). +// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them. 
+func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { + return &SyntaxTreeActionSet{ + gram: gram, + builder: builder, + semStack: newSemanticStack(), + disableASTAction: true, + } +} + +// Shift is a implementation of SemanticActionSet.Shift method. +func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) { + term := a.tokenToTerminal(tok) + row, col := tok.Position() + a.semStack.push(a.builder.Shift(a.gram.Terminal(term), string(tok.Lexeme()), row, col)) +} + +// Reduce is a implementation of SemanticActionSet.Reduce method. +func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) { + lhs := a.gram.LHS(prodNum) + + // When an alternative is empty, `n` will be 0, and `handle` will be empty slice. + n := a.gram.AlternativeSymbolCount(prodNum) + handle := a.semStack.pop(n) + + var astAct []int + if !a.disableASTAction { + astAct = a.gram.ASTAction(prodNum) + } + var children []SyntaxTreeNode + if astAct != nil { + // Count the number of children in advance to avoid frequent growth in a slice for children. + { + l := 0 + for _, e := range astAct { + if e > 0 { + l++ + } else { + offset := e*-1 - 1 + l += handle[offset].ChildCount() + } + } + + children = make([]SyntaxTreeNode, l) + } + + p := 0 + for _, e := range astAct { + if e > 0 { + offset := e - 1 + children[p] = handle[offset] + p++ + } else { + offset := e*-1 - 1 + for _, c := range handle[offset].ExpandChildren() { + children[p] = c + p++ + } + } + } + } else { + // If an alternative has no AST action, a driver generates + // a node with the same structure as a CST. + children = handle + } + + a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children)) +} + +// Accept is a implementation of SemanticActionSet.Accept method. +func (a *SyntaxTreeActionSet) Accept() { + top := a.semStack.pop(1) + a.builder.Accept(top[0]) +} + +// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method. 
+func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) { + a.semStack.pop(popped) + a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error()))) +} + +// MissError is a implementation of SemanticActionSet.MissError method. +func (a *SyntaxTreeActionSet) MissError(cause VToken) { +} + +func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int { + if tok.EOF() { + return a.gram.EOF() + } + + return tok.TerminalID() +} + +type semanticStack struct { + frames []SyntaxTreeNode +} + +func newSemanticStack() *semanticStack { + return &semanticStack{ + frames: make([]SyntaxTreeNode, 0, 100), + } +} + +func (s *semanticStack) push(f SyntaxTreeNode) { + s.frames = append(s.frames, f) +} + +func (s *semanticStack) pop(n int) []SyntaxTreeNode { + fs := s.frames[len(s.frames)-n:] + s.frames = s.frames[:len(s.frames)-n] + + return fs +} + +type NodeType int + +const ( + NodeTypeError = 0 + NodeTypeTerminal = 1 + NodeTypeNonTerminal = 2 +) + +// Node is a implementation of SyntaxTreeNode interface. 
+type Node struct { + Type NodeType + KindName string + Text string + Row int + Col int + Children []*Node +} + +func (n *Node) MarshalJSON() ([]byte, error) { + switch n.Type { + case NodeTypeError: + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + }{ + Type: n.Type, + KindName: n.KindName, + }) + case NodeTypeTerminal: + if n.KindName == "" { + return json.Marshal(struct { + Type NodeType `json:"type"` + Text string `json:"text"` + Row int `json:"row"` + Col int `json:"col"` + }{ + Type: n.Type, + Text: n.Text, + Row: n.Row, + Col: n.Col, + }) + } + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + Text string `json:"text"` + Row int `json:"row"` + Col int `json:"col"` + }{ + Type: n.Type, + KindName: n.KindName, + Text: n.Text, + Row: n.Row, + Col: n.Col, + }) + case NodeTypeNonTerminal: + return json.Marshal(struct { + Type NodeType `json:"type"` + KindName string `json:"kind_name"` + Children []*Node `json:"children"` + }{ + Type: n.Type, + KindName: n.KindName, + Children: n.Children, + }) + default: + return nil, fmt.Errorf("invalid node type: %v", n.Type) + } +} + +// ChildCount is a implementation of SyntaxTreeNode.ChildCount. +func (n *Node) ChildCount() int { + return len(n.Children) +} + +// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren. +func (n *Node) ExpandChildren() []SyntaxTreeNode { + fs := make([]SyntaxTreeNode, len(n.Children)) + for i, n := range n.Children { + fs[i] = n + } + return fs +} + +// PrintTree prints a syntax tree whose root is `node`. 
+func PrintTree(w io.Writer, node *Node) { + printTree(w, node, "", "") +} + +func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) { + if node == nil { + return + } + + switch node.Type { + case NodeTypeError: + fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) + case NodeTypeTerminal: + fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text)) + case NodeTypeNonTerminal: + fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) + + num := len(node.Children) + for i, child := range node.Children { + var line string + if num > 1 && i < num-1 { + line = "├─ " + } else { + line = "└─ " + } + + var prefix string + if i >= num-1 { + prefix = " " + } else { + prefix = "│ " + } + + printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } + } +} diff --git a/src/urubu/spec/test/parser.go b/src/urubu/spec/test/parser.go deleted file mode 100644 index b7265d7..0000000 --- a/src/urubu/spec/test/parser.go +++ /dev/null @@ -1,336 +0,0 @@ -//go:generate vartan compile tree.vartan -o tree.json -//go:generate vartan-go tree.json --package test - -package test - -import ( - "bufio" - "bytes" - "errors" - "fmt" - "io" - "regexp" - "strconv" - "strings" - "unicode/utf8" -) - -type TreeDiff struct { - ExpectedPath string - ActualPath string - Message string -} - -func newTreeDiff(expected, actual *Tree, message string) *TreeDiff { - return &TreeDiff{ - ExpectedPath: expected.path(), - ActualPath: actual.path(), - Message: message, - } -} - -type Tree struct { - Parent *Tree - Offset int - Kind string - Children []*Tree - Lexeme string -} - -func NewNonTerminalTree(kind string, children ...*Tree) *Tree { - return &Tree{ - Kind: kind, - Children: children, - } -} - -func NewTerminalNode(kind string, lexeme string) *Tree { - return &Tree{ - Kind: kind, - Lexeme: lexeme, - } -} - -func (t *Tree) Fill() *Tree { - for i, c := range t.Children { - c.Parent = t - c.Offset = i - c.Fill() - } - return t -} - -func (t 
*Tree) path() string { - if t.Parent == nil { - return t.Kind - } - return fmt.Sprintf("%v.[%v]%v", t.Parent.path(), t.Offset, t.Kind) -} - -func (t *Tree) Format() []byte { - var b bytes.Buffer - t.format(&b, 0) - return b.Bytes() -} - -func (t *Tree) format(buf *bytes.Buffer, depth int) { - for i := 0; i < depth; i++ { - buf.WriteString(" ") - } - buf.WriteString("(") - buf.WriteString(t.Kind) - if len(t.Children) > 0 { - buf.WriteString("\n") - for i, c := range t.Children { - c.format(buf, depth+1) - if i < len(t.Children)-1 { - buf.WriteString("\n") - } - } - } - buf.WriteString(")") -} - -func DiffTree(expected, actual *Tree) []*TreeDiff { - if expected == nil && actual == nil { - return nil - } - if actual.Kind != expected.Kind { - msg := fmt.Sprintf("unexpected kind: expected '%v' but got '%v'", expected.Kind, actual.Kind) - return []*TreeDiff{ - newTreeDiff(expected, actual, msg), - } - } - if expected.Lexeme != actual.Lexeme { - msg := fmt.Sprintf("unexpected lexeme: expected '%v' but got '%v'", expected.Lexeme, actual.Lexeme) - return []*TreeDiff{ - newTreeDiff(expected, actual, msg), - } - } - if len(actual.Children) != len(expected.Children) { - msg := fmt.Sprintf("unexpected node count: expected %v but got %v", len(expected.Children), len(actual.Children)) - return []*TreeDiff{ - newTreeDiff(expected, actual, msg), - } - } - var diffs []*TreeDiff - for i, exp := range expected.Children { - if ds := DiffTree(exp, actual.Children[i]); len(ds) > 0 { - diffs = append(diffs, ds...) 
- } - } - return diffs -} - -type TestCase struct { - Description string - Source []byte - Output *Tree -} - -func ParseTestCase(r io.Reader) (*TestCase, error) { - parts, err := splitIntoParts(r) - if err != nil { - return nil, err - } - if len(parts) != 3 { - return nil, fmt.Errorf("too many or too few part delimiters: a test case consists of just tree parts: %v parts found", len(parts)) - } - - tp := &treeParser{ - lineOffset: parts[0].lineCount + parts[1].lineCount + 2, - } - tree, err := tp.parseTree(bytes.NewReader(parts[2].buf)) - if err != nil { - return nil, err - } - - return &TestCase{ - Description: string(parts[0].buf), - Source: parts[1].buf, - Output: tree, - }, nil -} - -type testCasePart struct { - buf []byte - lineCount int -} - -func splitIntoParts(r io.Reader) ([]*testCasePart, error) { - var bufs []*testCasePart - s := bufio.NewScanner(r) - for { - buf, lineCount, err := readPart(s) - if err != nil { - return nil, err - } - if buf == nil { - break - } - bufs = append(bufs, &testCasePart{ - buf: buf, - lineCount: lineCount, - }) - } - if err := s.Err(); err != nil { - return nil, err - } - return bufs, nil -} - -var reDelim = regexp.MustCompile(`^\s*---+\s*$`) - -func readPart(s *bufio.Scanner) ([]byte, int, error) { - if !s.Scan() { - return nil, 0, s.Err() - } - buf := &bytes.Buffer{} - line := s.Bytes() - if reDelim.Match(line) { - // Return an empty slice because (*bytes.Buffer).Bytes() returns nil if we have never written data. 
- return []byte{}, 0, nil - } - _, err := buf.Write(line) - if err != nil { - return nil, 0, err - } - lineCount := 1 - for s.Scan() { - line := s.Bytes() - if reDelim.Match(line) { - return buf.Bytes(), lineCount, nil - } - _, err := buf.Write([]byte("\n")) - if err != nil { - return nil, 0, err - } - _, err = buf.Write(line) - if err != nil { - return nil, 0, err - } - lineCount++ - } - if err := s.Err(); err != nil { - return nil, 0, err - } - return buf.Bytes(), lineCount, nil -} - -type treeParser struct { - lineOffset int -} - -func (tp *treeParser) parseTree(src io.Reader) (*Tree, error) { - toks, err := NewTokenStream(src) - if err != nil { - return nil, err - } - gram := NewGrammar() - tb := NewDefaultSyntaxTreeBuilder() - p, err := NewParser(toks, gram, SemanticAction(NewASTActionSet(gram, tb))) - if err != nil { - return nil, err - } - err = p.Parse() - if err != nil { - return nil, err - } - synErrs := p.SyntaxErrors() - if len(synErrs) > 0 { - var b strings.Builder - b.Write(formatSyntaxError(synErrs[0], gram, tp.lineOffset)) - for _, synErr := range synErrs[1:] { - b.WriteRune('\n') - b.Write(formatSyntaxError(synErr, gram, tp.lineOffset)) - } - return nil, errors.New(b.String()) - } - t, err := tp.genTree(tb.Tree()) - if err != nil { - return nil, err - } - return t.Fill(), nil -} - -func formatSyntaxError(synErr *SyntaxError, gram Grammar, lineOffset int) []byte { - var b bytes.Buffer - - b.WriteString(fmt.Sprintf("%v:%v: %v: ", lineOffset+synErr.Row+1, synErr.Col+1, synErr.Message)) - - tok := synErr.Token - switch { - case tok.EOF(): - b.WriteString("<eof>") - case tok.Invalid(): - b.WriteString(fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme()))) - default: - if term := gram.Terminal(tok.TerminalID()); term != "" { - b.WriteString(fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), term)) - } else { - b.WriteString(fmt.Sprintf("'%v'", string(tok.Lexeme()))) - } - } - b.WriteString(fmt.Sprintf(": expected: %v", synErr.ExpectedTerminals[0])) - for _, 
t := range synErr.ExpectedTerminals[1:] { - b.WriteString(fmt.Sprintf(", %v", t)) - } - - return b.Bytes() -} - -func (tp *treeParser) genTree(node *Node) (*Tree, error) { - // A node labeled 'error' cannot have children. It always must be (error). - if sym := node.Children[0]; sym.Text == "error" { - if len(node.Children) > 1 { - return nil, fmt.Errorf("%v:%v: error node cannot take children", tp.lineOffset+sym.Row+1, sym.Col+1) - } - return NewTerminalNode(sym.Text, ""), nil - } - - if len(node.Children) == 2 && node.Children[1].KindName == "string" { - var text string - str := node.Children[1].Children[0] - switch str.KindName { - case "raw_string": - text = str.Children[0].Text - case "interpreted_string": - var b strings.Builder - for _, c := range str.Children { - switch c.KindName { - case "escaped_seq": - b.WriteString(strings.TrimPrefix(`\`, c.Text)) - case "escape_char": - return nil, fmt.Errorf("%v:%v: incomplete escape sequence", tp.lineOffset+c.Row+1, c.Col+1) - case "codepoint_expr": - cp := c.Children[0] - n, err := strconv.ParseInt(cp.Text, 16, 64) - if err != nil { - return nil, fmt.Errorf("%v:%v: %v", tp.lineOffset+cp.Row+1, cp.Col+1, err) - } - if !utf8.ValidRune(rune(n)) { - return nil, fmt.Errorf("%v:%v: invalid code point: %v", tp.lineOffset+cp.Row+1, cp.Col+1, cp.Text) - } - b.WriteRune(rune(n)) - default: - b.WriteString(c.Text) - } - } - text = b.String() - } - return NewTerminalNode(node.Children[0].Text, text), nil - } - - var children []*Tree - if len(node.Children) > 1 { - children = make([]*Tree, len(node.Children)-1) - for i, c := range node.Children[1:] { - var err error - children[i], err = tp.genTree(c) - if err != nil { - return nil, err - } - } - } - return NewNonTerminalTree(node.Children[0].Text, children...), nil -} diff --git a/src/urubu/spec/test/tree_parser.go b/src/urubu/spec/test/tree_parser.go deleted file mode 100644 index 528d259..0000000 --- a/src/urubu/spec/test/tree_parser.go +++ /dev/null @@ -1,716 +0,0 @@ -// Code 
generated by vartan-go. DO NOT EDIT. -package test - -import ( - "fmt" - "io" -) - -type Grammar interface { - // InitialState returns the initial state of a parser. - InitialState() int - - // StartProduction returns the start production of grammar. - StartProduction() int - - // Action returns an ACTION entry corresponding to a (state, terminal symbol) pair. - Action(state int, terminal int) int - - // GoTo returns a GOTO entry corresponding to a (state, non-terminal symbol) pair. - GoTo(state int, lhs int) int - - // ErrorTrapperState returns true when a state can shift the error symbol. - ErrorTrapperState(state int) bool - - // LHS returns a LHS symbol of a production. - LHS(prod int) int - - // AlternativeSymbolCount returns a symbol count of p production. - AlternativeSymbolCount(prod int) int - - // RecoverProduction returns true when a production has the recover directive. - RecoverProduction(prod int) bool - - // NonTerminal retuns a string representaion of a non-terminal symbol. - NonTerminal(nonTerminal int) string - - // TerminalCount returns a terminal symbol count of grammar. - TerminalCount() int - - // SkipTerminal returns true when a terminal symbol must be skipped on syntax analysis. - SkipTerminal(terminal int) bool - - // EOF returns the EOF symbol. - EOF() int - - // Error returns the error symbol. - Error() int - - // Terminal retuns a string representaion of a terminal symbol. - Terminal(terminal int) string - - // ASTAction returns an AST action entries. - ASTAction(prod int) []int -} - -type VToken interface { - // TerminalID returns a terminal ID. - TerminalID() int - - // Lexeme returns a lexeme. - Lexeme() []byte - - // EOF returns true when a token represents EOF. - EOF() bool - - // Invalid returns true when a token is invalid. - Invalid() bool - - // Position returns (row, column) pair. 
- Position() (int, int) -} - -type TokenStream interface { - Next() (VToken, error) -} - -type SyntaxError struct { - Row int - Col int - Message string - Token VToken - ExpectedTerminals []string -} - -type ParserOption func(p *Parser) error - -// DisableLAC disables LAC (lookahead correction). LAC is enabled by default. -func DisableLAC() ParserOption { - return func(p *Parser) error { - p.disableLAC = true - return nil - } -} - -func SemanticAction(semAct SemanticActionSet) ParserOption { - return func(p *Parser) error { - p.semAct = semAct - return nil - } -} - -type Parser struct { - toks TokenStream - gram Grammar - stateStack *stateStack - semAct SemanticActionSet - disableLAC bool - onError bool - shiftCount int - synErrs []*SyntaxError -} - -func NewParser(toks TokenStream, gram Grammar, opts ...ParserOption) (*Parser, error) { - p := &Parser{ - toks: toks, - gram: gram, - stateStack: &stateStack{}, - } - - for _, opt := range opts { - err := opt(p) - if err != nil { - return nil, err - } - } - - return p, nil -} - -func (p *Parser) Parse() error { - p.stateStack.push(p.gram.InitialState()) - tok, err := p.nextToken() - if err != nil { - return err - } - -ACTION_LOOP: - for { - act := p.lookupAction(tok) - - switch { - case act < 0: // Shift - nextState := act * -1 - - recovered := false - if p.onError { - p.shiftCount++ - - // When the parser performs shift three times, the parser recovers from the error state. 
- if p.shiftCount >= 3 { - p.onError = false - p.shiftCount = 0 - recovered = true - } - } - - p.shift(nextState) - - if p.semAct != nil { - p.semAct.Shift(tok, recovered) - } - - tok, err = p.nextToken() - if err != nil { - return err - } - case act > 0: // Reduce - prodNum := act - - recovered := false - if p.onError && p.gram.RecoverProduction(prodNum) { - p.onError = false - p.shiftCount = 0 - recovered = true - } - - accepted := p.reduce(prodNum) - if accepted { - if p.semAct != nil { - p.semAct.Accept() - } - - return nil - } - - if p.semAct != nil { - p.semAct.Reduce(prodNum, recovered) - } - default: // Error - if p.onError { - tok, err = p.nextToken() - if err != nil { - return err - } - if tok.EOF() { - if p.semAct != nil { - p.semAct.MissError(tok) - } - - return nil - } - - continue ACTION_LOOP - } - - row, col := tok.Position() - p.synErrs = append(p.synErrs, &SyntaxError{ - Row: row, - Col: col, - Message: "unexpected token", - Token: tok, - ExpectedTerminals: p.searchLookahead(p.stateStack.top()), - }) - - count, ok := p.trapError() - if !ok { - if p.semAct != nil { - p.semAct.MissError(tok) - } - - return nil - } - - p.onError = true - p.shiftCount = 0 - - act, err := p.lookupActionOnError() - if err != nil { - return err - } - - p.shift(act * -1) - - if p.semAct != nil { - p.semAct.TrapAndShiftError(tok, count) - } - } - } -} - -// validateLookahead validates whether `term` is a valid lookahead in the current context. When `term` is valid, -// this method returns `true`. 
-func (p *Parser) validateLookahead(term int) bool { - p.stateStack.enableExploratoryMode() - defer p.stateStack.disableExploratoryMode() - - for { - act := p.gram.Action(p.stateStack.topExploratorily(), term) - - switch { - case act < 0: // Shift - return true - case act > 0: // Reduce - prodNum := act - - lhs := p.gram.LHS(prodNum) - if lhs == p.gram.LHS(p.gram.StartProduction()) { - return true - } - n := p.gram.AlternativeSymbolCount(prodNum) - p.stateStack.popExploratorily(n) - state := p.gram.GoTo(p.stateStack.topExploratorily(), lhs) - p.stateStack.pushExploratorily(state) - default: // Error - return false - } - } -} - -func (p *Parser) nextToken() (VToken, error) { - for { - // We don't have to check whether the token is invalid because the kind ID of the invalid token is 0, - // and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect - // a syntax error because the parser cannot find an entry corresponding to the invalid token. - tok, err := p.toks.Next() - if err != nil { - return nil, err - } - - if p.gram.SkipTerminal(tok.TerminalID()) { - continue - } - - return tok, nil - } -} - -func (p *Parser) tokenToTerminal(tok VToken) int { - if tok.EOF() { - return p.gram.EOF() - } - - return tok.TerminalID() -} - -func (p *Parser) lookupAction(tok VToken) int { - if !p.disableLAC { - term := p.tokenToTerminal(tok) - if !p.validateLookahead(term) { - return 0 - } - } - - return p.gram.Action(p.stateStack.top(), p.tokenToTerminal(tok)) -} - -func (p *Parser) lookupActionOnError() (int, error) { - act := p.gram.Action(p.stateStack.top(), p.gram.Error()) - if act >= 0 { - return 0, fmt.Errorf("an entry must be a shift action by the error symbol; entry: %v, state: %v, symbol: %v", act, p.stateStack.top(), p.gram.Terminal(p.gram.Error())) - } - - return act, nil -} - -func (p *Parser) shift(nextState int) { - p.stateStack.push(nextState) -} - -func (p *Parser) reduce(prodNum int) bool { - lhs := p.gram.LHS(prodNum) - if lhs == 
p.gram.LHS(p.gram.StartProduction()) { - return true - } - n := p.gram.AlternativeSymbolCount(prodNum) - p.stateStack.pop(n) - nextState := p.gram.GoTo(p.stateStack.top(), lhs) - p.stateStack.push(nextState) - return false -} - -func (p *Parser) trapError() (int, bool) { - count := 0 - for { - if p.gram.ErrorTrapperState(p.stateStack.top()) { - return count, true - } - - if p.stateStack.top() != p.gram.InitialState() { - p.stateStack.pop(1) - count++ - } else { - return 0, false - } - } -} - -func (p *Parser) SyntaxErrors() []*SyntaxError { - return p.synErrs -} - -func (p *Parser) searchLookahead(state int) []string { - kinds := []string{} - termCount := p.gram.TerminalCount() - for term := 0; term < termCount; term++ { - if p.disableLAC { - if p.gram.Action(p.stateStack.top(), term) == 0 { - continue - } - } else { - if !p.validateLookahead(term) { - continue - } - } - - // We don't add the error symbol to the look-ahead symbols because users cannot input the error symbol - // intentionally. 
- if term == p.gram.Error() { - continue - } - - kinds = append(kinds, p.gram.Terminal(term)) - } - - return kinds -} - -type stateStack struct { - items []int - itemsExp []int -} - -func (s *stateStack) enableExploratoryMode() { - s.itemsExp = make([]int, len(s.items)) - copy(s.itemsExp, s.items) -} - -func (s *stateStack) disableExploratoryMode() { - s.itemsExp = nil -} - -func (s *stateStack) top() int { - return s.items[len(s.items)-1] -} - -func (s *stateStack) topExploratorily() int { - return s.itemsExp[len(s.itemsExp)-1] -} - -func (s *stateStack) push(state int) { - s.items = append(s.items, state) -} - -func (s *stateStack) pushExploratorily(state int) { - s.itemsExp = append(s.itemsExp, state) -} - -func (s *stateStack) pop(n int) { - s.items = s.items[:len(s.items)-n] -} - -func (s *stateStack) popExploratorily(n int) { - s.itemsExp = s.itemsExp[:len(s.itemsExp)-n] -} - -type grammarImpl struct { - recoverProductions []int - action []int - goTo []int - alternativeSymbolCounts []int - errorTrapperStates []int - nonTerminals []string - lhsSymbols []int - terminals []string - terminalSkip []int - astActions [][]int -} - -func NewGrammar() *grammarImpl { - return &grammarImpl{ - recoverProductions: []int{ - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - }, - action: []int{ - 0, 0, 0, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -3, 0, 0, 0, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, - -2, 7, 0, -11, 0, 0, -12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, - 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -14, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 12, 0, 0, 12, 12, 0, 0, -17, 12, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -22, 0, 16, 16, 0, 0, 0, 0, 0, -23, - -24, -25, -26, -27, -28, -29, 16, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -30, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -23, -24, -25, -26, -27, -28, -29, 15, - 0, 18, 0, 0, 18, 18, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 0, - 25, 0, 0, 25, 25, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -33, 0, 19, 0, - 0, 19, 19, 0, 0, 0, 0, 0, 19, 19, 19, 19, 19, 19, 19, 19, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, -34, 0, 0, 0, 0, 0, 0, 20, 0, 0, 20, - 20, 0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 20, 20, 20, 0, 21, 0, 0, 21, 21, - 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 21, 21, 0, 22, 0, 0, 22, 22, 0, - 0, 0, 0, 0, 22, 22, 22, 22, 22, 22, 22, 22, 0, 23, 0, 0, 23, 23, 0, 0, - 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23, 0, 24, 0, 0, 24, 24, 0, 0, 0, - 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 0, 10, 0, 0, 10, 10, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 13, 13, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 17, 17, 0, 0, 0, 0, 0, 17, - 17, 17, 17, 17, 17, 17, 17, 0, 14, 0, 0, 14, 14, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, -35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -36, - 0, 0, 0, 0, 0, 26, 0, 0, 26, 26, 0, 0, 0, 0, 0, 26, 26, 26, 26, 26, - 26, 26, 26, - }, - goTo: []int{ - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 9, 0, 10, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 
13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 19, 20, 21, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 21, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, - }, - alternativeSymbolCounts: []int{ - 0, 1, 4, 4, 3, 2, 1, 0, 1, 1, 3, 1, 0, 3, 3, 1, 0, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 4, - }, - errorTrapperStates: []int{ - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - nonTerminals: []string{ - "", - "tree'", - "tree", - "tree_list", - "string", - "raw_string", - "opt_raw_string_body", - "interpreted_string", - "opt_interpreted_string_body", - "interpreted_string_body", - "interpreted_string_elem", - "codepoint_expr", - }, - lhsSymbols: []int{ - 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 10, 10, 10, 10, 10, 10, 11, - }, - terminals: []string{ - "", - "<eof>", - "error", - "ws", - "l_paren", - "r_paren", - "identifier", - "raw_string_open", - "raw_string_body", - "raw_string_close", - "interpreted_string_open", - "interpreted_seq", - 
"codepoint_prefix", - "l_brace", - "r_brace", - "hex_digits", - "escaped_seq", - "escape_char", - "interpreted_string_close", - }, - terminalSkip: []int{ - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }, - astActions: [][]int{ - nil, - nil, - { - 2, -3, - }, - { - 2, 3, - }, - { - 2, - }, - { - -1, 2, - }, - nil, - nil, - nil, - nil, - { - -2, - }, - nil, - nil, - { - -2, - }, - { - 2, - }, - { - -1, - }, - nil, - { - -1, -2, - }, - { - -1, - }, - nil, - nil, - nil, - nil, - nil, - nil, - nil, - { - 3, - }, - }, - } -} - -func (g *grammarImpl) InitialState() int { - return 0 -} - -func (g *grammarImpl) StartProduction() int { - return 1 -} - -func (g *grammarImpl) RecoverProduction(prod int) bool { - return g.recoverProductions[prod] != 0 -} - -func (g *grammarImpl) Action(state int, terminal int) int { - return g.action[state*19+terminal] -} - -func (g *grammarImpl) GoTo(state int, lhs int) int { - return g.goTo[state*12+lhs] -} - -func (g *grammarImpl) AlternativeSymbolCount(prod int) int { - return g.alternativeSymbolCounts[prod] -} - -func (g *grammarImpl) TerminalCount() int { - return 19 -} - -func (g *grammarImpl) SkipTerminal(terminal int) bool { - return g.terminalSkip[terminal] == 1 -} - -func (g *grammarImpl) ErrorTrapperState(state int) bool { - return g.errorTrapperStates[state] != 0 -} - -func (g *grammarImpl) NonTerminal(nonTerminal int) string { - return g.nonTerminals[nonTerminal] -} - -func (g *grammarImpl) LHS(prod int) int { - return g.lhsSymbols[prod] -} - -func (g *grammarImpl) EOF() int { - return 1 -} - -func (g *grammarImpl) Error() int { - return 2 -} - -func (g *grammarImpl) Terminal(terminal int) string { - return g.terminals[terminal] -} - -func (g *grammarImpl) ASTAction(prod int) []int { - return g.astActions[prod] -} - -type vToken struct { - terminalID int - tok *Token -} - -func (t *vToken) TerminalID() int { - return t.terminalID -} - -func (t *vToken) Lexeme() []byte { - return t.tok.Lexeme -} - -func (t *vToken) 
EOF() bool { - return t.tok.EOF -} - -func (t *vToken) Invalid() bool { - return t.tok.Invalid -} - -func (t *vToken) Position() (int, int) { - return t.tok.Row, t.tok.Col -} - -var kindToTerminal = []int{ - 0, 3, 4, 5, 6, 7, 10, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, -} - -type tokenStream struct { - lex *Lexer - kindToTerminal []int -} - -func NewTokenStream(src io.Reader) (*tokenStream, error) { - lex, err := NewLexer(NewLexSpec(), src) - if err != nil { - return nil, err - } - - return &tokenStream{ - lex: lex, - }, nil -} - -func (t *tokenStream) Next() (VToken, error) { - tok, err := t.lex.Next() - if err != nil { - return nil, err - } - return &vToken{ - terminalID: kindToTerminal[tok.KindID], - tok: tok, - }, nil -} diff --git a/src/urubu/spec/test/tree_semantic_action.go b/src/urubu/spec/test/tree_semantic_action.go deleted file mode 100644 index c1d5a25..0000000 --- a/src/urubu/spec/test/tree_semantic_action.go +++ /dev/null @@ -1,367 +0,0 @@ -// Code generated by vartan-go. DO NOT EDIT. -package test - -import ( - "encoding/json" - "fmt" - "io" - "strconv" -) - -// SemanticActionSet is a set of semantic actions a parser calls. -type SemanticActionSet interface { - // Shift runs when the parser shifts a symbol onto a state stack. `tok` is a token corresponding to the symbol. - // When the parser recovered from an error state by shifting the token, `recovered` is true. - Shift(tok VToken, recovered bool) - - // Reduce runs when the parser reduces an RHS of a production to its LHS. `prodNum` is a number of the production. - // When the parser recovered from an error state by reducing the production, `recovered` is true. - Reduce(prodNum int, recovered bool) - - // Accept runs when the parser accepts an input. - Accept() - - // TrapAndShiftError runs when the parser traps a syntax error and shifts a error symbol onto the state stack. - // `cause` is a token that caused a syntax error. 
`popped` is the number of frames that the parser discards - // from the state stack. - // Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token - // corresponding to the error symbol doesn't exist. - TrapAndShiftError(cause VToken, popped int) - - // MissError runs when the parser fails to trap a syntax error. `cause` is a token that caused a syntax error. - MissError(cause VToken) -} - -var _ SemanticActionSet = &SyntaxTreeActionSet{} - -// SyntaxTreeNode is a node of a syntax tree. A node type used in SyntaxTreeActionSet must implement SyntaxTreeNode interface. -type SyntaxTreeNode interface { - // ChildCount returns a child count of a node. A parser calls this method to know the child count to be expanded by an `#ast` - // directive with `...` operator. - ChildCount() int - - // ExpandChildren returns children of a node. A parser calls this method to fetch the children to be expanded by an `#ast` - // directive with `...` operator. - ExpandChildren() []SyntaxTreeNode -} - -var _ SyntaxTreeNode = &Node{} - -// SyntaxTreeBuilder allows you to construct a syntax tree containing arbitrary user-defined node types. -// The parser uses SyntaxTreeBuilder interface as a part of semantic actions via SyntaxTreeActionSet interface. -type SyntaxTreeBuilder interface { - Shift(kindName string, text string, row, col int) SyntaxTreeNode - ShiftError(kindName string) SyntaxTreeNode - Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode - Accept(f SyntaxTreeNode) -} - -var _ SyntaxTreeBuilder = &DefaulSyntaxTreeBuilder{} - -// DefaulSyntaxTreeBuilder is a implementation of SyntaxTreeBuilder. -type DefaulSyntaxTreeBuilder struct { - tree *Node -} - -// NewDefaultSyntaxTreeBuilder returns a new DefaultSyntaxTreeBuilder. -func NewDefaultSyntaxTreeBuilder() *DefaulSyntaxTreeBuilder { - return &DefaulSyntaxTreeBuilder{} -} - -// Shift is a implementation of SyntaxTreeBuilder.Shift. 
-func (b *DefaulSyntaxTreeBuilder) Shift(kindName string, text string, row, col int) SyntaxTreeNode { - return &Node{ - Type: NodeTypeTerminal, - KindName: kindName, - Text: text, - Row: row, - Col: col, - } -} - -// ShiftError is a implementation of SyntaxTreeBuilder.ShiftError. -func (b *DefaulSyntaxTreeBuilder) ShiftError(kindName string) SyntaxTreeNode { - return &Node{ - Type: NodeTypeError, - KindName: kindName, - } -} - -// Reduce is a implementation of SyntaxTreeBuilder.Reduce. -func (b *DefaulSyntaxTreeBuilder) Reduce(kindName string, children []SyntaxTreeNode) SyntaxTreeNode { - cNodes := make([]*Node, len(children)) - for i, c := range children { - cNodes[i] = c.(*Node) - } - return &Node{ - Type: NodeTypeNonTerminal, - KindName: kindName, - Children: cNodes, - } -} - -// Accept is a implementation of SyntaxTreeBuilder.Accept. -func (b *DefaulSyntaxTreeBuilder) Accept(f SyntaxTreeNode) { - b.tree = f.(*Node) -} - -// Tree returns a syntax tree when the parser has accepted an input. If a syntax error occurs, the return value is nil. -func (b *DefaulSyntaxTreeBuilder) Tree() *Node { - return b.tree -} - -// SyntaxTreeActionSet is a implementation of SemanticActionSet interface and constructs a syntax tree. -type SyntaxTreeActionSet struct { - gram Grammar - builder SyntaxTreeBuilder - semStack *semanticStack - disableASTAction bool -} - -// NewASTActionSet returns a new SyntaxTreeActionSet that constructs an AST (Abstract Syntax Tree). -// When grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns interprets them. -func NewASTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { - return &SyntaxTreeActionSet{ - gram: gram, - builder: builder, - semStack: newSemanticStack(), - } -} - -// NewCSTTActionSet returns a new SyntaxTreeActionSet that constructs a CST (Concrete Syntax Tree). -// Even if grammar `gram` contains `#ast` directives, the new SyntaxTreeActionSet this function returns ignores them. 
-func NewCSTActionSet(gram Grammar, builder SyntaxTreeBuilder) *SyntaxTreeActionSet { - return &SyntaxTreeActionSet{ - gram: gram, - builder: builder, - semStack: newSemanticStack(), - disableASTAction: true, - } -} - -// Shift is a implementation of SemanticActionSet.Shift method. -func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) { - term := a.tokenToTerminal(tok) - row, col := tok.Position() - a.semStack.push(a.builder.Shift(a.gram.Terminal(term), string(tok.Lexeme()), row, col)) -} - -// Reduce is a implementation of SemanticActionSet.Reduce method. -func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) { - lhs := a.gram.LHS(prodNum) - - // When an alternative is empty, `n` will be 0, and `handle` will be empty slice. - n := a.gram.AlternativeSymbolCount(prodNum) - handle := a.semStack.pop(n) - - var astAct []int - if !a.disableASTAction { - astAct = a.gram.ASTAction(prodNum) - } - var children []SyntaxTreeNode - if astAct != nil { - // Count the number of children in advance to avoid frequent growth in a slice for children. - { - l := 0 - for _, e := range astAct { - if e > 0 { - l++ - } else { - offset := e*-1 - 1 - l += handle[offset].ChildCount() - } - } - - children = make([]SyntaxTreeNode, l) - } - - p := 0 - for _, e := range astAct { - if e > 0 { - offset := e - 1 - children[p] = handle[offset] - p++ - } else { - offset := e*-1 - 1 - for _, c := range handle[offset].ExpandChildren() { - children[p] = c - p++ - } - } - } - } else { - // If an alternative has no AST action, a driver generates - // a node with the same structure as a CST. - children = handle - } - - a.semStack.push(a.builder.Reduce(a.gram.NonTerminal(lhs), children)) -} - -// Accept is a implementation of SemanticActionSet.Accept method. -func (a *SyntaxTreeActionSet) Accept() { - top := a.semStack.pop(1) - a.builder.Accept(top[0]) -} - -// TrapAndShiftError is a implementation of SemanticActionSet.TrapAndShiftError method. 
-func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) { - a.semStack.pop(popped) - a.semStack.push(a.builder.ShiftError(a.gram.Terminal(a.gram.Error()))) -} - -// MissError is a implementation of SemanticActionSet.MissError method. -func (a *SyntaxTreeActionSet) MissError(cause VToken) { -} - -func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int { - if tok.EOF() { - return a.gram.EOF() - } - - return tok.TerminalID() -} - -type semanticStack struct { - frames []SyntaxTreeNode -} - -func newSemanticStack() *semanticStack { - return &semanticStack{ - frames: make([]SyntaxTreeNode, 0, 100), - } -} - -func (s *semanticStack) push(f SyntaxTreeNode) { - s.frames = append(s.frames, f) -} - -func (s *semanticStack) pop(n int) []SyntaxTreeNode { - fs := s.frames[len(s.frames)-n:] - s.frames = s.frames[:len(s.frames)-n] - - return fs -} - -type NodeType int - -const ( - NodeTypeError = 0 - NodeTypeTerminal = 1 - NodeTypeNonTerminal = 2 -) - -// Node is a implementation of SyntaxTreeNode interface. 
-type Node struct { - Type NodeType - KindName string - Text string - Row int - Col int - Children []*Node -} - -func (n *Node) MarshalJSON() ([]byte, error) { - switch n.Type { - case NodeTypeError: - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - }{ - Type: n.Type, - KindName: n.KindName, - }) - case NodeTypeTerminal: - if n.KindName == "" { - return json.Marshal(struct { - Type NodeType `json:"type"` - Text string `json:"text"` - Row int `json:"row"` - Col int `json:"col"` - }{ - Type: n.Type, - Text: n.Text, - Row: n.Row, - Col: n.Col, - }) - } - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - Text string `json:"text"` - Row int `json:"row"` - Col int `json:"col"` - }{ - Type: n.Type, - KindName: n.KindName, - Text: n.Text, - Row: n.Row, - Col: n.Col, - }) - case NodeTypeNonTerminal: - return json.Marshal(struct { - Type NodeType `json:"type"` - KindName string `json:"kind_name"` - Children []*Node `json:"children"` - }{ - Type: n.Type, - KindName: n.KindName, - Children: n.Children, - }) - default: - return nil, fmt.Errorf("invalid node type: %v", n.Type) - } -} - -// ChildCount is a implementation of SyntaxTreeNode.ChildCount. -func (n *Node) ChildCount() int { - return len(n.Children) -} - -// ExpandChildren is a implementation of SyntaxTreeNode.ExpandChildren. -func (n *Node) ExpandChildren() []SyntaxTreeNode { - fs := make([]SyntaxTreeNode, len(n.Children)) - for i, n := range n.Children { - fs[i] = n - } - return fs -} - -// PrintTree prints a syntax tree whose root is `node`. 
-func PrintTree(w io.Writer, node *Node) { - printTree(w, node, "", "") -} - -func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix string) { - if node == nil { - return - } - - switch node.Type { - case NodeTypeError: - fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) - case NodeTypeTerminal: - fmt.Fprintf(w, "%v%v %v\n", ruledLine, node.KindName, strconv.Quote(node.Text)) - case NodeTypeNonTerminal: - fmt.Fprintf(w, "%v%v\n", ruledLine, node.KindName) - - num := len(node.Children) - for i, child := range node.Children { - var line string - if num > 1 && i < num-1 { - line = "├─ " - } else { - line = "└─ " - } - - var prefix string - if i >= num-1 { - prefix = " " - } else { - prefix = "│ " - } - - printTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } - } -} diff --git a/src/urubu/spec/test/tree-report.json b/src/urubu/spec/tree-report.json index c2018e5..c2018e5 100644 --- a/src/urubu/spec/test/tree-report.json +++ b/src/urubu/spec/tree-report.json diff --git a/src/urubu/spec/test/tree.json b/src/urubu/spec/tree.json index f05c2f2..f05c2f2 100644 --- a/src/urubu/spec/test/tree.json +++ b/src/urubu/spec/tree.json diff --git a/src/urubu/spec/test/tree.vartan b/src/urubu/spec/tree.vartan index aa8f733..aa8f733 100644 --- a/src/urubu/spec/test/tree.vartan +++ b/src/urubu/spec/tree.vartan diff --git a/src/urubu/tester/tester.go b/src/urubu/tester.go index cae52b2..cae52b2 100644 --- a/src/urubu/tester/tester.go +++ b/src/urubu/tester.go diff --git a/src/urubu/ucd/codepoint.go b/src/urubu/ucd.go index e9b411e..3c3da17 100644 --- a/src/urubu/ucd/codepoint.go +++ b/src/urubu/ucd.go @@ -1,7 +1,190 @@ -// Code generated by generator/main.go; DO NOT EDIT. 
+//go:generate go run ../cmd/ucdgen/main.go +//go:generate go fmt codepoint.go package ucd +import ( + "bufio" + "encoding/binary" + "encoding/hex" + "fmt" + "io" + "regexp" + "strings" +) + +const ( + // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf + // 3.4 Characters and Encoding + // > D9 Unicode codespace: A range of integers from 0 to 10FFFF16. + codePointMin = 0x0 + codePointMax = 0x10FFFF +) + +func NormalizeCharacterProperty(propName, propVal string) (string, error) { + if propName == "" { + propName = "gc" + } + + name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)] + if !ok { + return "", fmt.Errorf("unsupported character property name: %v", propName) + } + props, ok := derivedCoreProperties[name] + if !ok { + return "", nil + } + var b strings.Builder + yes, ok := binaryValues[normalizeSymbolicValue(propVal)] + if !ok { + return "", fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + fmt.Fprint(&b, "[") + } else { + fmt.Fprint(&b, "[^") + } + for _, prop := range props { + fmt.Fprint(&b, prop) + } + fmt.Fprint(&b, "]") + + return b.String(), nil +} + +func IsContributoryProperty(propName string) bool { + if propName == "" { + return false + } + + for _, p := range contributoryProperties { + if propName == p { + return true + } + } + return false +} + +func FindCodePointRanges(propName, propVal string) ([]*CodePointRange, bool, error) { + if propName == "" { + propName = "gc" + } + + name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property name: %v", propName) + } + switch name { + case "gc": + val, ok := generalCategoryValueAbbs[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if val == generalCategoryValueAbbs[normalizeSymbolicValue(generalCategoryDefaultValue)] { + var allCPs []*CodePointRange + if generalCategoryDefaultRange.From > 
codePointMin { + allCPs = append(allCPs, &CodePointRange{ + From: codePointMin, + To: generalCategoryDefaultRange.From - 1, + }) + } + if generalCategoryDefaultRange.To < codePointMax { + allCPs = append(allCPs, &CodePointRange{ + From: generalCategoryDefaultRange.To + 1, + To: codePointMax, + }) + } + for _, cp := range generalCategoryCodePoints { + allCPs = append(allCPs, cp...) + } + return allCPs, true, nil + } + vals, ok := compositGeneralCategories[val] + if !ok { + vals = []string{val} + } + var ranges []*CodePointRange + for _, v := range vals { + rs, ok := generalCategoryCodePoints[v] + if !ok { + return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v) + } + ranges = append(ranges, rs...) + } + return ranges, false, nil + case "sc": + val, ok := scriptValueAbbs[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if val == scriptValueAbbs[normalizeSymbolicValue(scriptDefaultValue)] { + var allCPs []*CodePointRange + if scriptDefaultRange.From > codePointMin { + allCPs = append(allCPs, &CodePointRange{ + From: codePointMin, + To: scriptDefaultRange.From - 1, + }) + } + if scriptDefaultRange.To < codePointMax { + allCPs = append(allCPs, &CodePointRange{ + From: scriptDefaultRange.To + 1, + To: codePointMax, + }) + } + for _, cp := range scriptCodepoints { + allCPs = append(allCPs, cp...) 
+ } + return allCPs, true, nil + } + return scriptCodepoints[val], false, nil + case "oalpha": + yes, ok := binaryValues[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + return otherAlphabeticCodePoints, false, nil + } else { + return otherAlphabeticCodePoints, true, nil + } + case "olower": + yes, ok := binaryValues[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + return otherLowercaseCodePoints, false, nil + } else { + return otherLowercaseCodePoints, true, nil + } + case "oupper": + yes, ok := binaryValues[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + return otherUppercaseCodePoints, false, nil + } else { + return otherUppercaseCodePoints, true, nil + } + case "wspace": + yes, ok := binaryValues[normalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + return whiteSpaceCodePoints, false, nil + } else { + return whiteSpaceCodePoints, true, nil + } + } + + // If the process reaches this code, it's a bug. We must handle all of the properties registered with + // the `propertyNameAbbs`. + return nil, false, fmt.Errorf("character property '%v' is unavailable", propName) +} +// Code generated by generator/main.go; DO NOT EDIT. 
+ // https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt var generalCategoryValueAbbs = map[string]string{ "c": "c", @@ -6550,3 +6733,467 @@ var whiteSpaceCodePoints = []*CodePointRange{ &CodePointRange{From: rune(8287), To: rune(8287)}, &CodePointRange{From: rune(12288), To: rune(12288)}, } + +type CodePointRange struct { + From rune + To rune +} + +var codePointRangeNil = &CodePointRange{ + From: 0, + To: 0, +} + +type field string + +func (f field) codePointRange() (*CodePointRange, error) { + var from, to rune + var err error + cp := reCodePointRange.FindStringSubmatch(string(f)) + from, err = decodeHexToRune(cp[1]) + if err != nil { + return codePointRangeNil, err + } + if cp[2] != "" { + to, err = decodeHexToRune(cp[2]) + if err != nil { + return codePointRangeNil, err + } + } else { + to = from + } + return &CodePointRange{ + From: from, + To: to, + }, nil +} + +func decodeHexToRune(hexCodePoint string) (rune, error) { + h := hexCodePoint + if len(h)%2 != 0 { + h = "0" + h + } + b, err := hex.DecodeString(h) + if err != nil { + return 0, err + } + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) + } + n := binary.BigEndian.Uint32(b) + return rune(n), nil +} + +func (f field) symbol() string { + return string(f) +} + +func (f field) normalizedSymbol() string { + return normalizeSymbolicValue(string(f)) +} + +var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") + +// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3. 
+// +// https://www.unicode.org/reports/tr44/#UAX44-LM3 +func normalizeSymbolicValue(s string) string { + v := strings.ToLower(symValReplacer.Replace(s)) + if strings.HasPrefix(v, "is") && v != "is" { + return v[2:] + } + return v +} + +var ( + reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) + reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) + + specialCommentPrefix = "# @missing:" +) + +// This parser can parse data files of Unicode Character Database (UCD). +// Specifically, it has the following two functions: +// - Converts each line of the data files into a slice of fields. +// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields. +// +// However, for practical purposes, each field needs to be analyzed more specifically. +// For instance, in UnicodeData.txt, the first field represents a range of code points, +// so it needs to be recognized as a hexadecimal string. +// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser. 
+// +// https://www.unicode.org/reports/tr44/#Format_Conventions +type parser struct { + scanner *bufio.Scanner + fields []field + defaultFields []field + err error + + fieldBuf []field + defaultFieldBuf []field +} + +func newParser(r io.Reader) *parser { + return &parser{ + scanner: bufio.NewScanner(r), + fieldBuf: make([]field, 50), + defaultFieldBuf: make([]field, 50), + } +} + +func (p *parser) parse() bool { + for p.scanner.Scan() { + p.parseRecord(p.scanner.Text()) + if p.fields != nil || p.defaultFields != nil { + return true + } + } + p.err = p.scanner.Err() + return false +} + +func (p *parser) parseRecord(src string) { + ms := reLine.FindStringSubmatch(src) + mFields := ms[1] + mComment := ms[2] + if mFields != "" { + p.fields = parseFields(p.fieldBuf, mFields) + } else { + p.fields = nil + } + if strings.HasPrefix(mComment, specialCommentPrefix) { + p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1)) + } else { + p.defaultFields = nil + } +} + +func parseFields(buf []field, src string) []field { + n := 0 + for _, f := range strings.Split(src, ";") { + buf[n] = field(strings.TrimSpace(f)) + n++ + } + + return buf[:n] +} + +// contributoryProperties is a set of contributory properties vartan uses internally. +// Property statuses are defined in the following table. 
+// +// https://unicode.org/reports/tr44/#Property_List_Table +var contributoryProperties = []string{ + "oalpha", + "olower", + "oupper", +} + +func ContributoryProperties() []string { + return contributoryProperties +} + +// https://www.unicode.org/reports/tr44/#GC_Values_Table +var compositGeneralCategories = map[string][]string{ + // Cased_Letter + "lc": {"lu", "ll", "lt"}, + // Letter + "l": {"lu", "ll", "lt", "lm", "lo"}, + // Mark + "m": {"mm", "mc", "me"}, + // Number + "n": {"nd", "nl", "no"}, + // Punctuation + "p": {"pc", "pd", "ps", "pi", "pe", "pf", "po"}, + // Symbol + "s": {"sm", "sc", "sk", "so"}, + // Separator + "z": {"zs", "zl", "zp"}, + // Other + "c": {"cc", "cf", "cs", "co", "cn"}, +} + +// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt +var derivedCoreProperties = map[string][]string{ + // Alphabetic + "alpha": { + `\p{Lowercase=yes}`, + `\p{Uppercase=yes}`, + `\p{Lt}`, + `\p{Lm}`, + `\p{Lo}`, + `\p{Nl}`, + `\p{Other_Alphabetic=yes}`, + }, + // Lowercase + "lower": { + `\p{Ll}`, + `\p{Other_Lowercase=yes}`, + }, + // Uppercase + "upper": { + `\p{Lu}`, + `\p{Other_Uppercase=yes}`, + }, +} + +// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt +var propertyNameAbbs = map[string]string{ + "generalcategory": "gc", + "gc": "gc", + "script": "sc", + "sc": "sc", + "alphabetic": "alpha", + "alpha": "alpha", + "otheralphabetic": "oalpha", + "oalpha": "oalpha", + "lowercase": "lower", + "lower": "lower", + "uppercase": "upper", + "upper": "upper", + "otherlowercase": "olower", + "olower": "olower", + "otheruppercase": "oupper", + "oupper": "oupper", + "whitespace": "wspace", + "wspace": "wspace", + "space": "wspace", +} + +// https://www.unicode.org/reports/tr44/#Type_Key_Table +// https://www.unicode.org/reports/tr44/#Binary_Values_Table +var binaryValues = map[string]bool{ + "yes": true, + "y": true, + "true": true, + "t": true, + "no": false, + "n": false, + "false": false, + "f": false, +} + +type 
PropertyValueAliases struct { + GeneralCategory map[string]string + GeneralCategoryDefaultRange *CodePointRange + GeneralCategoryDefaultValue string + + Script map[string]string +} + +// ParsePropertyValueAliases parses the PropertyValueAliases.txt. +func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) { + gcAbbs := map[string]string{} + var defaultGCCPRange *CodePointRange + var defaultGCVal string + scAbbs := map[string]string{} + p := newParser(r) + for p.parse() { + // https://www.unicode.org/reports/tr44/#Property_Value_Aliases + // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property, + // > the second field specifies an abbreviated symbolic name for a value of that property, and the third + // > field specifies the long symbolic name for that value of that property. These are the preferred + // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent + // > fields. + if len(p.fields) > 0 { + switch p.fields[0].symbol() { + case "gc": + gcShort := p.fields[1].normalizedSymbol() + gcLong := p.fields[2].normalizedSymbol() + gcAbbs[gcShort] = gcShort + gcAbbs[gcLong] = gcShort + for _, f := range p.fields[3:] { + gcShortOther := f.normalizedSymbol() + gcAbbs[gcShortOther] = gcShort + } + case "sc": + scShort := p.fields[1].normalizedSymbol() + scLong := p.fields[2].normalizedSymbol() + scAbbs[scShort] = scShort + scAbbs[scLong] = scShort + for _, f := range p.fields[3:] { + scShortOther := f.normalizedSymbol() + scAbbs[scShortOther] = scShort + } + } + } + + // https://www.unicode.org/reports/tr44/#Missing_Conventions + // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt. + // > ... + // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below: + // > 1. code_point_range; default_prop_val + // > 2. code_point_range; property_name; default_prop_val + // > ... 
+ // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which + // > contain values associated with many properties. For example: + // > # @missing: 0000..10FFFF; NFD_QC; Yes + if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" { + var err error + defaultGCCPRange, err = p.defaultFields[0].codePointRange() + if err != nil { + return nil, err + } + defaultGCVal = p.defaultFields[2].normalizedSymbol() + } + } + if p.err != nil { + return nil, p.err + } + return &PropertyValueAliases{ + GeneralCategory: gcAbbs, + GeneralCategoryDefaultRange: defaultGCCPRange, + GeneralCategoryDefaultValue: defaultGCVal, + Script: scAbbs, + }, nil +} + +func (a *PropertyValueAliases) gcAbb(gc string) string { + return a.GeneralCategory[gc] +} + +type PropList struct { + OtherAlphabetic []*CodePointRange + OtherLowercase []*CodePointRange + OtherUppercase []*CodePointRange + WhiteSpace []*CodePointRange +} + +// ParsePropList parses the PropList.txt. +func ParsePropList(r io.Reader) (*PropList, error) { + var oa []*CodePointRange + var ol []*CodePointRange + var ou []*CodePointRange + var ws []*CodePointRange + p := newParser(r) + for p.parse() { + if len(p.fields) == 0 { + continue + } + + cp, err := p.fields[0].codePointRange() + if err != nil { + return nil, err + } + + switch p.fields[1].symbol() { + case "Other_Alphabetic": + oa = append(oa, cp) + case "Other_Lowercase": + ol = append(ol, cp) + case "Other_Uppercase": + ou = append(ou, cp) + case "White_Space": + ws = append(ws, cp) + } + } + if p.err != nil { + return nil, p.err + } + + return &PropList{ + OtherAlphabetic: oa, + OtherLowercase: ol, + OtherUppercase: ou, + WhiteSpace: ws, + }, nil +} + +type Scripts struct { + Script map[string][]*CodePointRange + ScriptDefaultRange *CodePointRange + ScriptDefaultValue string +} + +// ParseScripts parses the Scripts.txt. 
+func ParseScripts(r io.Reader, propValAliases *PropertyValueAliases) (*Scripts, error) { + ss := map[string][]*CodePointRange{} + var defaultRange *CodePointRange + var defaultValue string + p := newParser(r) + for p.parse() { + if len(p.fields) > 0 { + cp, err := p.fields[0].codePointRange() + if err != nil { + return nil, err + } + + name, ok := propValAliases.Script[p.fields[1].normalizedSymbol()] + if !ok { + return nil, fmt.Errorf("unknown property: %v", p.fields[1].symbol()) + } + ss[name] = append(ss[name], cp) + } + + if len(p.defaultFields) > 0 { + var err error + defaultRange, err = p.defaultFields[0].codePointRange() + if err != nil { + return nil, err + } + defaultValue = p.defaultFields[1].normalizedSymbol() + } + } + if p.err != nil { + return nil, p.err + } + + return &Scripts{ + Script: ss, + ScriptDefaultRange: defaultRange, + ScriptDefaultValue: defaultValue, + }, nil +} + +type UnicodeData struct { + GeneralCategory map[string][]*CodePointRange + + propValAliases *PropertyValueAliases +} + +// ParseUnicodeData parses the UnicodeData.txt. +func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { + unicodeData := &UnicodeData{ + GeneralCategory: map[string][]*CodePointRange{}, + propValAliases: propValAliases, + } + + p := newParser(r) + for p.parse() { + if len(p.fields) == 0 { + continue + } + cp, err := p.fields[0].codePointRange() + if err != nil { + return nil, err + } + gc := p.fields[2].normalizedSymbol() + unicodeData.addGC(gc, cp) + } + if p.err != nil { + return nil, p.err + } + + return unicodeData, nil +} + +func (u *UnicodeData) addGC(gc string, cp *CodePointRange) { + // https://www.unicode.org/reports/tr44/#Empty_Fields + // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line + // > for a code point is empty, that indicates that the property takes the default value for that code point. 
+ if gc == "" { + return + } + + cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)] + if ok { + c := cps[len(cps)-1] + if cp.From-c.To == 1 { + c.To = cp.To + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp) + } + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp} + } +} diff --git a/src/urubu/ucd/api.go b/src/urubu/ucd/api.go deleted file mode 100644 index 8265d54..0000000 --- a/src/urubu/ucd/api.go +++ /dev/null @@ -1,180 +0,0 @@ -//go:generate go run ../cmd/ucdgen/main.go -//go:generate go fmt codepoint.go - -package ucd - -import ( - "fmt" - "strings" -) - -const ( - // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf - // 3.4 Characters and Encoding - // > D9 Unicode codespace: A range of integers from 0 to 10FFFF16. - codePointMin = 0x0 - codePointMax = 0x10FFFF -) - -func NormalizeCharacterProperty(propName, propVal string) (string, error) { - if propName == "" { - propName = "gc" - } - - name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)] - if !ok { - return "", fmt.Errorf("unsupported character property name: %v", propName) - } - props, ok := derivedCoreProperties[name] - if !ok { - return "", nil - } - var b strings.Builder - yes, ok := binaryValues[normalizeSymbolicValue(propVal)] - if !ok { - return "", fmt.Errorf("unsupported character property value: %v", propVal) - } - if yes { - fmt.Fprint(&b, "[") - } else { - fmt.Fprint(&b, "[^") - } - for _, prop := range props { - fmt.Fprint(&b, prop) - } - fmt.Fprint(&b, "]") - - return b.String(), nil -} - -func IsContributoryProperty(propName string) bool { - if propName == "" { - return false - } - - for _, p := range contributoryProperties { - if propName == p { - return true - } - } - return false -} - -func FindCodePointRanges(propName, propVal string) ([]*CodePointRange, bool, error) { - if propName == "" { - propName = "gc" - } - - name, ok := propertyNameAbbs[normalizeSymbolicValue(propName)] - if !ok { - return nil, false, 
fmt.Errorf("unsupported character property name: %v", propName) - } - switch name { - case "gc": - val, ok := generalCategoryValueAbbs[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if val == generalCategoryValueAbbs[normalizeSymbolicValue(generalCategoryDefaultValue)] { - var allCPs []*CodePointRange - if generalCategoryDefaultRange.From > codePointMin { - allCPs = append(allCPs, &CodePointRange{ - From: codePointMin, - To: generalCategoryDefaultRange.From - 1, - }) - } - if generalCategoryDefaultRange.To < codePointMax { - allCPs = append(allCPs, &CodePointRange{ - From: generalCategoryDefaultRange.To + 1, - To: codePointMax, - }) - } - for _, cp := range generalCategoryCodePoints { - allCPs = append(allCPs, cp...) - } - return allCPs, true, nil - } - vals, ok := compositGeneralCategories[val] - if !ok { - vals = []string{val} - } - var ranges []*CodePointRange - for _, v := range vals { - rs, ok := generalCategoryCodePoints[v] - if !ok { - return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v) - } - ranges = append(ranges, rs...) - } - return ranges, false, nil - case "sc": - val, ok := scriptValueAbbs[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if val == scriptValueAbbs[normalizeSymbolicValue(scriptDefaultValue)] { - var allCPs []*CodePointRange - if scriptDefaultRange.From > codePointMin { - allCPs = append(allCPs, &CodePointRange{ - From: codePointMin, - To: scriptDefaultRange.From - 1, - }) - } - if scriptDefaultRange.To < codePointMax { - allCPs = append(allCPs, &CodePointRange{ - From: scriptDefaultRange.To + 1, - To: codePointMax, - }) - } - for _, cp := range scriptCodepoints { - allCPs = append(allCPs, cp...) 
- } - return allCPs, true, nil - } - return scriptCodepoints[val], false, nil - case "oalpha": - yes, ok := binaryValues[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if yes { - return otherAlphabeticCodePoints, false, nil - } else { - return otherAlphabeticCodePoints, true, nil - } - case "olower": - yes, ok := binaryValues[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if yes { - return otherLowercaseCodePoints, false, nil - } else { - return otherLowercaseCodePoints, true, nil - } - case "oupper": - yes, ok := binaryValues[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if yes { - return otherUppercaseCodePoints, false, nil - } else { - return otherUppercaseCodePoints, true, nil - } - case "wspace": - yes, ok := binaryValues[normalizeSymbolicValue(propVal)] - if !ok { - return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) - } - if yes { - return whiteSpaceCodePoints, false, nil - } else { - return whiteSpaceCodePoints, true, nil - } - } - - // If the process reaches this code, it's a bug. We must handle all of the properties registered with - // the `propertyNameAbbs`. - return nil, false, fmt.Errorf("character property '%v' is unavailable", propName) -} diff --git a/src/urubu/ucd/codepoint.go.tmpl b/src/urubu/ucd/codepoint.go.tmpl deleted file mode 100644 index cc0d48e..0000000 --- a/src/urubu/ucd/codepoint.go.tmpl +++ /dev/null @@ -1,65 +0,0 @@ -// Code generated by {{ .GeneratorName }}; DO NOT EDIT. 
- -package ucd - -// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt -var generalCategoryValueAbbs = map[string]string{ {{ range $long, $abb := .PropertyValueAliases.GeneralCategory }} - "{{ $long }}": "{{ $abb }}",{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt -var scriptValueAbbs = map[string]string{ {{ range $long, $abb := .PropertyValueAliases.Script }} - "{{ $long }}": "{{ $abb }}",{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt -var ( - generalCategoryDefaultRange = &CodePointRange{ - From: rune({{ .PropertyValueAliases.GeneralCategoryDefaultRange.From }}), - To: rune({{ .PropertyValueAliases.GeneralCategoryDefaultRange.To }}), - } - generalCategoryDefaultValue = "{{ .PropertyValueAliases.GeneralCategoryDefaultValue }}" -) - -// https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt -var generalCategoryCodePoints = map[string][]*CodePointRange{ {{ range $propName, $codePoints := .UnicodeData.GeneralCategory }} - "{{ $propName }}": { {{ range $codePoints }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} - },{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt -var ( - scriptDefaultRange = &CodePointRange{ - From: rune({{ .Scripts.ScriptDefaultRange.From }}), - To: rune({{ .Scripts.ScriptDefaultRange.To }}), - } - scriptDefaultValue = "{{ .Scripts.ScriptDefaultValue }}" -) - -// https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt -var scriptCodepoints = map[string][]*CodePointRange{ {{ range $script, $codePoints := .Scripts.Script }} - "{{ $script }}": { {{ range $codePoints }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} - },{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt -var otherAlphabeticCodePoints = []*CodePointRange{ {{ range .PropList.OtherAlphabetic }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} -} - -// 
https://www.unicode.org/Public/13.0.0/ucd/PropList.txt -var otherLowercaseCodePoints = []*CodePointRange{ {{ range .PropList.OtherLowercase }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt -var otherUppercaseCodePoints = []*CodePointRange{ {{ range .PropList.OtherUppercase }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt -var whiteSpaceCodePoints = []*CodePointRange{ {{ range .PropList.WhiteSpace }} - &CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} -} diff --git a/src/urubu/ucd/parser.go b/src/urubu/ucd/parser.go deleted file mode 100644 index 88d7134..0000000 --- a/src/urubu/ucd/parser.go +++ /dev/null @@ -1,155 +0,0 @@ -package ucd - -import ( - "bufio" - "encoding/binary" - "encoding/hex" - "io" - "regexp" - "strings" -) - -type CodePointRange struct { - From rune - To rune -} - -var codePointRangeNil = &CodePointRange{ - From: 0, - To: 0, -} - -type field string - -func (f field) codePointRange() (*CodePointRange, error) { - var from, to rune - var err error - cp := reCodePointRange.FindStringSubmatch(string(f)) - from, err = decodeHexToRune(cp[1]) - if err != nil { - return codePointRangeNil, err - } - if cp[2] != "" { - to, err = decodeHexToRune(cp[2]) - if err != nil { - return codePointRangeNil, err - } - } else { - to = from - } - return &CodePointRange{ - From: from, - To: to, - }, nil -} - -func decodeHexToRune(hexCodePoint string) (rune, error) { - h := hexCodePoint - if len(h)%2 != 0 { - h = "0" + h - } - b, err := hex.DecodeString(h) - if err != nil { - return 0, err - } - l := len(b) - for i := 0; i < 4-l; i++ { - b = append([]byte{0}, b...) 
- } - n := binary.BigEndian.Uint32(b) - return rune(n), nil -} - -func (f field) symbol() string { - return string(f) -} - -func (f field) normalizedSymbol() string { - return normalizeSymbolicValue(string(f)) -} - -var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") - -// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3. -// -// https://www.unicode.org/reports/tr44/#UAX44-LM3 -func normalizeSymbolicValue(s string) string { - v := strings.ToLower(symValReplacer.Replace(s)) - if strings.HasPrefix(v, "is") && v != "is" { - return v[2:] - } - return v -} - -var ( - reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) - reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) - - specialCommentPrefix = "# @missing:" -) - -// This parser can parse data files of Unicode Character Database (UCD). -// Specifically, it has the following two functions: -// - Converts each line of the data files into a slice of fields. -// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields. -// -// However, for practical purposes, each field needs to be analyzed more specifically. -// For instance, in UnicodeData.txt, the first field represents a range of code points, -// so it needs to be recognized as a hexadecimal string. -// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser. 
-// -// https://www.unicode.org/reports/tr44/#Format_Conventions -type parser struct { - scanner *bufio.Scanner - fields []field - defaultFields []field - err error - - fieldBuf []field - defaultFieldBuf []field -} - -func newParser(r io.Reader) *parser { - return &parser{ - scanner: bufio.NewScanner(r), - fieldBuf: make([]field, 50), - defaultFieldBuf: make([]field, 50), - } -} - -func (p *parser) parse() bool { - for p.scanner.Scan() { - p.parseRecord(p.scanner.Text()) - if p.fields != nil || p.defaultFields != nil { - return true - } - } - p.err = p.scanner.Err() - return false -} - -func (p *parser) parseRecord(src string) { - ms := reLine.FindStringSubmatch(src) - mFields := ms[1] - mComment := ms[2] - if mFields != "" { - p.fields = parseFields(p.fieldBuf, mFields) - } else { - p.fields = nil - } - if strings.HasPrefix(mComment, specialCommentPrefix) { - p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1)) - } else { - p.defaultFields = nil - } -} - -func parseFields(buf []field, src string) []field { - n := 0 - for _, f := range strings.Split(src, ";") { - buf[n] = field(strings.TrimSpace(f)) - n++ - } - - return buf[:n] -} diff --git a/src/urubu/ucd/prop_list.go b/src/urubu/ucd/prop_list.go deleted file mode 100644 index 31db70c..0000000 --- a/src/urubu/ucd/prop_list.go +++ /dev/null @@ -1,50 +0,0 @@ -package ucd - -import "io" - -type PropList struct { - OtherAlphabetic []*CodePointRange - OtherLowercase []*CodePointRange - OtherUppercase []*CodePointRange - WhiteSpace []*CodePointRange -} - -// ParsePropList parses the PropList.txt. 
-func ParsePropList(r io.Reader) (*PropList, error) { - var oa []*CodePointRange - var ol []*CodePointRange - var ou []*CodePointRange - var ws []*CodePointRange - p := newParser(r) - for p.parse() { - if len(p.fields) == 0 { - continue - } - - cp, err := p.fields[0].codePointRange() - if err != nil { - return nil, err - } - - switch p.fields[1].symbol() { - case "Other_Alphabetic": - oa = append(oa, cp) - case "Other_Lowercase": - ol = append(ol, cp) - case "Other_Uppercase": - ou = append(ou, cp) - case "White_Space": - ws = append(ws, cp) - } - } - if p.err != nil { - return nil, p.err - } - - return &PropList{ - OtherAlphabetic: oa, - OtherLowercase: ol, - OtherUppercase: ou, - WhiteSpace: ws, - }, nil -} diff --git a/src/urubu/ucd/property.go b/src/urubu/ucd/property.go deleted file mode 100644 index ba60e80..0000000 --- a/src/urubu/ucd/property.go +++ /dev/null @@ -1,95 +0,0 @@ -package ucd - -// contributoryProperties is a set of contributory properties vartan uses internally. -// Property statuses are defined in the following table. 
-// -// https://unicode.org/reports/tr44/#Property_List_Table -var contributoryProperties = []string{ - "oalpha", - "olower", - "oupper", -} - -func ContributoryProperties() []string { - return contributoryProperties -} - -// https://www.unicode.org/reports/tr44/#GC_Values_Table -var compositGeneralCategories = map[string][]string{ - // Cased_Letter - "lc": {"lu", "ll", "lt"}, - // Letter - "l": {"lu", "ll", "lt", "lm", "lo"}, - // Mark - "m": {"mm", "mc", "me"}, - // Number - "n": {"nd", "nl", "no"}, - // Punctuation - "p": {"pc", "pd", "ps", "pi", "pe", "pf", "po"}, - // Symbol - "s": {"sm", "sc", "sk", "so"}, - // Separator - "z": {"zs", "zl", "zp"}, - // Other - "c": {"cc", "cf", "cs", "co", "cn"}, -} - -// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt -var derivedCoreProperties = map[string][]string{ - // Alphabetic - "alpha": { - `\p{Lowercase=yes}`, - `\p{Uppercase=yes}`, - `\p{Lt}`, - `\p{Lm}`, - `\p{Lo}`, - `\p{Nl}`, - `\p{Other_Alphabetic=yes}`, - }, - // Lowercase - "lower": { - `\p{Ll}`, - `\p{Other_Lowercase=yes}`, - }, - // Uppercase - "upper": { - `\p{Lu}`, - `\p{Other_Uppercase=yes}`, - }, -} - -// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt -var propertyNameAbbs = map[string]string{ - "generalcategory": "gc", - "gc": "gc", - "script": "sc", - "sc": "sc", - "alphabetic": "alpha", - "alpha": "alpha", - "otheralphabetic": "oalpha", - "oalpha": "oalpha", - "lowercase": "lower", - "lower": "lower", - "uppercase": "upper", - "upper": "upper", - "otherlowercase": "olower", - "olower": "olower", - "otheruppercase": "oupper", - "oupper": "oupper", - "whitespace": "wspace", - "wspace": "wspace", - "space": "wspace", -} - -// https://www.unicode.org/reports/tr44/#Type_Key_Table -// https://www.unicode.org/reports/tr44/#Binary_Values_Table -var binaryValues = map[string]bool{ - "yes": true, - "y": true, - "true": true, - "t": true, - "no": false, - "n": false, - "false": false, - "f": false, -} diff --git 
a/src/urubu/ucd/property_value_aliases.go b/src/urubu/ucd/property_value_aliases.go deleted file mode 100644 index 4bc69db..0000000 --- a/src/urubu/ucd/property_value_aliases.go +++ /dev/null @@ -1,82 +0,0 @@ -package ucd - -import "io" - -type PropertyValueAliases struct { - GeneralCategory map[string]string - GeneralCategoryDefaultRange *CodePointRange - GeneralCategoryDefaultValue string - - Script map[string]string -} - -// ParsePropertyValueAliases parses the PropertyValueAliases.txt. -func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) { - gcAbbs := map[string]string{} - var defaultGCCPRange *CodePointRange - var defaultGCVal string - scAbbs := map[string]string{} - p := newParser(r) - for p.parse() { - // https://www.unicode.org/reports/tr44/#Property_Value_Aliases - // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property, - // > the second field specifies an abbreviated symbolic name for a value of that property, and the third - // > field specifies the long symbolic name for that value of that property. These are the preferred - // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent - // > fields. 
- if len(p.fields) > 0 { - switch p.fields[0].symbol() { - case "gc": - gcShort := p.fields[1].normalizedSymbol() - gcLong := p.fields[2].normalizedSymbol() - gcAbbs[gcShort] = gcShort - gcAbbs[gcLong] = gcShort - for _, f := range p.fields[3:] { - gcShortOther := f.normalizedSymbol() - gcAbbs[gcShortOther] = gcShort - } - case "sc": - scShort := p.fields[1].normalizedSymbol() - scLong := p.fields[2].normalizedSymbol() - scAbbs[scShort] = scShort - scAbbs[scLong] = scShort - for _, f := range p.fields[3:] { - scShortOther := f.normalizedSymbol() - scAbbs[scShortOther] = scShort - } - } - } - - // https://www.unicode.org/reports/tr44/#Missing_Conventions - // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt. - // > ... - // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below: - // > 1. code_point_range; default_prop_val - // > 2. code_point_range; property_name; default_prop_val - // > ... - // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which - // > contain values associated with many properties. 
For example: - // > # @missing: 0000..10FFFF; NFD_QC; Yes - if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" { - var err error - defaultGCCPRange, err = p.defaultFields[0].codePointRange() - if err != nil { - return nil, err - } - defaultGCVal = p.defaultFields[2].normalizedSymbol() - } - } - if p.err != nil { - return nil, p.err - } - return &PropertyValueAliases{ - GeneralCategory: gcAbbs, - GeneralCategoryDefaultRange: defaultGCCPRange, - GeneralCategoryDefaultValue: defaultGCVal, - Script: scAbbs, - }, nil -} - -func (a *PropertyValueAliases) gcAbb(gc string) string { - return a.GeneralCategory[gc] -} diff --git a/src/urubu/ucd/scripts.go b/src/urubu/ucd/scripts.go deleted file mode 100644 index 5040283..0000000 --- a/src/urubu/ucd/scripts.go +++ /dev/null @@ -1,52 +0,0 @@ -package ucd - -import ( - "fmt" - "io" -) - -type Scripts struct { - Script map[string][]*CodePointRange - ScriptDefaultRange *CodePointRange - ScriptDefaultValue string -} - -// ParseScripts parses the Scripts.txt. 
-func ParseScripts(r io.Reader, propValAliases *PropertyValueAliases) (*Scripts, error) { - ss := map[string][]*CodePointRange{} - var defaultRange *CodePointRange - var defaultValue string - p := newParser(r) - for p.parse() { - if len(p.fields) > 0 { - cp, err := p.fields[0].codePointRange() - if err != nil { - return nil, err - } - - name, ok := propValAliases.Script[p.fields[1].normalizedSymbol()] - if !ok { - return nil, fmt.Errorf("unknown property: %v", p.fields[1].symbol()) - } - ss[name] = append(ss[name], cp) - } - - if len(p.defaultFields) > 0 { - var err error - defaultRange, err = p.defaultFields[0].codePointRange() - if err != nil { - return nil, err - } - defaultValue = p.defaultFields[1].normalizedSymbol() - } - } - if p.err != nil { - return nil, p.err - } - - return &Scripts{ - Script: ss, - ScriptDefaultRange: defaultRange, - ScriptDefaultValue: defaultValue, - }, nil -} diff --git a/src/urubu/ucd/unicode_data.go b/src/urubu/ucd/unicode_data.go deleted file mode 100644 index e2a8e87..0000000 --- a/src/urubu/ucd/unicode_data.go +++ /dev/null @@ -1,56 +0,0 @@ -package ucd - -import "io" - -type UnicodeData struct { - GeneralCategory map[string][]*CodePointRange - - propValAliases *PropertyValueAliases -} - -// ParseUnicodeData parses the UnicodeData.txt. 
-func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { - unicodeData := &UnicodeData{ - GeneralCategory: map[string][]*CodePointRange{}, - propValAliases: propValAliases, - } - - p := newParser(r) - for p.parse() { - if len(p.fields) == 0 { - continue - } - cp, err := p.fields[0].codePointRange() - if err != nil { - return nil, err - } - gc := p.fields[2].normalizedSymbol() - unicodeData.addGC(gc, cp) - } - if p.err != nil { - return nil, p.err - } - - return unicodeData, nil -} - -func (u *UnicodeData) addGC(gc string, cp *CodePointRange) { - // https://www.unicode.org/reports/tr44/#Empty_Fields - // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line - // > for a code point is empty, that indicates that the property takes the default value for that code point. - if gc == "" { - return - } - - cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)] - if ok { - c := cps[len(cps)-1] - if cp.From-c.To == 1 { - c.To = cp.To - } else { - u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp) - } - } else { - u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp} - } -} diff --git a/src/urubu/utf8/utf8.go b/src/urubu/utf8.go index 4f52bd4..4f52bd4 100644 --- a/src/urubu/utf8/utf8.go +++ b/src/urubu/utf8.go |