author | EuAndreh <eu@euandre.org> | 2024-12-10 12:29:03 -0300
---|---|---
committer | EuAndreh <eu@euandre.org> | 2024-12-10 12:29:03 -0300
commit | 8359c047aaebe274a2d811d61922b571ca7d10df (patch) |
tree | 070e0ed93d27a842776ada805eeb4270e7e3c806 /tests/unit/driver |
parent | Start building test files (diff) |
download | cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.gz cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.xz |
Namespace packages with "urubu/"
Diffstat (limited to 'tests/unit/driver')
-rw-r--r-- | tests/unit/driver/lexer/lexer_test.go | 932
-rw-r--r-- | tests/unit/driver/parser/conflict_test.go | 524
-rw-r--r-- | tests/unit/driver/parser/lac_test.go | 120
-rw-r--r-- | tests/unit/driver/parser/parser_test.go | 833
-rw-r--r-- | tests/unit/driver/parser/semantic_action_test.go | 227
-rw-r--r-- | tests/unit/driver/parser/syntax_error_test.go | 306
6 files changed, 2942 insertions, 0 deletions
diff --git a/tests/unit/driver/lexer/lexer_test.go b/tests/unit/driver/lexer/lexer_test.go new file mode 100644 index 0000000..a3d0231 --- /dev/null +++ b/tests/unit/driver/lexer/lexer_test.go @@ -0,0 +1,932 @@ +package lexer + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "urubu/grammar/lexical" + spec "urubu/spec/grammar" +) + +func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *lexical.LexEntry { + ms := []spec.LexModeName{} + for _, m := range modes { + ms = append(ms, spec.LexModeName(m)) + } + return &lexical.LexEntry{ + Kind: spec.LexKindName(kind), + Pattern: pattern, + Modes: ms, + Push: spec.LexModeName(push), + Pop: pop, + } +} + +func newLexEntryDefaultNOP(kind string, pattern string) *lexical.LexEntry { + return &lexical.LexEntry{ + Kind: spec.LexKindName(kind), + Pattern: pattern, + Modes: []spec.LexModeName{ + spec.LexModeNameDefault, + }, + } +} + +func newLexEntryFragment(kind string, pattern string) *lexical.LexEntry { + return &lexical.LexEntry{ + Kind: spec.LexKindName(kind), + Pattern: pattern, + Fragment: true, + } +} + +func newToken(modeID ModeID, kindID KindID, modeKindID ModeKindID, lexeme []byte) *Token { + return &Token{ + ModeID: modeID, + KindID: kindID, + ModeKindID: modeKindID, + Lexeme: lexeme, + } +} + +func newTokenDefault(kindID int, modeKindID int, lexeme []byte) *Token { + return newToken( + ModeID(spec.LexModeIDDefault.Int()), + KindID(spec.LexKindID(kindID).Int()), + ModeKindID(spec.LexModeKindID(modeKindID).Int()), + lexeme, + ) +} + +func newEOFToken(modeID ModeID, modeName string) *Token { + return &Token{ + ModeID: modeID, + ModeKindID: 0, + EOF: true, + } +} + +func newEOFTokenDefault() *Token { + return newEOFToken(ModeID(spec.LexModeIDDefault.Int()), spec.LexModeNameDefault.String()) +} + +func newInvalidTokenDefault(lexeme []byte) *Token { + return &Token{ + ModeID: ModeID(spec.LexModeIDDefault.Int()), + ModeKindID: 0, + Lexeme: lexeme, + Invalid: true, + } +} + +func withPos(tok *Token, bytePos int, byteLen int, row int, col int) *Token { + tok.BytePos = bytePos + tok.ByteLen = byteLen + tok.Row = row + tok.Col = col + return tok +} + +func TestLexer_Next(t *testing.T) { + test := []struct { + lspec *lexical.LexSpec + src string + tokens []*Token + passiveModeTran bool + tran func(l *Lexer, tok *Token) error + }{ + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "(a|b)*abb"), + newLexEntryDefaultNOP("t2", " +"), + }, + }, + src: "abb aabb aaabb babb bbabb abbbabb", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("abb")), 0, 3, 0, 0), + withPos(newTokenDefault(2, 2, []byte(" ")), 3, 1, 0, 3), + withPos(newTokenDefault(1, 1, []byte("aabb")), 4, 4, 0, 4), + withPos(newTokenDefault(2, 2, []byte(" ")), 8, 1, 0, 8), + withPos(newTokenDefault(1, 1, []byte("aaabb")), 9, 5, 0, 9), + withPos(newTokenDefault(2, 2, []byte(" ")), 14, 1, 0, 14), + withPos(newTokenDefault(1, 1, []byte("babb")), 15, 4, 0, 15), + withPos(newTokenDefault(2, 2, []byte(" ")), 19, 1, 0, 19), + withPos(newTokenDefault(1, 1, []byte("bbabb")), 20, 5, 0, 20), + withPos(newTokenDefault(2, 2, []byte(" ")), 25, 1, 0, 25), + withPos(newTokenDefault(1, 1, []byte("abbbabb")), 26, 7, 0, 26), + withPos(newEOFTokenDefault(), 33, 0, 0, 33), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "b?a+"), + newLexEntryDefaultNOP("t2", "(ab)?(cd)+"), + newLexEntryDefaultNOP("t3", " +"), + }, + }, + src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd", + 
tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("ba")), 0, 2, 0, 0), + withPos(newTokenDefault(3, 3, []byte(" ")), 2, 1, 0, 2), + withPos(newTokenDefault(1, 1, []byte("baaa")), 3, 4, 0, 3), + withPos(newTokenDefault(3, 3, []byte(" ")), 7, 1, 0, 7), + withPos(newTokenDefault(1, 1, []byte("a")), 8, 1, 0, 8), + withPos(newTokenDefault(3, 3, []byte(" ")), 9, 1, 0, 9), + withPos(newTokenDefault(1, 1, []byte("aaa")), 10, 3, 0, 10), + withPos(newTokenDefault(3, 3, []byte(" ")), 13, 1, 0, 13), + withPos(newTokenDefault(2, 2, []byte("abcd")), 14, 4, 0, 14), + withPos(newTokenDefault(3, 3, []byte(" ")), 18, 1, 0, 18), + withPos(newTokenDefault(2, 2, []byte("abcdcdcd")), 19, 8, 0, 19), + withPos(newTokenDefault(3, 3, []byte(" ")), 27, 1, 0, 27), + withPos(newTokenDefault(2, 2, []byte("cd")), 28, 2, 0, 28), + withPos(newTokenDefault(3, 3, []byte(" ")), 30, 1, 0, 30), + withPos(newTokenDefault(2, 2, []byte("cdcdcd")), 31, 6, 0, 31), + withPos(newEOFTokenDefault(), 37, 0, 0, 37), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "."), + }, + }, + src: string([]byte{ + 0x00, + 0x7f, + 0xc2, 0x80, + 0xdf, 0xbf, + 0xe1, 0x80, 0x80, + 0xec, 0xbf, 0xbf, + 0xed, 0x80, 0x80, + 0xed, 0x9f, 0xbf, + 0xee, 0x80, 0x80, + 0xef, 0xbf, 0xbf, + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0xbf, 0xbf, 0xbf, + 0xf1, 0x80, 0x80, 0x80, + 0xf3, 0xbf, 0xbf, 0xbf, + 0xf4, 0x80, 0x80, 0x80, + 0xf4, 0x8f, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0x00}), 0, 1, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0x7f}), 1, 1, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xc2, 0x80}), 2, 2, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xdf, 0xbf}), 4, 2, 0, 3), + withPos(newTokenDefault(1, 1, []byte{0xe1, 0x80, 0x80}), 6, 3, 0, 4), + withPos(newTokenDefault(1, 1, []byte{0xec, 0xbf, 0xbf}), 9, 3, 0, 5), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x80, 0x80}), 12, 3, 0, 6), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x9f, 0xbf}), 15, 3, 0, 7), + withPos(newTokenDefault(1, 1, []byte{0xee, 0x80, 0x80}), 18, 3, 0, 8), + withPos(newTokenDefault(1, 1, []byte{0xef, 0xbf, 0xbf}), 21, 3, 0, 9), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 24, 4, 0, 10), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0xbf, 0xbf, 0xbf}), 28, 4, 0, 11), + withPos(newTokenDefault(1, 1, []byte{0xf1, 0x80, 0x80, 0x80}), 32, 4, 0, 12), + withPos(newTokenDefault(1, 1, []byte{0xf3, 0xbf, 0xbf, 0xbf}), 36, 4, 0, 13), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x80, 0x80, 0x80}), 40, 4, 0, 14), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x8f, 0xbf, 0xbf}), 44, 4, 0, 15), + withPos(newEOFTokenDefault(), 48, 0, 0, 16), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"), + }, + }, + src: "ab.*+?|()[]", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("a")), 0, 1, 0, 0), + withPos(newTokenDefault(1, 1, []byte("b")), 1, 1, 0, 1), + withPos(newTokenDefault(1, 1, []byte(".")), 2, 1, 0, 2), + withPos(newTokenDefault(1, 1, []byte("*")), 3, 1, 0, 3), + withPos(newTokenDefault(1, 1, []byte("+")), 4, 1, 0, 4), + withPos(newTokenDefault(1, 1, []byte("?")), 5, 1, 0, 5), + withPos(newTokenDefault(1, 1, []byte("|")), 6, 1, 0, 6), + withPos(newTokenDefault(1, 1, []byte("(")), 7, 1, 0, 7), + withPos(newTokenDefault(1, 1, []byte(")")), 8, 1, 0, 8), + withPos(newTokenDefault(1, 1, []byte("[")), 9, 1, 0, 9), + withPos(newTokenDefault(1, 1, []byte("]")), 10, 1, 0, 10), + withPos(newEOFTokenDefault(), 11, 
0, 0, 11), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // all 1 byte characters except the null character (U+0000) + // + // NOTE: + // vartan cannot handle the null character in patterns because lexical.lexer, + // specifically read() and restore(), treats the null character as the absence of a symbol. + // If a pattern needs a null character, use the code point expression \u{0000}. + newLexEntryDefaultNOP("char_1_byte", "[\x01-\x7f]"), + }, + }, + src: string([]byte{ + 0x01, + 0x02, + 0x7e, + 0x7f, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0x01}), 0, 1, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0x02}), 1, 1, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0x7e}), 2, 1, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0x7f}), 3, 1, 0, 3), + withPos(newEOFTokenDefault(), 4, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // all 2 byte characters + newLexEntryDefaultNOP("char_2_byte", "[\xc2\x80-\xdf\xbf]"), + }, + }, + src: string([]byte{ + 0xc2, 0x80, + 0xc2, 0x81, + 0xdf, 0xbe, + 0xdf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xc2, 0x80}), 0, 2, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xc2, 0x81}), 2, 2, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xdf, 0xbe}), 4, 2, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xdf, 0xbf}), 6, 2, 0, 3), + withPos(newEOFTokenDefault(), 8, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // All bytes are the same. + newLexEntryDefaultNOP("char_3_byte", "[\xe0\xa0\x80-\xe0\xa0\x80]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x80}), 0, 3, 0, 0), + withPos(newEOFTokenDefault(), 3, 0, 0, 1), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // The first two bytes are the same. + newLexEntryDefaultNOP("char_3_byte", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xa0, 0xbe, + 0xe0, 0xa0, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x80}), 0, 3, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x81}), 3, 3, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0xbe}), 6, 3, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0xbf}), 9, 3, 0, 3), + withPos(newEOFTokenDefault(), 12, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // The first byte is the same.
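As the NOTE near the top of this test table says, a raw null byte cannot appear in a pattern itself; the code point expression is the supported workaround. A minimal sketch of such an entry, reusing the newLexEntryDefaultNOP helper defined in this file (the kind name null_char is made up for illustration):

    // Matches U+0000 without embedding a raw null byte in the pattern.
    nullChar := newLexEntryDefaultNOP("null_char", `\u{0000}`)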
+ newLexEntryDefaultNOP("char_3_byte", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xbf, 0xbe, + 0xe0, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x80}), 0, 3, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x81}), 3, 3, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xbf, 0xbe}), 6, 3, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xbf, 0xbf}), 9, 3, 0, 3), + withPos(newEOFTokenDefault(), 12, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // all 3 byte characters + newLexEntryDefaultNOP("char_3_byte", "[\xe0\xa0\x80-\xef\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xbf, 0xbe, + 0xe0, 0xbf, 0xbf, + 0xe1, 0x80, 0x80, + 0xe1, 0x80, 0x81, + 0xec, 0xbf, 0xbe, + 0xec, 0xbf, 0xbf, + 0xed, 0x80, 0x80, + 0xed, 0x80, 0x81, + 0xed, 0x9f, 0xbe, + 0xed, 0x9f, 0xbf, + 0xee, 0x80, 0x80, + 0xee, 0x80, 0x81, + 0xef, 0xbf, 0xbe, + 0xef, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x80}), 0, 3, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xa0, 0x81}), 3, 3, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xbf, 0xbe}), 6, 3, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xe0, 0xbf, 0xbf}), 9, 3, 0, 3), + withPos(newTokenDefault(1, 1, []byte{0xe1, 0x80, 0x80}), 12, 3, 0, 4), + withPos(newTokenDefault(1, 1, []byte{0xe1, 0x80, 0x81}), 15, 3, 0, 5), + withPos(newTokenDefault(1, 1, []byte{0xec, 0xbf, 0xbe}), 18, 3, 0, 6), + withPos(newTokenDefault(1, 1, []byte{0xec, 0xbf, 0xbf}), 21, 3, 0, 7), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x80, 0x80}), 24, 3, 0, 8), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x80, 0x81}), 27, 3, 0, 9), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x9f, 0xbe}), 30, 3, 0, 10), + withPos(newTokenDefault(1, 1, []byte{0xed, 0x9f, 0xbf}), 33, 3, 0, 11), + withPos(newTokenDefault(1, 1, []byte{0xee, 0x80, 0x80}), 36, 3, 0, 12), + withPos(newTokenDefault(1, 1, []byte{0xee, 0x80, 0x81}), 39, 3, 0, 13), + withPos(newTokenDefault(1, 1, []byte{0xef, 0xbf, 0xbe}), 42, 3, 0, 14), + withPos(newTokenDefault(1, 1, []byte{0xef, 0xbf, 0xbf}), 45, 3, 0, 15), + withPos(newEOFTokenDefault(), 48, 0, 0, 16), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // All bytes are the same. + newLexEntryDefaultNOP("char_4_byte", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 0, 4, 0, 0), + withPos(newEOFTokenDefault(), 4, 0, 0, 1), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // The first 3 bytes are the same. + newLexEntryDefaultNOP("char_4_byte", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0x90, 0x80, 0xbe, + 0xf0, 0x90, 0x80, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 0, 4, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x81}), 4, 4, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0xbe}), 8, 4, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0xbf}), 12, 4, 0, 3), + withPos(newEOFTokenDefault(), 16, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // The first 2 bytes are the same. 
+ newLexEntryDefaultNOP("char_4_byte", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0x90, 0xbf, 0xbe, + 0xf0, 0x90, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 0, 4, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x81}), 4, 4, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0xbf, 0xbe}), 8, 4, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0xbf, 0xbf}), 12, 4, 0, 3), + withPos(newEOFTokenDefault(), 16, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // The first byte are the same. + newLexEntryDefaultNOP("char_4_byte", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0xbf, 0xbf, 0xbe, + 0xf0, 0xbf, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 0, 4, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x81}), 4, 4, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0xbf, 0xbf, 0xbe}), 8, 4, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0xbf, 0xbf, 0xbf}), 12, 4, 0, 3), + withPos(newEOFTokenDefault(), 16, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // all 4 byte characters + newLexEntryDefaultNOP("char_4_byte", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0xbf, 0xbf, 0xbe, + 0xf0, 0xbf, 0xbf, 0xbf, + 0xf1, 0x80, 0x80, 0x80, + 0xf1, 0x80, 0x80, 0x81, + 0xf3, 0xbf, 0xbf, 0xbe, + 0xf3, 0xbf, 0xbf, 0xbf, + 0xf4, 0x80, 0x80, 0x80, + 0xf4, 0x80, 0x80, 0x81, + 0xf4, 0x8f, 0xbf, 0xbe, + 0xf4, 0x8f, 0xbf, 0xbf, + }), + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x80}), 0, 4, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0x90, 0x80, 0x81}), 4, 4, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0xbf, 0xbf, 0xbe}), 8, 4, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xf0, 0xbf, 0xbf, 0xbf}), 12, 4, 0, 3), + withPos(newTokenDefault(1, 1, []byte{0xf1, 0x80, 0x80, 0x80}), 16, 4, 0, 4), + withPos(newTokenDefault(1, 1, []byte{0xf1, 0x80, 0x80, 0x81}), 20, 4, 0, 5), + withPos(newTokenDefault(1, 1, []byte{0xf3, 0xbf, 0xbf, 0xbe}), 24, 4, 0, 6), + withPos(newTokenDefault(1, 1, []byte{0xf3, 0xbf, 0xbf, 0xbf}), 28, 4, 0, 7), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x80, 0x80, 0x80}), 32, 4, 0, 8), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x80, 0x80, 0x81}), 36, 4, 0, 9), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x8f, 0xbf, 0xbe}), 40, 4, 0, 10), + withPos(newTokenDefault(1, 1, []byte{0xf4, 0x8f, 0xbf, 0xbf}), 44, 4, 0, 11), + withPos(newEOFTokenDefault(), 48, 0, 0, 12), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("non_number", "[^0-9]+[0-9]"), + }, + }, + src: "foo9", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("foo9")), 0, 4, 0, 0), + withPos(newEOFTokenDefault(), 4, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("char_1_byte", "\\u{006E}"), + newLexEntryDefaultNOP("char_2_byte", "\\u{03BD}"), + newLexEntryDefaultNOP("char_3_byte", "\\u{306B}"), + newLexEntryDefaultNOP("char_4_byte", "\\u{01F638}"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0x6E}), 0, 1, 0, 0), + withPos(newTokenDefault(2, 2, 
[]byte{0xCE, 0xBD}), 1, 2, 0, 1), + withPos(newTokenDefault(3, 3, []byte{0xE3, 0x81, 0xAB}), 3, 3, 0, 2), + withPos(newTokenDefault(4, 4, []byte{0xF0, 0x9F, 0x98, 0xB8}), 6, 4, 0, 3), + withPos(newEOFTokenDefault(), 10, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("code_points_alt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte{0x6E}), 0, 1, 0, 0), + withPos(newTokenDefault(1, 1, []byte{0xCE, 0xBD}), 1, 2, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0xE3, 0x81, 0xAB}), 3, 3, 0, 2), + withPos(newTokenDefault(1, 1, []byte{0xF0, 0x9F, 0x98, 0xB8}), 6, 4, 0, 3), + withPos(newEOFTokenDefault(), 10, 0, 0, 4), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "\\f{a2c}\\f{d2f}+"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabcdef", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("abcdefdef")), 0, 9, 0, 0), + withPos(newTokenDefault(1, 1, []byte("abcdef")), 9, 6, 0, 9), + withPos(newEOFTokenDefault(), 15, 0, 0, 15), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "(\\f{a2c}|\\f{d2f})+"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabc", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("abcdefdefabc")), 0, 12, 0, 0), + withPos(newEOFTokenDefault(), 12, 0, 0, 12), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("t1", "\\f{a2c_or_d2f}+"), + newLexEntryFragment("a2c_or_d2f", "\\f{a2c}|\\f{d2f}"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabc", + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte("abcdefdefabc")), 0, 12, 0, 0), + withPos(newEOFTokenDefault(), 12, 0, 0, 12), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("white_space", ` *`), + newLexEntry([]string{"default"}, "string_open", `"`, "string", false), + newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false), + newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false), + newLexEntry([]string{"string"}, "string_close", `"`, "", true), + }, + }, + src: `"" "Hello world.\n\"Hello world.\""`, + tokens: []*Token{ + withPos(newToken(1, 2, 2, []byte(`"`)), 0, 1, 0, 0), + withPos(newToken(2, 5, 3, []byte(`"`)), 1, 1, 0, 1), + withPos(newToken(1, 1, 1, []byte(` `)), 2, 1, 0, 2), + withPos(newToken(1, 2, 2, []byte(`"`)), 3, 1, 0, 3), + withPos(newToken(2, 4, 2, []byte(`Hello world.`)), 4, 12, 0, 4), + withPos(newToken(2, 3, 1, []byte(`\n`)), 16, 2, 0, 16), + withPos(newToken(2, 3, 1, []byte(`\"`)), 18, 2, 0, 18), + withPos(newToken(2, 4, 2, []byte(`Hello world.`)), 20, 12, 0, 20), + withPos(newToken(2, 3, 1, []byte(`\"`)), 32, 2, 0, 32), + withPos(newToken(2, 5, 3, []byte(`"`)), 34, 1, 0, 34), + withPos(newEOFTokenDefault(), 35, 0, 0, 35), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + // `white_space` is enabled in multiple modes. 
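The string-lexing case above exercises both mode controls a lex entry offers: the push argument names a mode to enter once the kind matches, and a true pop flag returns to the previous mode. A condensed sketch using the newLexEntry helper from the top of this file:

    entries := []*lexical.LexEntry{
        // Matches in "default" mode and pushes the "string" mode.
        newLexEntry([]string{"default"}, "string_open", `"`, "string", false),
        // Matches only while the "string" mode is active.
        newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false),
        // Matches in "string" mode and pops back to the previous mode.
        newLexEntry([]string{"string"}, "string_close", `"`, "", true),
    }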
+ newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false), + newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false), + newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true), + newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true), + }, + }, + src: ` a b < < `, + tokens: []*Token{ + withPos(newToken(1, 1, 1, []byte(` `)), 0, 1, 0, 0), + withPos(newToken(1, 2, 2, []byte(`a`)), 1, 1, 0, 1), + withPos(newToken(2, 1, 1, []byte(` `)), 2, 1, 0, 2), + withPos(newToken(2, 3, 2, []byte(`b`)), 3, 1, 0, 3), + withPos(newToken(3, 1, 1, []byte(` `)), 4, 1, 0, 4), + withPos(newToken(3, 5, 2, []byte(`<`)), 5, 1, 0, 5), + withPos(newToken(2, 1, 1, []byte(` `)), 6, 1, 0, 6), + withPos(newToken(2, 4, 3, []byte(`<`)), 7, 1, 0, 7), + withPos(newToken(1, 1, 1, []byte(` `)), 8, 1, 0, 8), + withPos(newEOFTokenDefault(), 9, 0, 0, 9), + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char", `.`, "", false), + newLexEntry([]string{"default"}, "push_1", `-> 1`, "", false), + newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false), + newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false), + newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", false), + }, + }, + src: `-> 1 -> 2 <- <- a`, + tokens: []*Token{ + withPos(newToken(1, 3, 3, []byte(`-> 1`)), 0, 4, 0, 0), + withPos(newToken(2, 1, 1, []byte(` `)), 4, 1, 0, 4), + withPos(newToken(2, 4, 2, []byte(`-> 2`)), 5, 4, 0, 5), + withPos(newToken(3, 1, 1, []byte(` `)), 9, 1, 0, 9), + withPos(newToken(3, 6, 2, []byte(`<-`)), 10, 2, 0, 10), + withPos(newToken(2, 1, 1, []byte(` `)), 12, 1, 0, 12), + withPos(newToken(2, 5, 3, []byte(`<-`)), 13, 2, 0, 13), + withPos(newToken(1, 1, 1, []byte(` `)), 15, 1, 0, 15), + withPos(newToken(1, 2, 2, []byte(`a`)), 16, 1, 0, 16), + withPos(newEOFTokenDefault(), 17, 0, 0, 17), + }, + passiveModeTran: true, + tran: func(l *Lexer, tok *Token) error { + switch l.spec.ModeName(l.Mode()) { + case "default": + switch tok.KindID { + case 3: // push_1 + l.PushMode(2) + } + case "mode_1": + switch tok.KindID { + case 4: // push_2 + l.PushMode(3) + case 5: // pop_1 + return l.PopMode() + } + case "mode_2": + switch tok.KindID { + case 6: // pop_2 + return l.PopMode() + } + } + return nil + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char", `.`, "", false), + newLexEntry([]string{"default"}, "push_1", `-> 1`, "mode_1", false), + newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false), + newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false), + newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", true), + }, + }, + src: `-> 1 -> 2 <- <- a`, + tokens: []*Token{ + withPos(newToken(1, 3, 3, []byte(`-> 1`)), 0, 4, 0, 0), + withPos(newToken(2, 1, 1, []byte(` `)), 4, 1, 0, 4), + withPos(newToken(2, 4, 2, []byte(`-> 2`)), 5, 4, 0, 5), + withPos(newToken(3, 1, 1, []byte(` `)), 9, 1, 0, 9), + withPos(newToken(3, 6, 2, []byte(`<-`)), 10, 2, 0, 10), + withPos(newToken(2, 1, 1, []byte(` `)), 12, 1, 0, 12), + withPos(newToken(2, 5, 3, []byte(`<-`)), 13, 2, 0, 13), + withPos(newToken(1, 1, 1, []byte(` `)), 15, 1, 0, 15), + withPos(newToken(1, 2, 2, []byte(`a`)), 16, 1, 0, 16), + withPos(newEOFTokenDefault(), 17, 0, 0, 17), + }, + // 
Active mode transition and an external transition function can be used together. + passiveModeTran: false, + tran: func(l *Lexer, tok *Token) error { + switch l.spec.ModeName(l.Mode()) { + case "mode_1": + switch tok.KindID { + case 4: // push_2 + l.PushMode(3) + case 5: // pop_1 + return l.PopMode() + } + } + return nil + }, + }, + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("dot", spec.EscapePattern(`.`)), + newLexEntryDefaultNOP("star", spec.EscapePattern(`*`)), + newLexEntryDefaultNOP("plus", spec.EscapePattern(`+`)), + newLexEntryDefaultNOP("question", spec.EscapePattern(`?`)), + newLexEntryDefaultNOP("vbar", spec.EscapePattern(`|`)), + newLexEntryDefaultNOP("lparen", spec.EscapePattern(`(`)), + newLexEntryDefaultNOP("rparen", spec.EscapePattern(`)`)), + newLexEntryDefaultNOP("lbrace", spec.EscapePattern(`[`)), + newLexEntryDefaultNOP("backslash", spec.EscapePattern(`\`)), + }, + }, + src: `.*+?|()[\`, + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte(`.`)), 0, 1, 0, 0), + withPos(newTokenDefault(2, 2, []byte(`*`)), 1, 1, 0, 1), + withPos(newTokenDefault(3, 3, []byte(`+`)), 2, 1, 0, 2), + withPos(newTokenDefault(4, 4, []byte(`?`)), 3, 1, 0, 3), + withPos(newTokenDefault(5, 5, []byte(`|`)), 4, 1, 0, 4), + withPos(newTokenDefault(6, 6, []byte(`(`)), 5, 1, 0, 5), + withPos(newTokenDefault(7, 7, []byte(`)`)), 6, 1, 0, 6), + withPos(newTokenDefault(8, 8, []byte(`[`)), 7, 1, 0, 7), + withPos(newTokenDefault(9, 9, []byte(`\`)), 8, 1, 0, 8), + withPos(newEOFTokenDefault(), 9, 0, 0, 9), + }, + }, + // Character properties are available in a bracket expression. + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("letter", `[\p{Letter}]+`), + newLexEntryDefaultNOP("non_letter", `[^\p{Letter}]+`), + }, + }, + src: `foo123`, + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte(`foo`)), 0, 3, 0, 0), + withPos(newTokenDefault(2, 2, []byte(`123`)), 3, 3, 0, 3), + withPos(newEOFTokenDefault(), 6, 0, 0, 6), + }, + }, + // The driver can continue lexical analysis even after it detects an invalid token. + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("lower", `[a-z]+`), + }, + }, + src: `foo123bar`, + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte(`foo`)), 0, 3, 0, 0), + withPos(newInvalidTokenDefault([]byte(`123`)), 3, 3, 0, 3), + withPos(newTokenDefault(1, 1, []byte(`bar`)), 6, 3, 0, 6), + withPos(newEOFTokenDefault(), 9, 0, 0, 9), + }, + }, + // The driver can detect an invalid token immediately preceding an EOF. + { + lspec: &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("lower", `[a-z]+`), + }, + }, + src: `foo123`, + tokens: []*Token{ + withPos(newTokenDefault(1, 1, []byte(`foo`)), 0, 3, 0, 0), + withPos(newInvalidTokenDefault([]byte(`123`)), 3, 3, 0, 3), + withPos(newEOFTokenDefault(), 6, 0, 0, 6), + }, + }, + } + for i, tt := range test { + for compLv := lexical.CompressionLevelMin; compLv <= lexical.CompressionLevelMax; compLv++ { + t.Run(fmt.Sprintf("#%v-%v", i, compLv), func(t *testing.T) { + clspec, err, cerrs := lexical.Compile(tt.lspec, compLv) + if err != nil { + for _, cerr := range cerrs { + t.Logf("%#v", cerr) + } + t.Fatalf("unexpected error: %v", err) + } + opts := []LexerOption{} + if tt.passiveModeTran { + opts = append(opts, DisableModeTransition()) + } + lexer, err := NewLexer(NewLexSpec(clspec), strings.NewReader(tt.src), opts...) 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + for _, eTok := range tt.tokens { + tok, err := lexer.Next() + if err != nil { + t.Log(err) + break + } + testToken(t, eTok, tok) + + if tok.EOF { + break + } + + if tt.tran != nil { + err := tt.tran(lexer, tok) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + } + }) + } + } +} + +func TestLexer_Next_WithPosition(t *testing.T) { + lspec := &lexical.LexSpec{ + Entries: []*lexical.LexEntry{ + newLexEntryDefaultNOP("newline", `\u{000A}+`), + newLexEntryDefaultNOP("any", `.`), + }, + } + + clspec, err, _ := lexical.Compile(lspec, lexical.CompressionLevelMax) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + src := string([]byte{ + 0x00, + 0x7F, + 0x0A, + + 0xC2, 0x80, + 0xDF, 0xBF, + 0x0A, + + 0xE0, 0xA0, 0x80, + 0xE0, 0xBF, 0xBF, + 0xE1, 0x80, 0x80, + 0xEC, 0xBF, 0xBF, + 0xED, 0x80, 0x80, + 0xED, 0x9F, 0xBF, + 0xEE, 0x80, 0x80, + 0xEF, 0xBF, 0xBF, + 0x0A, + + 0xF0, 0x90, 0x80, 0x80, + 0xF0, 0xBF, 0xBF, 0xBF, + 0xF1, 0x80, 0x80, 0x80, + 0xF3, 0xBF, 0xBF, 0xBF, + 0xF4, 0x80, 0x80, 0x80, + 0xF4, 0x8F, 0xBF, 0xBF, + 0x0A, + 0x0A, + 0x0A, + }) + + expected := []*Token{ + withPos(newTokenDefault(2, 2, []byte{0x00}), 0, 1, 0, 0), + withPos(newTokenDefault(2, 2, []byte{0x7F}), 1, 1, 0, 1), + withPos(newTokenDefault(1, 1, []byte{0x0A}), 2, 1, 0, 2), + + withPos(newTokenDefault(2, 2, []byte{0xC2, 0x80}), 3, 2, 1, 0), + withPos(newTokenDefault(2, 2, []byte{0xDF, 0xBF}), 5, 2, 1, 1), + withPos(newTokenDefault(1, 1, []byte{0x0A}), 7, 1, 1, 2), + + withPos(newTokenDefault(2, 2, []byte{0xE0, 0xA0, 0x80}), 8, 3, 2, 0), + withPos(newTokenDefault(2, 2, []byte{0xE0, 0xBF, 0xBF}), 11, 3, 2, 1), + withPos(newTokenDefault(2, 2, []byte{0xE1, 0x80, 0x80}), 14, 3, 2, 2), + withPos(newTokenDefault(2, 2, []byte{0xEC, 0xBF, 0xBF}), 17, 3, 2, 3), + withPos(newTokenDefault(2, 2, []byte{0xED, 0x80, 0x80}), 20, 3, 2, 4), + withPos(newTokenDefault(2, 2, []byte{0xED, 0x9F, 0xBF}), 23, 3, 2, 5), + withPos(newTokenDefault(2, 2, []byte{0xEE, 0x80, 0x80}), 26, 3, 2, 6), + withPos(newTokenDefault(2, 2, []byte{0xEF, 0xBF, 0xBF}), 29, 3, 2, 7), + withPos(newTokenDefault(1, 1, []byte{0x0A}), 32, 1, 2, 8), + + withPos(newTokenDefault(2, 2, []byte{0xF0, 0x90, 0x80, 0x80}), 33, 4, 3, 0), + withPos(newTokenDefault(2, 2, []byte{0xF0, 0xBF, 0xBF, 0xBF}), 37, 4, 3, 1), + withPos(newTokenDefault(2, 2, []byte{0xF1, 0x80, 0x80, 0x80}), 41, 4, 3, 2), + withPos(newTokenDefault(2, 2, []byte{0xF3, 0xBF, 0xBF, 0xBF}), 45, 4, 3, 3), + withPos(newTokenDefault(2, 2, []byte{0xF4, 0x80, 0x80, 0x80}), 49, 4, 3, 4), + withPos(newTokenDefault(2, 2, []byte{0xF4, 0x8F, 0xBF, 0xBF}), 53, 4, 3, 5), + // When a token contains multiple line breaks, the driver sets the token position to + // the line number where a lexeme first appears. 
+ withPos(newTokenDefault(1, 1, []byte{0x0A, 0x0A, 0x0A}), 57, 3, 3, 6), + + withPos(newEOFTokenDefault(), 60, 0, 6, 0), + } + + lexer, err := NewLexer(NewLexSpec(clspec), strings.NewReader(src)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + for _, eTok := range expected { + tok, err := lexer.Next() + if err != nil { + t.Fatal(err) + } + + testToken(t, eTok, tok) + + if tok.EOF { + break + } + } +} + +func testToken(t *testing.T, expected, actual *Token) { + t.Helper() + + if actual.ModeID != expected.ModeID || + actual.KindID != expected.KindID || + actual.ModeKindID != expected.ModeKindID || + !bytes.Equal(actual.Lexeme, expected.Lexeme) || + actual.EOF != expected.EOF || + actual.Invalid != expected.Invalid { + t.Fatalf(`unexpected token; want: %+v, got: %+v`, expected, actual) + } + + if actual.BytePos != expected.BytePos || actual.ByteLen != expected.ByteLen || + actual.Row != expected.Row || actual.Col != expected.Col { + t.Fatalf(`unexpected token; want: %+v, got: %+v`, expected, actual) + } +} diff --git a/tests/unit/driver/parser/conflict_test.go b/tests/unit/driver/parser/conflict_test.go new file mode 100644 index 0000000..0bc14d4 --- /dev/null +++ b/tests/unit/driver/parser/conflict_test.go @@ -0,0 +1,524 @@ +package parser + +import ( + "strings" + "testing" + + "urubu/grammar" + "urubu/spec/grammar/parser" +) + +func TestParserWithConflicts(t *testing.T) { + tests := []struct { + caption string + specSrc string + src string + cst *Node + }{ + { + caption: "when a shift/reduce conflict occurred, we prioritize the shift action", + specSrc: ` +#name test; + +expr + : expr assign expr + | id + ; + +id: "[A-Za-z0-9_]+"; +assign: '='; +`, + src: `foo=bar=baz`, + cst: nonTermNode("expr", + nonTermNode("expr", + termNode("id", "foo"), + ), + termNode("assign", "="), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "bar"), + ), + termNode("assign", "="), + nonTermNode("expr", + termNode("id", "baz"), + ), + ), + ), + }, + { + caption: "when a reduce/reduce conflict occurred, we prioritize the production defined earlier in the grammar", + specSrc: ` +#name test; + +s + : a + | b + ; +a + : id + ; +b + : id + ; + +id: "[A-Za-z0-9_]+"; +`, + src: `foo`, + cst: nonTermNode("s", + nonTermNode("a", + termNode("id", "foo"), + ), + ), + }, + { + caption: "left associativities defined earlier in the grammar have higher precedence", + specSrc: ` +#name test; + +#prec ( + #left mul + #left add +); + +expr + : expr add expr + | expr mul expr + | id + ; + +id: "[A-Za-z0-9_]+"; +add: '+'; +mul: '*'; +`, + src: `a+b*c*d+e`, + cst: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("add", "+"), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("mul", "*"), + nonTermNode("expr", + termNode("id", "c"), + ), + ), + termNode("mul", "*"), + nonTermNode("expr", + termNode("id", "d"), + ), + ), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + }, + { + caption: "left associativities defined in the same line have the same precedence", + specSrc: ` +#name test; + +#prec ( + #left add sub +); + +expr + : expr add expr + | expr sub expr + | id + ; + +id: "[A-Za-z0-9_]+"; +add: '+'; +sub: '-'; +`, + src: `a-b+c+d-e`, + cst: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("sub", "-"), + nonTermNode("expr", + termNode("id", "b"), + ), + ), + 
termNode("add", "+"), + nonTermNode("expr", + termNode("id", "c"), + ), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("id", "d"), + ), + ), + termNode("sub", "-"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + }, + { + caption: "right associativities defined earlier in the grammar have higher precedence", + specSrc: ` +#name test; + +#prec ( + #right r1 + #right r2 +); + +expr + : expr r2 expr + | expr r1 expr + | id + ; + +whitespaces #skip + : "[\u{0009}\u{0020}]+"; +r1 + : 'r1'; +r2 + : 'r2'; +id + : "[A-Za-z0-9_]+"; +`, + src: `a r2 b r1 c r1 d r2 e`, + cst: nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("r2", "r2"), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("r1", "r1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "c"), + ), + termNode("r1", "r1"), + nonTermNode("expr", + termNode("id", "d"), + ), + ), + ), + termNode("r2", "r2"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + ), + }, + { + caption: "right associativities defined in the same line have the same precedence", + specSrc: ` +#name test; + +#prec ( + #right r1 r2 +); + +expr + : expr r2 expr + | expr r1 expr + | id + ; + +whitespaces #skip + : "[\u{0009}\u{0020}]+"; +r1 + : 'r1'; +r2 + : 'r2'; +id + : "[A-Za-z0-9_]+"; +`, + src: `a r2 b r1 c r1 d r2 e`, + cst: nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("r2", "r2"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("r1", "r1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "c"), + ), + termNode("r1", "r1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "d"), + ), + termNode("r2", "r2"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + ), + ), + ), + }, + { + caption: "terminal symbols with an #assign directive defined earlier in the grammar have higher precedence", + specSrc: ` +#name test; + +#prec ( + #assign a1 + #assign a2 +); + +expr + : expr a2 expr + | expr a1 expr + | id + ; + +whitespaces #skip + : "[\u{0009}\u{0020}]+"; +a1 + : 'a1'; +a2 + : 'a2'; +id + : "[A-Za-z0-9_]+"; +`, + src: `a a2 b a1 c a1 d a2 e`, + cst: nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("a2", "a2"), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("a1", "a1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "c"), + ), + termNode("a1", "a1"), + nonTermNode("expr", + termNode("id", "d"), + ), + ), + ), + termNode("a2", "a2"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + ), + }, + { + caption: "terminal symbols with an #assign directive defined in the same line have the same precedence", + specSrc: ` +#name test; + +#prec ( + #assign a1 a2 +); + +expr + : expr a2 expr + | expr a1 expr + | id + ; + +whitespaces #skip + : "[\u{0009}\u{0020}]+"; +a1 + : 'a1'; +a2 + : 'a2'; +id + : "[A-Za-z0-9_]+"; +`, + src: `a a2 b a1 c a1 d a2 e`, + cst: nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("a2", "a2"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("a1", "a1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "c"), + ), + termNode("a1", "a1"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "d"), + ), + termNode("a2", "a2"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + ), + ), + ), + }, + { + caption: "#left, #right, and #assign 
can be mixed", + specSrc: ` +#name test; + +#prec ( + #left mul div + #left add sub + #assign else + #assign then + #right assign +); + +expr + : expr add expr + | expr sub expr + | expr mul expr + | expr div expr + | expr assign expr + | if expr then expr + | if expr then expr else expr + | id + ; + +ws #skip: "[\u{0009}\u{0020}]+"; +if: 'if'; +then: 'then'; +else: 'else'; +id: "[A-Za-z0-9_]+"; +add: '+'; +sub: '-'; +mul: '*'; +div: '/'; +assign: '='; +`, + src: `x = y = a + b * c - d / e + if f then if g then h else i`, + cst: nonTermNode( + "expr", + nonTermNode("expr", + termNode("id", "x"), + ), + termNode("assign", "="), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "y"), + ), + termNode("assign", "="), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "a"), + ), + termNode("add", "+"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "b"), + ), + termNode("mul", "*"), + nonTermNode("expr", + termNode("id", "c"), + ), + ), + ), + termNode("sub", "-"), + nonTermNode("expr", + nonTermNode("expr", + termNode("id", "d"), + ), + termNode("div", "/"), + nonTermNode("expr", + termNode("id", "e"), + ), + ), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("if", "if"), + nonTermNode("expr", + termNode("id", "f"), + ), + termNode("then", "then"), + nonTermNode("expr", + termNode("if", "if"), + nonTermNode("expr", + termNode("id", "g"), + ), + termNode("then", "then"), + nonTermNode("expr", + termNode("id", "h"), + ), + termNode("else", "else"), + nonTermNode("expr", + termNode("id", "i"), + ), + ), + ), + ), + ), + ), + }, + } + + for _, tt := range tests { + t.Run(tt.caption, func(t *testing.T) { + ast, err := parser.Parse(strings.NewReader(tt.specSrc)) + if err != nil { + t.Fatal(err) + } + + b := grammar.GrammarBuilder{ + AST: ast, + } + cg, _, err := b.Build() + if err != nil { + t.Fatal(err) + } + + toks, err := NewTokenStream(cg, strings.NewReader(tt.src)) + if err != nil { + t.Fatal(err) + } + + gram := NewGrammar(cg) + tb := NewDefaultSyntaxTreeBuilder() + p, err := NewParser(toks, gram, SemanticAction(NewCSTActionSet(gram, tb))) + if err != nil { + t.Fatal(err) + } + + err = p.Parse() + if err != nil { + t.Fatal(err) + } + + if tt.cst != nil { + testTree(t, tb.Tree(), tt.cst) + } + }) + } +} diff --git a/tests/unit/driver/parser/lac_test.go b/tests/unit/driver/parser/lac_test.go new file mode 100644 index 0000000..c2368e8 --- /dev/null +++ b/tests/unit/driver/parser/lac_test.go @@ -0,0 +1,120 @@ +package parser + +import ( + "strings" + "testing" + + "urubu/grammar" + "urubu/spec/grammar/parser" +) + +func TestParserWithLAC(t *testing.T) { + specSrc := ` +#name test; + +s + : t t + ; +t + : c t + | d + ; + +c: 'c'; +d: 'd'; +` + + src := `ccd` + + actLogWithLAC := []string{ + "shift/c", + "shift/c", + "shift/d", + "miss", + } + + actLogWithoutLAC := []string{ + "shift/c", + "shift/c", + "shift/d", + "reduce/t", + "reduce/t", + "reduce/t", + "miss", + } + + ast, err := parser.Parse(strings.NewReader(specSrc)) + if err != nil { + t.Fatal(err) + } + + b := grammar.GrammarBuilder{ + AST: ast, + } + gram, _, err := b.Build() + if err != nil { + t.Fatal(err) + } + + t.Run("LAC is enabled", func(t *testing.T) { + semAct := &testSemAct{ + gram: gram, + } + + toks, err := NewTokenStream(gram, strings.NewReader(src)) + if err != nil { + t.Fatal(err) + } + + p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct)) + if err != nil { + t.Fatal(err) + } + + err = p.Parse() + if err != nil { + 
t.Fatal(err) + } + + if len(semAct.actLog) != len(actLogWithLAC) { + t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithLAC, semAct.actLog) + } + + for i, e := range actLogWithLAC { + if semAct.actLog[i] != e { + t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithLAC, semAct.actLog) + } + } + }) + + t.Run("LAC is disabled", func(t *testing.T) { + semAct := &testSemAct{ + gram: gram, + } + + toks, err := NewTokenStream(gram, strings.NewReader(src)) + if err != nil { + t.Fatal(err) + } + + p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct), DisableLAC()) + if err != nil { + t.Fatal(err) + } + + err = p.Parse() + if err != nil { + t.Fatal(err) + } + + if len(semAct.actLog) != len(actLogWithoutLAC) { + t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithoutLAC, semAct.actLog) + } + + for i, e := range actLogWithoutLAC { + if semAct.actLog[i] != e { + t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithoutLAC, semAct.actLog) + } + } + }) +} diff --git a/tests/unit/driver/parser/parser_test.go b/tests/unit/driver/parser/parser_test.go new file mode 100644 index 0000000..bca0391 --- /dev/null +++ b/tests/unit/driver/parser/parser_test.go @@ -0,0 +1,833 @@ +package parser + +import ( + "fmt" + "strings" + "testing" + + "urubu/grammar" + "urubu/spec/grammar/parser" +) + +func termNode(kind string, text string, children ...*Node) *Node { + return &Node{ + Type: NodeTypeTerminal, + KindName: kind, + Text: text, + Children: children, + } +} + +func errorNode() *Node { + return &Node{ + Type: NodeTypeError, + KindName: "error", + } +} + +func nonTermNode(kind string, children ...*Node) *Node { + return &Node{ + Type: NodeTypeNonTerminal, + KindName: kind, + Children: children, + } +} + +func TestParser_Parse(t *testing.T) { + tests := []struct { + specSrc string + src string + synErr bool + cst *Node + ast *Node + }{ + { + specSrc: ` +#name test; + +expr + : expr add term + | term + ; +term + : term mul factor + | factor + ; +factor + : l_paren expr r_paren + | id + ; + +add + : '+'; +mul + : '*'; +l_paren + : '('; +r_paren + : ')'; +id + : "[A-Za-z_][0-9A-Za-z_]*"; +`, + src: `(a+(b+c))*d+e`, + cst: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("term", + nonTermNode("term", + nonTermNode("factor", + termNode("l_paren", "("), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("term", + nonTermNode("factor", + termNode("id", "a"), + ), + ), + ), + termNode("add", "+"), + nonTermNode("term", + nonTermNode("factor", + termNode("l_paren", "("), + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("term", + nonTermNode("factor", + termNode("id", "b"), + ), + ), + ), + termNode("add", "+"), + nonTermNode("term", + nonTermNode("factor", + termNode("id", "c"), + ), + ), + ), + termNode("r_paren", ")"), + ), + ), + ), + termNode("r_paren", ")"), + ), + ), + termNode("mul", "*"), + nonTermNode("factor", + termNode("id", "d"), + ), + ), + ), + termNode("add", "+"), + nonTermNode("term", + nonTermNode("factor", + termNode("id", "e"), + ), + ), + ), + }, + // Fragments (\f{}), code point expressions (\u{}), and character property expressions (\p{}) are + // not allowed in string literals. 
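The distinction drawn in the comment above is easiest to see side by side: single-quoted strings are verbatim, while double-quoted strings interpret these escapes. A small spec sketch (the grammar name and kind names are hypothetical, but the syntax matches the tests in this file):

    specSrc := `
    #name escapes_example;

    s
        : lit pat
        ;

    lit
        : '\u{0041}';   // literal: matches the eight characters \u{0041}
    pat
        : "\u{0041}";   // pattern: matches the single character A (U+0041)
    `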
+ { + specSrc: ` +#name test; + +s + : a b c + ; + +a + : '\f{foo}'; +b + : '\u{0000}'; +c + : '\p{gc=Letter}'; +`, + src: `\f{foo}\u{0000}\p{gc=Letter}`, + cst: nonTermNode("s", + termNode("a", `\f{foo}`), + termNode("b", `\u{0000}`), + termNode("c", `\p{gc=Letter}`), + ), + }, + // The driver can reduce productions that have the empty alternative and can generate a CST (and AST) node. + { + specSrc: ` +#name test; + +s + : foo bar + ; +foo + : + ; +bar + : bar_text + | + ; +bar_text: "bar"; +`, + src: ``, + cst: nonTermNode("s", + nonTermNode("foo"), + nonTermNode("bar"), + ), + }, + // The driver can reduce productions that have the empty alternative and can generate a CST (and AST) node. + { + specSrc: ` +#name test; + +s + : foo bar + ; +foo + : + ; +bar + : bar_text + | + ; + +bar_text + : "bar"; +`, + src: `bar`, + cst: nonTermNode("s", + nonTermNode("foo"), + nonTermNode("bar", + termNode("bar_text", "bar"), + ), + ), + }, + // A production can have multiple alternative productions. + { + specSrc: ` +#name test; + +#prec ( + #assign $uminus + #left mul div + #left add sub +); + +expr + : expr add expr + | expr sub expr + | expr mul expr + | expr div expr + | int + | sub int #prec $uminus // This 'sub' means the unary minus symbol. + ; + +int + : "0|[1-9][0-9]*"; +add + : '+'; +sub + : '-'; +mul + : '*'; +div + : '/'; +`, + src: `-1*-2+3-4/5`, + ast: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("sub", "-"), + termNode("int", "1"), + ), + termNode("mul", "*"), + nonTermNode("expr", + termNode("sub", "-"), + termNode("int", "2"), + ), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("int", "3"), + ), + ), + termNode("sub", "-"), + nonTermNode("expr", + nonTermNode("expr", + termNode("int", "4"), + ), + termNode("div", "/"), + nonTermNode("expr", + termNode("int", "5"), + ), + ), + ), + }, + // A lexical production can have multiple production directives. + { + specSrc: ` +#name test; + +s + : push_a push_b pop pop + ; + +push_a #mode default #push a + : '->a'; +push_b #mode a #push b + : '->b'; +pop #mode a b #pop + : '<-'; +`, + src: `->a->b<-<-`, + ast: nonTermNode("s", + termNode("push_a", "->a"), + termNode("push_b", "->b"), + termNode("pop", "<-"), + termNode("pop", "<-"), + ), + }, + { + specSrc: ` +#name test; + +mode_tran_seq + : mode_tran_seq mode_tran + | mode_tran + ; +mode_tran + : push_m1 + | push_m2 + | pop_m1 + | pop_m2 + ; + +push_m1 #push m1 + : "->"; +push_m2 #mode m1 #push m2 + : "-->"; +pop_m1 #mode m1 #pop + : "<-"; +pop_m2 #mode m2 #pop + : "<--"; +whitespace #mode default m1 m2 #skip + : "\u{0020}+"; +`, + src: ` -> --> <-- <- `, + }, + { + specSrc: ` +#name test; + +s + : foo bar + ; + +foo + : "foo"; +bar #mode default + : "bar"; +`, + src: `foobar`, + }, + // When #push and #pop are applied to the same symbol, #pop will run first, then #push. + { + specSrc: ` +#name test; + +s + : foo bar baz + ; + +foo #push m1 + : 'foo'; +bar #mode m1 #pop #push m2 + : 'bar'; +baz #mode m2 + : 'baz'; +`, + src: `foobarbaz`, + ast: nonTermNode("s", + termNode("foo", "foo"), + termNode("bar", "bar"), + termNode("baz", "baz"), + ), + }, + // When #push and #pop are applied to the same symbol, #pop will run first, then #push, even if #push appears first + // in a definition. That is, the order in which #push and #pop appear in grammar has nothing to do with the order in which + // they are executed. 
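Concretely, for the grammar in the next case, where 'foo' pushes m1 and 'bar' carries both #push m2 and #pop, the pop-before-push rule yields this mode-stack trace while lexing `foobarbaz`:

    // [default]       start
    // [default m1]    "foo" runs #push m1
    // [default m2]    "bar" runs #pop first (back to default), then #push m2
    // [default m2]    "baz" matches under #mode m2 and performs no transition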
+ { + specSrc: ` +#name test; + +s + : foo bar baz + ; + +foo #push m1 + : 'foo'; +bar #mode m1 #push m2 #pop + : 'bar'; +baz #mode m2 + : 'baz'; +`, + src: `foobarbaz`, + ast: nonTermNode("s", + termNode("foo", "foo"), + termNode("bar", "bar"), + termNode("baz", "baz"), + ), + }, + // The parser can skip specified tokens. + { + specSrc: ` +#name test; + +s + : foo bar + ; + +foo + : "foo"; +bar + : "bar"; +white_space #skip + : "[\u{0009}\u{0020}]+"; +`, + src: `foo bar`, + }, + // A grammar can contain fragments. + { + specSrc: ` +#name test; + +s + : tagline + ; +tagline + : "\f{words} IS OUT THERE."; +fragment words + : "[A-Za-z\u{0020}]+"; +`, + src: `THE TRUTH IS OUT THERE.`, + }, + // A grammar can contain ast actions. + { + specSrc: ` +#name test; + +list + : l_bracket elems r_bracket #ast elems... + ; +elems + : elems comma id #ast elems... id + | id + ; + +whitespace #skip + : "\u{0020}+"; +l_bracket + : '['; +r_bracket + : ']'; +comma + : ','; +id + : "[A-Za-z]+"; +`, + src: `[Byers, Frohike, Langly]`, + cst: nonTermNode("list", + termNode("x_1", "["), + nonTermNode("elems", + nonTermNode("elems", + nonTermNode("elems", + termNode("id", "Byers"), + ), + termNode("x_3", ","), + termNode("id", "Frohike"), + ), + termNode("x_3", ","), + termNode("id", "Langly"), + ), + termNode("x_2", "]"), + ), + ast: nonTermNode("list", + termNode("id", "Byers"), + termNode("id", "Frohike"), + termNode("id", "Langly"), + ), + }, + // The '...' operator can expand child nodes. + { + specSrc: ` +#name test; + +s + : a #ast a... + ; +a + : a comma foo #ast a... foo + | foo + ; + +comma + : ','; +foo + : 'foo'; +`, + src: `foo,foo,foo`, + ast: nonTermNode("s", + termNode("foo", "foo"), + termNode("foo", "foo"), + termNode("foo", "foo"), + ), + }, + // The '...' operator can also be applied to an element that has no children. + { + specSrc: ` +#name test; + +s + : a semi_colon #ast a... + ; +a + : + ; + +semi_colon + : ';'; +`, + src: `;`, + ast: nonTermNode("s"), + }, + // A label can be a parameter of an #ast directive. + { + specSrc: ` +#name test; + +#prec ( + #left add sub +); + +expr + : expr@lhs add expr@rhs #ast add lhs rhs + | expr@lhs sub expr@rhs #ast sub lhs rhs + | num + ; + +add + : '+'; +sub + : '-'; +num + : "0|[1-9][0-9]*"; +`, + src: `1+2-3`, + ast: nonTermNode("expr", + termNode("sub", "-"), + nonTermNode("expr", + termNode("add", "+"), + nonTermNode("expr", + termNode("num", "1"), + ), + nonTermNode("expr", + termNode("num", "2"), + ), + ), + nonTermNode("expr", + termNode("num", "3"), + ), + ), + }, + // An AST can contain a symbol name, even if the symbol has a label. That is, unused labels are allowed. + { + specSrc: ` +#name test; + +s + : foo@x semi_colon #ast foo + ; + +semi_colon + : ';'; +foo + : 'foo'; +`, + src: `foo;`, + ast: nonTermNode("s", + termNode("foo", "foo"), + ), + }, + // A production has the same precedence and associativity as the right-most terminal symbol. + { + specSrc: ` +#name test; + +#prec ( + #left add +); + +expr + : expr add expr // This alternative has the same precedence and associativity as 'add'. + | int + ; + +ws #skip + : "[\u{0009}\u{0020}]+"; +int + : "0|[1-9][0-9]*"; +add + : '+'; +`, + // This source is recognized as the following structure because the production `expr → expr add expr` has the same + // precedence and associativity as the symbol 'add'.
+ // + // ((1+2)+3) + // + // If the symbol didn't have the precedence and left associativity, the production wouldn't have them + // either, and this source would be recognized as the following structure. + // + // (1+(2+3)) + src: `1+2+3`, + ast: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("int", "1"), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("int", "2"), + ), + ), + termNode("add", "+"), + nonTermNode("expr", + termNode("int", "3"), + ), + ), + }, + // The '#prec' directive can set the precedence of a production. + { + specSrc: ` +#name test; + +#prec ( + #assign $uminus + #left mul div + #left add sub +); + +expr + : expr add expr + | expr sub expr + | expr mul expr + | expr div expr + | int + | sub int #prec $uminus // This 'sub' means a unary minus symbol. + ; + +ws #skip + : "[\u{0009}\u{0020}]+"; +int + : "0|[1-9][0-9]*"; +add + : '+'; +sub + : '-'; +mul + : '*'; +div + : '/'; +`, + // This source is recognized as the following structure because the production `expr → sub int` + // has the `#prec $uminus` directive and therefore takes the precedence of `$uminus`, which is higher than that of `mul`. + // + // (((-1) * 20) / 5) + // + // If the production didn't have the `#prec` directive, this source would be recognized as + // the following structure. + // + // (- ((1 * 20) / 5)) + src: `-1*20/5`, + cst: nonTermNode("expr", + nonTermNode("expr", + nonTermNode("expr", + termNode("sub", "-"), + termNode("int", "1"), + ), + termNode("mul", "*"), + nonTermNode("expr", + termNode("int", "20"), + ), + ), + termNode("div", "/"), + nonTermNode("expr", + termNode("int", "5"), + ), + ), + }, + // The grammar can contain the 'error' symbol. + { + specSrc: ` +#name test; + +s + : id id id semi_colon + | error semi_colon + ; + +ws #skip + : "[\u{0009}\u{0020}]+"; +semi_colon + : ';'; +id + : "[A-Za-z_]+"; +`, + src: `foo bar baz ;`, + }, + // The 'error' symbol can appear in an #ast directive. + { + specSrc: ` +#name test; + +s + : foo semi_colon + | error semi_colon #ast error + ; + +semi_colon + : ';'; +foo + : 'foo'; +`, + src: `bar;`, + synErr: true, + ast: nonTermNode("s", + errorNode(), + ), + }, + // The 'error' symbol can have a label, and an #ast can reference it. + { + specSrc: ` +#name test; + +s + : foo semi_colon + | error@e semi_colon #ast e + ; + +semi_colon + : ';'; +foo + : 'foo'; +`, + src: `bar;`, + synErr: true, + ast: nonTermNode("s", + errorNode(), + ), + }, + // The grammar can contain the '#recover' directive. + { + specSrc: ` +#name test; + +seq + : seq elem + | elem + ; +elem + : id id id semi_colon + | error semi_colon #recover + ; + +ws #skip + : "[\u{0009}\u{0020}]+"; +semi_colon + : ';'; +id + : "[A-Za-z_]+"; +`, + src: `a b c ; d e f ;`, + }, + // The same label can be used between different alternatives.
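Labels ('@name') matter when an alternative mentions the same symbol more than once, as in the earlier `expr@lhs add expr@rhs #ast add lhs rhs` case, and, as the test below shows, a label may also be reused across alternatives. A minimal spec sketch mirroring that test (the grammar name is hypothetical):

    specSrc := `
    #name label_example;

    s
        : foo@x bar #ast x   // 'x' names this occurrence of foo...
        | foo@x     #ast x   // ...and the same label may appear in another alternative
        ;

    foo: 'foo';
    bar: 'bar';
    `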
+ { + specSrc: ` +#name test; + +s + : foo@x bar + | foo@x + ; + +foo: 'foo'; +bar: 'bar'; +`, + src: `foo`, + }, + } + + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { + ast, err := parser.Parse(strings.NewReader(tt.specSrc)) + if err != nil { + t.Fatal(err) + } + + b := grammar.GrammarBuilder{ + AST: ast, + } + cg, _, err := b.Build() + if err != nil { + t.Fatal(err) + } + + toks, err := NewTokenStream(cg, strings.NewReader(tt.src)) + if err != nil { + t.Fatal(err) + } + + gram := NewGrammar(cg) + tb := NewDefaultSyntaxTreeBuilder() + var opt []ParserOption + switch { + case tt.ast != nil: + opt = append(opt, SemanticAction(NewASTActionSet(gram, tb))) + case tt.cst != nil: + opt = append(opt, SemanticAction(NewCSTActionSet(gram, tb))) + } + p, err := NewParser(toks, gram, opt...) + if err != nil { + t.Fatal(err) + } + + err = p.Parse() + if err != nil { + t.Fatal(err) + } + + if !tt.synErr && len(p.SyntaxErrors()) > 0 { + for _, synErr := range p.SyntaxErrors() { + t.Fatalf("unexpected syntax errors occurred: %v", synErr) + } + } + + switch { + case tt.ast != nil: + testTree(t, tb.Tree(), tt.ast) + case tt.cst != nil: + testTree(t, tb.Tree(), tt.cst) + } + }) + } +} + +func testTree(t *testing.T, node, expected *Node) { + t.Helper() + + if node.Type != expected.Type || node.KindName != expected.KindName || node.Text != expected.Text { + t.Fatalf("unexpected node; want: %+v, got: %+v", expected, node) + } + if len(node.Children) != len(expected.Children) { + t.Fatalf("unexpected children; want: %v, got: %v", len(expected.Children), len(node.Children)) + } + for i, c := range node.Children { + testTree(t, c, expected.Children[i]) + } +} diff --git a/tests/unit/driver/parser/semantic_action_test.go b/tests/unit/driver/parser/semantic_action_test.go new file mode 100644 index 0000000..cb3ee70 --- /dev/null +++ b/tests/unit/driver/parser/semantic_action_test.go @@ -0,0 +1,227 @@ +package parser + +import ( + "fmt" + "strings" + "testing" + + "urubu/grammar" + spec "urubu/spec/grammar" + "urubu/spec/grammar/parser" +) + +type testSemAct struct { + gram *spec.CompiledGrammar + actLog []string +} + +func (a *testSemAct) Shift(tok VToken, recovered bool) { + t := a.gram.Syntactic.Terminals[tok.TerminalID()] + if recovered { + a.actLog = append(a.actLog, fmt.Sprintf("shift/%v/recovered", t)) + } else { + a.actLog = append(a.actLog, fmt.Sprintf("shift/%v", t)) + } +} + +func (a *testSemAct) Reduce(prodNum int, recovered bool) { + lhsSym := a.gram.Syntactic.LHSSymbols[prodNum] + lhsText := a.gram.Syntactic.NonTerminals[lhsSym] + if recovered { + a.actLog = append(a.actLog, fmt.Sprintf("reduce/%v/recovered", lhsText)) + } else { + a.actLog = append(a.actLog, fmt.Sprintf("reduce/%v", lhsText)) + } +} + +func (a *testSemAct) Accept() { + a.actLog = append(a.actLog, "accept") +} + +func (a *testSemAct) TrapAndShiftError(cause VToken, popped int) { + a.actLog = append(a.actLog, fmt.Sprintf("trap/%v/shift/error", popped)) +} + +func (a *testSemAct) MissError(cause VToken) { + a.actLog = append(a.actLog, "miss") +} + +func TestParserWithSemanticAction(t *testing.T) { + specSrcWithErrorProd := ` +#name test; + +seq + : seq elem semicolon + | elem semicolon + | error star star semicolon + | error semicolon #recover + ; +elem + : char char char + ; + +ws #skip + : "[\u{0009}\u{0020}]+"; +semicolon + : ';'; +star + : '*'; +char + : "[a-z]"; +` + + specSrcWithoutErrorProd := ` +#name test; + +seq + : seq elem semicolon + | elem semicolon + ; +elem + : char char char + ; + +ws 
#skip
+ : "[\u{0009}\u{0020}]+";
+semicolon
+ : ';';
+char
+ : "[a-z]";
+`
+
+ tests := []struct {
+ caption string
+ specSrc string
+ src string
+ actLog []string
+ }{
+ {
+ caption: "when an input contains no syntax error, the driver calls `Shift`, `Reduce`, and `Accept`.",
+ specSrc: specSrcWithErrorProd,
+ src: `a b c; d e f;`,
+ actLog: []string{
+ "shift/char",
+ "shift/char",
+ "shift/char",
+ "reduce/elem",
+ "shift/semicolon",
+ "reduce/seq",
+
+ "shift/char",
+ "shift/char",
+ "shift/char",
+ "reduce/elem",
+ "shift/semicolon",
+ "reduce/seq",
+
+ "accept",
+ },
+ },
+ {
+ caption: "when a grammar has the `error` symbol, the driver calls `TrapAndShiftError`.",
+ specSrc: specSrcWithErrorProd,
+ src: `a; b !; c d !; e ! * *; h i j;`,
+ actLog: []string{
+ "shift/char",
+ "trap/1/shift/error",
+ "shift/semicolon",
+ "reduce/seq/recovered",
+
+ "shift/char",
+ "trap/2/shift/error",
+ "shift/semicolon",
+ "reduce/seq/recovered",
+
+ "shift/char",
+ "shift/char",
+ "trap/3/shift/error",
+ "shift/semicolon",
+ "reduce/seq/recovered",
+
+ "shift/char",
+ "trap/2/shift/error",
+ "shift/star",
+ "shift/star",
+ // Once the driver has performed three shifts after the trap, it recovers from the error.
+ "shift/semicolon/recovered",
+ "reduce/seq",
+
+ "shift/char",
+ "shift/char",
+ "shift/char",
+ "reduce/elem",
+ "shift/semicolon",
+ "reduce/seq",
+
+ // Even if the input contains syntax errors, the driver calls `Accept` when the input is accepted
+ // according to the error productions.
+ "accept",
+ },
+ },
+ {
+ caption: "when the input doesn't match any error production, the driver calls `MissError`.",
+ specSrc: specSrcWithErrorProd,
+ src: `a !`,
+ actLog: []string{
+ "shift/char",
+ "trap/1/shift/error",
+
+ "miss",
+ },
+ },
+ {
+ caption: "when a syntax error isn't trapped, the driver calls `MissError`.",
+ specSrc: specSrcWithoutErrorProd,
+ src: `a !`,
+ actLog: []string{
+ "shift/char",
+
+ "miss",
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.caption, func(t *testing.T) {
+ ast, err := parser.Parse(strings.NewReader(tt.specSrc))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ b := grammar.GrammarBuilder{
+ AST: ast,
+ }
+ gram, _, err := b.Build()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ toks, err := NewTokenStream(gram, strings.NewReader(tt.src))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ semAct := &testSemAct{
+ gram: gram,
+ }
+ p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ err = p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if len(semAct.actLog) != len(tt.actLog) {
+ t.Fatalf("unexpected action log; want: %+v, got: %+v", tt.actLog, semAct.actLog)
+ }
+
+ for i, e := range tt.actLog {
+ if semAct.actLog[i] != e {
+ t.Fatalf("unexpected action log; want: %+v, got: %+v", tt.actLog, semAct.actLog)
+ }
+ }
+ })
+ }
+}
diff --git a/tests/unit/driver/parser/syntax_error_test.go b/tests/unit/driver/parser/syntax_error_test.go
new file mode 100644
index 0000000..90e5bd2
--- /dev/null
+++ b/tests/unit/driver/parser/syntax_error_test.go
@@ -0,0 +1,306 @@
+package parser
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+ "testing"
+
+ "urubu/grammar"
+ "urubu/spec/grammar/parser"
+)
+
+func TestParserWithSyntaxErrors(t *testing.T) {
+ tests := []struct {
+ caption string
+ specSrc string
+ src string
+ synErrCount int
+ }{
+ {
+ caption: "the parser can report a syntax error",
+ specSrc: `
+#name test;
+
+s
+ : foo
+ ;
+
+foo
+ : 'foo';
+`,
+ src: `bar`,
+ synErrCount: 1,
+ },
+ {
+ caption: "when the parser reduces a production having the #recover 
directive, it will recover from the error state",
+ specSrc: `
+#name test;
+
+seq
+ : seq elem semi_colon
+ | elem semi_colon
+ | error semi_colon #recover
+ ;
+elem
+ : a b c
+ ;
+
+ws #skip
+ : "[\u{0009}\u{0020}]+";
+semi_colon
+ : ';';
+a
+ : 'a';
+b
+ : 'b';
+c
+ : 'c';
+`,
+ src: `!; a!; ab!;`,
+ synErrCount: 3,
+ },
+ {
+ caption: "after the parser shifts the error symbol, input symbols are ignored until a symbol the parser can shift appears",
+ specSrc: `
+#name test;
+
+seq
+ : seq elem semi_colon
+ | elem semi_colon
+ | error semi_colon #recover
+ ;
+elem
+ : a b c
+ ;
+
+ws #skip
+ : "[\u{0009}\u{0020}]+";
+semi_colon
+ : ';';
+a
+ : 'a';
+b
+ : 'b';
+c
+ : 'c';
+`,
+ // After the parser transitions to the error state upon reading the first invalid symbol ('!'),
+ // the second and third invalid symbols ('!') are ignored.
+ src: `! ! !; a!; ab!;`,
+ synErrCount: 3,
+ },
+ {
+ caption: "when the parser performs a shift three times after trapping an error, it recovers from the error state",
+ specSrc: `
+#name test;
+
+seq
+ : seq elem semi_colon
+ | elem semi_colon
+ | error star star semi_colon
+ ;
+elem
+ : a b c
+ ;
+
+ws #skip
+ : "[\u{0009}\u{0020}]+";
+semi_colon
+ : ';';
+star
+ : '*';
+a
+ : 'a';
+b
+ : 'b';
+c
+ : 'c';
+`,
+ src: `!**; a!**; ab!**; abc!`,
+ synErrCount: 4,
+ },
+ }
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
+ ast, err := parser.Parse(strings.NewReader(tt.specSrc))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ b := grammar.GrammarBuilder{
+ AST: ast,
+ }
+ gram, _, err := b.Build()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ toks, err := NewTokenStream(gram, strings.NewReader(tt.src))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ p, err := NewParser(toks, NewGrammar(gram))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ err = p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ synErrs := p.SyntaxErrors()
+ if len(synErrs) != tt.synErrCount {
+ t.Fatalf("unexpected number of syntax errors; want: %v error(s), got: %v error(s)", tt.synErrCount, len(synErrs))
+ }
+ })
+ }
+}
+
+func TestParserWithSyntaxErrorAndExpectedLookahead(t *testing.T) {
+ tests := []struct {
+ caption string
+ specSrc string
+ src string
+ cause string
+ expected []string
+ }{
+ {
+ caption: "the parser reports an expected lookahead symbol",
+ specSrc: `
+#name test;
+
+s
+ : foo
+ ;
+
+foo
+ : 'foo';
+`,
+ src: `bar`,
+ cause: `bar`,
+ expected: []string{
+ "foo",
+ },
+ },
+ {
+ caption: "the parser reports expected lookahead symbols",
+ specSrc: `
+#name test;
+
+s
+ : foo
+ | bar
+ ;
+
+foo
+ : 'foo';
+bar
+ : 'bar';
+`,
+ src: `baz`,
+ cause: `baz`,
+ expected: []string{
+ "foo",
+ "bar",
+ },
+ },
+ {
+ caption: "the parser may report the EOF as an expected lookahead symbol",
+ specSrc: `
+#name test;
+
+s
+ : foo
+ ;
+
+foo
+ : 'foo';
+`,
+ src: `foobar`,
+ cause: `bar`,
+ expected: []string{
+ "<eof>",
+ },
+ },
+ {
+ caption: "the parser may report the EOF and others as expected lookahead symbols",
+ specSrc: `
+#name test;
+
+s
+ : foo
+ |
+ ;
+
+foo
+ : 'foo';
+`,
+ src: `bar`,
+ cause: `bar`,
+ expected: []string{
+ "foo",
+ "<eof>",
+ },
+ },
+ }
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
+ ast, err := parser.Parse(strings.NewReader(tt.specSrc))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ b := grammar.GrammarBuilder{
+ AST: ast,
+ }
+ gram, _, err := b.Build()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ toks, err := NewTokenStream(gram, strings.NewReader(tt.src))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ p, err := NewParser(toks, NewGrammar(gram))
+ if 
err != nil {
+ t.Fatal(err)
+ }
+
+ err = p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ synErrs := p.SyntaxErrors()
+ if synErrs == nil {
+ t.Fatalf("expected one syntax error, but none occurred")
+ }
+ if len(synErrs) != 1 {
+ t.Fatalf("too many syntax errors; want: 1 error, got: %v errors", len(synErrs))
+ }
+ synErr := synErrs[0]
+ if string(synErr.Token.Lexeme()) != tt.cause {
+ t.Fatalf("unexpected lexeme: want: %v, got: %v", tt.cause, string(synErr.Token.Lexeme()))
+ }
+ if len(synErr.ExpectedTerminals) != len(tt.expected) {
+ t.Fatalf("unexpected lookahead symbols: want: %v, got: %v", tt.expected, synErr.ExpectedTerminals)
+ }
+ sort.Slice(tt.expected, func(i, j int) bool {
+ return tt.expected[i] < tt.expected[j]
+ })
+ sort.Slice(synErr.ExpectedTerminals, func(i, j int) bool {
+ return synErr.ExpectedTerminals[i] < synErr.ExpectedTerminals[j]
+ })
+ for i, e := range tt.expected {
+ if synErr.ExpectedTerminals[i] != e {
+ t.Errorf("unexpected lookahead symbol: want: %v, got: %v", e, synErr.ExpectedTerminals[i])
+ }
+ }
+ })
+ }
+}
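All of these tests drive the parser through the same sequence: parse a grammar spec, build a compiled grammar, wrap the input in a token stream, then run the parser and inspect its syntax errors. A minimal standalone sketch of that sequence follows; the calls are the ones the tests themselves use, but the import path "urubu/driver/parser" and the aliases are assumptions, since the tests live inside the driver package and never import it by path.

package main

import (
	"fmt"
	"strings"

	driver "urubu/driver/parser" // assumed import path of the driver package under test
	"urubu/grammar"
	specparser "urubu/spec/grammar/parser" // aliased to avoid clashing with the driver package name
)

func main() {
	const specSrc = `
#name example;

s
    : foo
    ;

foo
    : 'foo';
`
	// Parse the grammar spec and compile it, exactly as the tests do.
	ast, err := specparser.Parse(strings.NewReader(specSrc))
	if err != nil {
		panic(err)
	}
	b := grammar.GrammarBuilder{AST: ast}
	cg, _, err := b.Build()
	if err != nil {
		panic(err)
	}

	// Wrap the input in a token stream and run the parser over it.
	toks, err := driver.NewTokenStream(cg, strings.NewReader(`foo`))
	if err != nil {
		panic(err)
	}
	p, err := driver.NewParser(toks, driver.NewGrammar(cg))
	if err != nil {
		panic(err)
	}
	if err := p.Parse(); err != nil {
		panic(err)
	}

	// Inspect any syntax errors the parser recorded while recovering.
	for _, synErr := range p.SyntaxErrors() {
		fmt.Printf("syntax error on %q: expected one of %v\n",
			synErr.Token.Lexeme(), synErr.ExpectedTerminals)
	}
}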