about summary refs log tree commit diff
path: root/spec/grammar/parser/lexer_test.go
diff options
context:
space:
mode:
Diffstat (limited to 'spec/grammar/parser/lexer_test.go')
-rw-r--r--  spec/grammar/parser/lexer_test.go  236
1 files changed, 236 insertions, 0 deletions
diff --git a/spec/grammar/parser/lexer_test.go b/spec/grammar/parser/lexer_test.go
new file mode 100644
index 0000000..64c2aa2
--- /dev/null
+++ b/spec/grammar/parser/lexer_test.go
@@ -0,0 +1,236 @@
+package parser
+
+import (
+ "strings"
+ "testing"
+
+ verr "github.com/nihei9/vartan/error"
+)
+
+func TestLexer_Run(t *testing.T) {
+ idTok := func(text string) *token {
+ return newIDToken(text, newPosition(1, 0))
+ }
+
+ termPatTok := func(text string) *token {
+ return newTerminalPatternToken(text, newPosition(1, 0))
+ }
+
+ strTok := func(text string) *token {
+ return newStringLiteralToken(text, newPosition(1, 0))
+ }
+
+ symTok := func(kind tokenKind) *token {
+ return newSymbolToken(kind, newPosition(1, 0))
+ }
+
+ invalidTok := func(text string) *token {
+ return newInvalidToken(text, newPosition(1, 0))
+ }
+
+ tests := []struct {
+ caption string
+ src string
+ tokens []*token
+ err error
+ }{
+ {
+ caption: "the lexer can recognize all kinds of tokens",
+ src: `id"terminal"'string':|;@...#$()`,
+ tokens: []*token{
+ idTok("id"),
+ termPatTok("terminal"),
+ strTok(`string`),
+ symTok(tokenKindColon),
+ symTok(tokenKindOr),
+ symTok(tokenKindSemicolon),
+ symTok(tokenKindLabelMarker),
+ symTok(tokenKindExpantion),
+ symTok(tokenKindDirectiveMarker),
+ symTok(tokenKindOrderedSymbolMarker),
+ symTok(tokenKindLParen),
+ symTok(tokenKindRParen),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "the lexer can recognize keywords",
+ src: `fragment`,
+ tokens: []*token{
+ symTok(tokenKindKWFragment),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "the lexer can recognize character sequences and escape sequences in a terminal",
+ src: `"abc\"\\"`,
+ tokens: []*token{
+ termPatTok(`abc"\\`),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "backslashes are recognized as they are because escape sequences are not allowed in strings",
+ src: `'\\\'`,
+ tokens: []*token{
+ strTok(`\\\`),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "a pattern must include at least one character",
+ src: `""`,
+ err: synErrEmptyPattern,
+ },
+ {
+ caption: "a string must include at least one character",
+ src: `''`,
+ err: synErrEmptyString,
+ },
+ {
+ caption: "the lexer can recognize newlines and combine consecutive newlines into one",
+ src: "\u000A | \u000D | \u000D\u000A | \u000A\u000A \u000D\u000D \u000D\u000A\u000D\u000A",
+ tokens: []*token{
+ symTok(tokenKindNewline),
+ symTok(tokenKindOr),
+ symTok(tokenKindNewline),
+ symTok(tokenKindOr),
+ symTok(tokenKindNewline),
+ symTok(tokenKindOr),
+ symTok(tokenKindNewline),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "the lexer ignores line comments",
+ src: `
+// This is the first comment.
+foo
+// This is the second comment.
+// This is the third comment.
+bar // This is the fourth comment.
+`,
+ tokens: []*token{
+ symTok(tokenKindNewline),
+ idTok("foo"),
+ symTok(tokenKindNewline),
+ idTok("bar"),
+ symTok(tokenKindNewline),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "an identifier cannot contain the capital-case letters",
+ src: `Abc`,
+ err: synErrIDInvalidChar,
+ },
+ {
+ caption: "an identifier cannot contain the capital-case letters",
+ src: `Zyx`,
+ err: synErrIDInvalidChar,
+ },
+ {
+ caption: "the underscore cannot be placed at the beginning of an identifier",
+ src: `_abc`,
+ err: synErrIDInvalidUnderscorePos,
+ },
+ {
+ caption: "the underscore cannot be placed at the end of an identifier",
+ src: `abc_`,
+ err: synErrIDInvalidUnderscorePos,
+ },
+ {
+ caption: "the underscore cannot be placed consecutively",
+ src: `a__b`,
+ err: synErrIDConsecutiveUnderscores,
+ },
+ {
+ caption: "the digits cannot be placed at the biginning of an identifier",
+ src: `0abc`,
+ err: synErrIDInvalidDigitsPos,
+ },
+ {
+ caption: "the digits cannot be placed at the biginning of an identifier",
+ src: `9abc`,
+ err: synErrIDInvalidDigitsPos,
+ },
+ {
+ caption: "an unclosed terminal is not a valid token",
+ src: `"abc`,
+ err: synErrUnclosedTerminal,
+ },
+ {
+ caption: "an incompleted escape sequence in a pattern is not a valid token",
+ src: `"\`,
+ err: synErrIncompletedEscSeq,
+ },
+ {
+ caption: "an unclosed string is not a valid token",
+ src: `'abc`,
+ err: synErrUnclosedString,
+ },
+ {
+ caption: "the lexer can recognize valid tokens following an invalid token",
+ src: `abc!!!def`,
+ tokens: []*token{
+ idTok("abc"),
+ invalidTok("!!!"),
+ idTok("def"),
+ newEOFToken(),
+ },
+ },
+ {
+ caption: "the lexer skips white spaces",
+ // \u0009: HT
+ // \u0020: SP
+ src: "a\u0009b\u0020c",
+ tokens: []*token{
+ idTok("a"),
+ idTok("b"),
+ idTok("c"),
+ newEOFToken(),
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.caption, func(t *testing.T) {
+ l, err := newLexer(strings.NewReader(tt.src))
+ if err != nil {
+ t.Fatal(err)
+ }
+ n := 0
+ for {
+ var tok *token
+ tok, err = l.next()
+ if err != nil {
+ break
+ }
+ testToken(t, tok, tt.tokens[n])
+ n++
+ if tok.kind == tokenKindEOF {
+ break
+ }
+ }
+ if tt.err != nil {
+ synErr, ok := err.(*verr.SpecError)
+ if !ok {
+ t.Fatalf("unexpected error; want: %v, got: %v", tt.err, err)
+ }
+ if tt.err != synErr.Cause {
+ t.Fatalf("unexpected error; want: %v, got: %v", tt.err, synErr.Cause)
+ }
+ } else {
+ if err != nil {
+ t.Fatalf("unexpected error; want: %v, got: %v", tt.err, err)
+ }
+ }
+ })
+ }
+}
+
+func testToken(t *testing.T, tok, expected *token) {
+ t.Helper()
+ if tok.kind != expected.kind || tok.text != expected.text {
+ t.Fatalf("unexpected token; want: %+v, got: %+v", expected, tok)
+ }
+}