From a22b3bfd2a6e394855cb1cac3ae67ad6882980cf Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Sun, 14 Feb 2021 00:47:12 +0900 Subject: Add compiler The compiler takes a lexical specification expressed by regular expressions and generates a DFA accepting the tokens. Operators that you can use in the regular expressions are concatenation, alternation, repeat, and grouping. --- compiler/lexer_test.go | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 compiler/lexer_test.go (limited to 'compiler/lexer_test.go') diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go new file mode 100644 index 0000000..b172ae9 --- /dev/null +++ b/compiler/lexer_test.go @@ -0,0 +1,105 @@ +package compiler + +import ( + "reflect" + "strings" + "testing" +) + +func TestLexer(t *testing.T) { + tests := []struct { + caption string + src string + tokens []*token + err error + }{ + { + caption: "lexer can recognize ordinaly characters", + src: "123abcいろは", + tokens: []*token{ + newToken(tokenKindChar, '1'), + newToken(tokenKindChar, '2'), + newToken(tokenKindChar, '3'), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, 'b'), + newToken(tokenKindChar, 'c'), + newToken(tokenKindChar, 'い'), + newToken(tokenKindChar, 'ろ'), + newToken(tokenKindChar, 'は'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters", + src: "*|()", + tokens: []*token{ + newToken(tokenKindRepeat, nullChar), + newToken(tokenKindAlt, nullChar), + newToken(tokenKindGroupOpen, nullChar), + newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences", + src: "\\\\\\*\\|\\(\\)", + tokens: []*token{ + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "\\@", + err: &SyntaxError{}, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "\\", + err: &SyntaxError{}, + }, + } + for _, tt := range tests { + t.Run(tt.caption, func(t *testing.T) { + lex := newLexer(strings.NewReader(tt.src)) + var err error + var tok *token + i := 0 + for { + tok, err = lex.next() + if err != nil { + break + } + if i >= len(tt.tokens) { + break + } + eTok := tt.tokens[i] + i++ + testToken(t, tok, eTok) + + if tok.kind == tokenKindEOF { + break + } + } + ty := reflect.TypeOf(err) + eTy := reflect.TypeOf(tt.err) + if ty != eTy { + t.Fatalf("unexpected error type; want: %v, got: %v", eTy, ty) + } + if i < len(tt.tokens) { + t.Fatalf("expecte more tokens") + } + }) + } +} + +func testToken(t *testing.T, a, e *token) { + t.Helper() + if e.kind != a.kind || e.char != a.char { + t.Fatalf("unexpected token; want: %v, got: %v", e, a) + } +} -- cgit v1.2.3