From a22b3bfd2a6e394855cb1cac3ae67ad6882980cf Mon Sep 17 00:00:00 2001
From: Ryo Nihei <nihei.dev@gmail.com>
Date: Sun, 14 Feb 2021 00:47:12 +0900
Subject: Add compiler

The compiler takes a lexical specification expressed by regular expressions and generates a DFA accepting the tokens.
The operators available in the regular expressions are concatenation, alternation, repetition, and grouping.
---
 compiler/lexer_test.go | 105 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 compiler/lexer_test.go

(limited to 'compiler/lexer_test.go')

diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
new file mode 100644
index 0000000..b172ae9
--- /dev/null
+++ b/compiler/lexer_test.go
@@ -0,0 +1,105 @@
+package compiler
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+)
+
+// TestLexer feeds each source pattern to the lexer and checks the resulting token
+// sequence, or the type of the error that terminates it, against the expectation.
+func TestLexer(t *testing.T) {
+	tests := []struct {
+		caption string
+		src     string
+		tokens  []*token
+		err     error
+	}{
+		{
+			caption: "lexer can recognize ordinary characters",
+			src:     "123abcいろは",
+			tokens: []*token{
+				newToken(tokenKindChar, '1'),
+				newToken(tokenKindChar, '2'),
+				newToken(tokenKindChar, '3'),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindChar, 'b'),
+				newToken(tokenKindChar, 'c'),
+				newToken(tokenKindChar, 'い'),
+				newToken(tokenKindChar, 'ろ'),
+				newToken(tokenKindChar, 'は'),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the special characters",
+			src:     "*|()",
+			tokens: []*token{
+				newToken(tokenKindRepeat, nullChar),
+				newToken(tokenKindAlt, nullChar),
+				newToken(tokenKindGroupOpen, nullChar),
+				newToken(tokenKindGroupClose, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the escape sequences",
+			src:     "\\\\\\*\\|\\(\\)",
+			tokens: []*token{
+				newToken(tokenKindChar, '\\'),
+				newToken(tokenKindChar, '*'),
+				newToken(tokenKindChar, '|'),
+				newToken(tokenKindChar, '('),
+				newToken(tokenKindChar, ')'),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer raises an error when an invalid escape sequence appears",
+			src:     "\\@",
+			err:     &SyntaxError{},
+		},
+		{
+			caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
+			src:     "\\",
+			err:     &SyntaxError{},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.caption, func(t *testing.T) {
+			lex := newLexer(strings.NewReader(tt.src))
+			var err error
+			var tok *token
+			i := 0
+			for {
+				tok, err = lex.next()
+				if err != nil {
+					break
+				}
+				// Running past the expected tokens must fail rather than end the loop quietly.
+				if i >= len(tt.tokens) {
+					t.Fatalf("unexpected token; want: no more tokens (error type: %T), got: %v", tt.err, tok)
+				}
+				testToken(t, tok, tt.tokens[i])
+				i++
+				if tok.kind == tokenKindEOF {
+					break
+				}
+			}
+			// Only the dynamic type of the error is compared, never its contents.
+			if got, want := reflect.TypeOf(err), reflect.TypeOf(tt.err); got != want {
+				t.Fatalf("unexpected error type; want: %v, got: %v", want, got)
+			}
+			if i < len(tt.tokens) {
+				t.Fatal("expected more tokens")
+			}
+		})
+	}
+}
+
+// testToken fails the test at once if the actual token a and the expected token e differ in kind or char.
+func testToken(t *testing.T, a, e *token) {
+	if t.Helper(); a.kind != e.kind || a.char != e.char {
+		t.Fatalf("unexpected token; want: %v, got: %v", e, a)
+	}
+}
-- 
cgit v1.2.3