author     Ryo Nihei <nihei.dev@gmail.com>  2021-02-14 01:16:29 +0900
committer  Ryo Nihei <nihei.dev@gmail.com>  2021-02-14 01:16:29 +0900
commit     b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c (patch)
tree       d991027b9a00fe1a610a1ca3192c64db35481035 /driver
parent     Add compiler (diff)
download   tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.gz
           tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.xz
Add driver
The driver takes a DFA transition table and an input text and constructs a lexer. The lexer tokenizes the input text according to the lexical specification that the DFA expresses.
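For reference, a minimal usage sketch follows. The compiler.Compile and compiler.GenTransitionTable calls mirror driver/lexer_test.go; the driver import path is an assumption based on the module's layout.

    package main

    import (
        "fmt"
        "strings"

        "github.com/nihei9/maleeni/compiler"
        "github.com/nihei9/maleeni/driver"
    )

    func main() {
        // Compile regular expressions (keyed by token ID) into a DFA,
        // then generate its transition table, as in driver/lexer_test.go.
        dfa, err := compiler.Compile(map[int][]byte{
            1: []byte("foo"),
            2: []byte("bar"),
        })
        if err != nil {
            panic(err)
        }
        tranTab, err := compiler.GenTransitionTable(dfa)
        if err != nil {
            panic(err)
        }

        // Feed the transition table and an input text to the driver.
        lex, err := driver.NewLexer(tranTab, strings.NewReader("foobar"))
        if err != nil {
            panic(err)
        }
        for {
            tok, err := lex.Next()
            if err != nil {
                panic(err)
            }
            if tok.EOF {
                break
            }
            fmt.Printf("ID: %v, Match: %q\n", tok.ID, tok.Match)
        }
    }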
Diffstat (limited to 'driver')
-rw-r--r--  driver/lexer.go       | 162
-rw-r--r--  driver/lexer_test.go  | 147
2 files changed, 309 insertions(+), 0 deletions(-)
diff --git a/driver/lexer.go b/driver/lexer.go
new file mode 100644
index 0000000..710d54d
--- /dev/null
+++ b/driver/lexer.go
@@ -0,0 +1,162 @@
+package driver
+
+import (
+ "fmt"
+ "io"
+ "io/ioutil"
+
+ "github.com/nihei9/maleeni/compiler"
+)
+
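+// Token is a lexical token. ID 0 is reserved for the EOF token and for
+// invalid tokens; IDs defined by the lexical specification start at 1.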
+type Token struct {
+ ID int
+ Match []byte
+ EOF bool
+ Invalid bool
+}
+
+func newToken(id int, match []byte) *Token {
+ return &Token{
+ ID: id,
+ Match: match,
+ }
+}
+
+func newEOFToken() *Token {
+ return &Token{
+ ID: 0,
+ EOF: true,
+ }
+}
+
+func newInvalidToken(match []byte) *Token {
+ return &Token{
+ ID: 0,
+ Match: match,
+ Invalid: true,
+ }
+}
+
+type lexer struct {
+ tranTab *compiler.TransitionTable
+ src []byte
+ srcPtr int
+ tokBuf []*Token
+}
+
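+// NewLexer creates a lexer that tokenizes src according to the lexical
+// specification the transition table expresses. It reads all of src into
+// memory up front.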
+func NewLexer(tranTab *compiler.TransitionTable, src io.Reader) (*lexer, error) {
+ b, err := ioutil.ReadAll(src)
+ if err != nil {
+ return nil, err
+ }
+ return &lexer{
+ tranTab: tranTab,
+ src: b,
+ srcPtr: 0,
+ }, nil
+}
+
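+// Next returns the next token. A run of consecutive invalid inputs is
+// merged into a single invalid token, and the valid token that ends the
+// run is buffered for the following call.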
+func (l *lexer) Next() (*Token, error) {
+ if len(l.tokBuf) > 0 {
+ tok := l.tokBuf[0]
+ l.tokBuf = l.tokBuf[1:]
+ return tok, nil
+ }
+
+ tok, err := l.next()
+ if err != nil {
+ return nil, err
+ }
+ if !tok.Invalid {
+ return tok, nil
+ }
+ errTok := tok
+ for {
+ tok, err = l.next()
+ if err != nil {
+ return nil, err
+ }
+ if !tok.Invalid {
+ break
+ }
+ errTok.Match = append(errTok.Match, tok.Match...)
+ }
+ l.tokBuf = append(l.tokBuf, tok)
+ return errTok, nil
+}
+
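+// Peek1, Peek2, and Peek3 return the first, second, and third upcoming
+// tokens without consuming them.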
+func (l *lexer) Peek1() (*Token, error) {
+ return l.peekN(0)
+}
+
+func (l *lexer) Peek2() (*Token, error) {
+ return l.peekN(1)
+}
+
+func (l *lexer) Peek3() (*Token, error) {
+ return l.peekN(2)
+}
+
+func (l *lexer) peekN(n int) (*Token, error) {
+ if n < 0 || n > 2 {
+ return nil, fmt.Errorf("peekN() can handle only [0..2]")
+ }
+ for len(l.tokBuf) < n+1 {
+ tok, err := l.next()
+ if err != nil {
+ return nil, err
+ }
+ l.tokBuf = append(l.tokBuf, tok)
+ }
+ return l.tokBuf[n], nil
+}
+
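+// next scans one token using the longest-match (maximal munch) rule: it
+// follows transitions while remembering the last accepting state, then
+// unreads any bytes consumed past that match.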
+func (l *lexer) next() (*Token, error) {
+ state := l.tranTab.InitialState
+ buf := []byte{}
+ unfixedBufLen := 0
+ var tok *Token
+ for {
+ v, eof := l.read()
+ if eof {
+ if tok != nil {
+ l.unread(unfixedBufLen)
+ return tok, nil
+ }
+ // Report bytes consumed without reaching an accepting state as an
+ // invalid token instead of silently dropping them at EOF.
+ if len(buf) > 0 {
+ return newInvalidToken(buf), nil
+ }
+ return newEOFToken(), nil
+ }
+ buf = append(buf, v)
+ unfixedBufLen++
+ entry := l.tranTab.Transition[state]
+ if len(entry) == 0 {
+ return nil, fmt.Errorf("no transition entry; state: %v", state)
+ }
+ nextState := entry[v]
+ if nextState == 0 {
+ if tok != nil {
+ l.unread(unfixedBufLen)
+ return tok, nil
+ }
+ return newInvalidToken(buf), nil
+ }
+ state = nextState
+ id, ok := l.tranTab.AcceptingStates[state]
+ if ok {
+ tok = newToken(id, buf)
+ unfixedBufLen = 0
+ }
+ }
+}
+
+func (l *lexer) read() (byte, bool) {
+ if l.srcPtr >= len(l.src) {
+ return 0, true
+ }
+ b := l.src[l.srcPtr]
+ l.srcPtr++
+ return b, false
+}
+
+func (l *lexer) unread(n int) {
+ l.srcPtr -= n
+}
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
new file mode 100644
index 0000000..4fd4bf8
--- /dev/null
+++ b/driver/lexer_test.go
@@ -0,0 +1,147 @@
+package driver
+
+import (
+ "bytes"
+ "strings"
+ "testing"
+
+ "github.com/nihei9/maleeni/compiler"
+)
+
+func TestLexer_Next(t *testing.T) {
+ tests := []struct {
+ regexps [][]byte
+ src string
+ tokens []*Token
+ }{
+ {
+ regexps: [][]byte{
+ []byte("(a|b)*abb"),
+ []byte(" *"),
+ },
+ src: "abb aabb aaabb babb bbabb abbbabb",
+ tokens: []*Token{
+ newToken(1, []byte("abb")),
+ newToken(2, []byte(" ")),
+ newToken(1, []byte("aabb")),
+ newToken(2, []byte(" ")),
+ newToken(1, []byte("aaabb")),
+ newToken(2, []byte(" ")),
+ newToken(1, []byte("babb")),
+ newToken(2, []byte(" ")),
+ newToken(1, []byte("bbabb")),
+ newToken(2, []byte(" ")),
+ newToken(1, []byte("abbbabb")),
+ newEOFToken(),
+ },
+ },
+ }
+ for _, tt := range tests {
+ res := map[int][]byte{}
+ for i, re := range tt.regexps {
+ res[i+1] = re
+ }
+ dfa, err := compiler.Compile(res)
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ tranTab, err := compiler.GenTransitionTable(dfa)
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ lexer, err := NewLexer(tranTab, strings.NewReader(tt.src))
+ if err != nil {
+ t.Fatalf("unexpecated error occurred; %v", err)
+ }
+ for _, eTok := range tt.tokens {
+ tok, err := lexer.Next()
+ if err != nil {
+ t.Log(err)
+ break
+ }
+ testToken(t, eTok, tok)
+ t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
+ if tok.EOF {
+ break
+ }
+ }
+ }
+}
+
+func TestLexer_PeekN(t *testing.T) {
+ dfa, err := compiler.Compile(map[int][]byte{
+ 1: []byte("foo"),
+ 2: []byte("bar"),
+ })
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ tranTab, err := compiler.GenTransitionTable(dfa)
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ lex, err := NewLexer(tranTab, strings.NewReader("foobar"))
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+
+ expectedTokens := []*Token{
+ {
+ ID: 1,
+ Match: []byte("foo"),
+ },
+ {
+ ID: 2,
+ Match: []byte("bar"),
+ },
+ {
+ EOF: true,
+ },
+ }
+
+ tok, err := lex.Peek1()
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ if tok == nil {
+ t.Fatalf("token is nil")
+ }
+ testToken(t, expectedTokens[0], tok)
+
+ tok, err = lex.Peek2()
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ if tok == nil {
+ t.Fatalf("token is nil")
+ }
+ testToken(t, expectedTokens[1], tok)
+
+ tok, err = lex.Peek3()
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ if tok == nil {
+ t.Fatalf("token is nil")
+ }
+ testToken(t, expectedTokens[2], tok)
+
+ for _, eTok := range expectedTokens {
+ tok, err = lex.Next()
+ if err != nil {
+ t.Fatalf("unexpected error occurred: %v", err)
+ }
+ if tok == nil {
+ t.Fatalf("token is nil")
+ }
+ testToken(t, eTok, tok)
+ }
+}
+
+func testToken(t *testing.T, expected, actual *Token) {
+ t.Helper()
+
+ if actual.ID != expected.ID || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
+ t.Errorf("unexpected token; want: %v, got: %v", expected, actual)
+ }
+}