author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 01:16:29 +0900
---|---|---
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 01:16:29 +0900
commit | b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c (patch) |
tree | d991027b9a00fe1a610a1ca3192c64db35481035 /driver |
parent | Add compiler (diff) |
download | tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.gz tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.xz |
Add driver
The driver takes a DFA and an input text and generates a lexer; the lexer tokenizes the input according to the lexical specification that the DFA expresses.
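As a rough usage sketch, a caller wires the compiler and the driver together as follows. The calls (compiler.Compile, compiler.GenTransitionTable, NewLexer, Next) mirror this commit's test code; the import path of the driver package is an assumption based on the module layout.

package main

import (
	"fmt"
	"strings"

	"github.com/nihei9/maleeni/compiler"
	"github.com/nihei9/maleeni/driver" // assumed import path for this package
)

func main() {
	// Map token IDs to patterns. ID 0 is reserved for EOF and
	// invalid tokens, so user-defined IDs start at 1.
	dfa, err := compiler.Compile(map[int][]byte{
		1: []byte("(a|b)*abb"),
		2: []byte(" *"),
	})
	if err != nil {
		panic(err)
	}
	tranTab, err := compiler.GenTransitionTable(dfa)
	if err != nil {
		panic(err)
	}
	lex, err := driver.NewLexer(tranTab, strings.NewReader("abb aabb"))
	if err != nil {
		panic(err)
	}
	// Read tokens until the lexer reports EOF.
	for {
		tok, err := lex.Next()
		if err != nil {
			panic(err)
		}
		if tok.EOF {
			break
		}
		fmt.Printf("ID: %v, Match: %q\n", tok.ID, tok.Match)
	}
}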
Diffstat (limited to 'driver')
-rw-r--r-- | driver/lexer.go | 162
-rw-r--r-- | driver/lexer_test.go | 147 |
2 files changed, 309 insertions, 0 deletions
diff --git a/driver/lexer.go b/driver/lexer.go
new file mode 100644
index 0000000..710d54d
--- /dev/null
+++ b/driver/lexer.go
@@ -0,0 +1,162 @@
+package driver
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+
+	"github.com/nihei9/maleeni/compiler"
+)
+
+type Token struct {
+	ID      int
+	Match   []byte
+	EOF     bool
+	Invalid bool
+}
+
+func newToken(id int, match []byte) *Token {
+	return &Token{
+		ID:    id,
+		Match: match,
+	}
+}
+
+func newEOFToken() *Token {
+	return &Token{
+		ID:  0,
+		EOF: true,
+	}
+}
+
+func newInvalidToken(match []byte) *Token {
+	return &Token{
+		ID:      0,
+		Match:   match,
+		Invalid: true,
+	}
+}
+
+type lexer struct {
+	tranTab *compiler.TransitionTable
+	src     []byte
+	srcPtr  int
+	tokBuf  []*Token
+}
+
+func NewLexer(tranTab *compiler.TransitionTable, src io.Reader) (*lexer, error) {
+	b, err := ioutil.ReadAll(src)
+	if err != nil {
+		return nil, err
+	}
+	return &lexer{
+		tranTab: tranTab,
+		src:     b,
+		srcPtr:  0,
+	}, nil
+}
+
+func (l *lexer) Next() (*Token, error) {
+	if len(l.tokBuf) > 0 {
+		tok := l.tokBuf[0]
+		l.tokBuf = l.tokBuf[1:]
+		return tok, nil
+	}
+
+	tok, err := l.next()
+	if err != nil {
+		return nil, err
+	}
+	if !tok.Invalid {
+		return tok, nil
+	}
+	errTok := tok
+	for {
+		tok, err = l.next()
+		if err != nil {
+			return nil, err
+		}
+		if !tok.Invalid {
+			break
+		}
+		errTok.Match = append(errTok.Match, tok.Match...)
+	}
+	l.tokBuf = append(l.tokBuf, tok)
+	return errTok, nil
+}
+
+func (l *lexer) Peek1() (*Token, error) {
+	return l.peekN(0)
+}
+
+func (l *lexer) Peek2() (*Token, error) {
+	return l.peekN(1)
+}
+
+func (l *lexer) Peek3() (*Token, error) {
+	return l.peekN(2)
+}
+
+func (l *lexer) peekN(n int) (*Token, error) {
+	if n < 0 || n > 2 {
+		return nil, fmt.Errorf("peekN() can handle only [0..2]")
+	}
+	for len(l.tokBuf) < n+1 {
+		tok, err := l.next()
+		if err != nil {
+			return nil, err
+		}
+		l.tokBuf = append(l.tokBuf, tok)
+	}
+	return l.tokBuf[n], nil
+}
+
+func (l *lexer) next() (*Token, error) {
+	state := l.tranTab.InitialState
+	buf := []byte{}
+	unfixedBufLen := 0
+	var tok *Token
+	for {
+		v, eof := l.read()
+		if eof {
+			if tok != nil {
+				l.unread(unfixedBufLen)
+				return tok, nil
+			}
+			return newEOFToken(), nil
+		}
+		buf = append(buf, v)
+		unfixedBufLen++
+		entry := l.tranTab.Transition[state]
+		if len(entry) == 0 {
+			return nil, fmt.Errorf("no transition entry; state: %v", state)
+		}
+		nextState := entry[v]
+		if nextState == 0 {
+			if tok != nil {
+				l.unread(unfixedBufLen)
+				return tok, nil
+			}
+			return newInvalidToken(buf), nil
+		}
+		state = nextState
+		id, ok := l.tranTab.AcceptingStates[state]
+		if ok {
+			tok = newToken(id, buf)
+			unfixedBufLen = 0
+		}
+	}
+}
+
+func (l *lexer) read() (byte, bool) {
+	if l.srcPtr >= len(l.src) {
+		return 0, true
+	}
+	b := l.src[l.srcPtr]
+	l.srcPtr++
+	return b, false
+}
+
+func (l *lexer) unread(n int) {
+	l.srcPtr -= n
+}
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
new file mode 100644
index 0000000..4fd4bf8
--- /dev/null
+++ b/driver/lexer_test.go
@@ -0,0 +1,147 @@
+package driver
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"github.com/nihei9/maleeni/compiler"
+)
+
+func TestLexer_Next(t *testing.T) {
+	test := []struct {
+		regexps [][]byte
+		src     string
+		tokens  []*Token
+	}{
+		{
+			regexps: [][]byte{
+				[]byte("(a|b)*abb"),
+				[]byte(" *"),
+			},
+			src: "abb aabb aaabb babb bbabb abbbabb",
+			tokens: []*Token{
+				newToken(1, []byte("abb")),
+				newToken(2, []byte(" ")),
+				newToken(1, []byte("aabb")),
+				newToken(2, []byte(" ")),
+				newToken(1, []byte("aaabb")),
+				newToken(2, []byte(" ")),
+				newToken(1, []byte("babb")),
+				newToken(2, []byte(" ")),
+				newToken(1, []byte("bbabb")),
+				newToken(2, []byte(" ")),
+				newToken(1, []byte("abbbabb")),
+				newEOFToken(),
+			},
+		},
+	}
+	for _, tt := range test {
+		res := map[int][]byte{}
+		for i, re := range tt.regexps {
+			res[i+1] = re
+		}
+		dfa, err := compiler.Compile(res)
+		if err != nil {
+			t.Fatalf("unexpected error occurred: %v", err)
+		}
+		tranTab, err := compiler.GenTransitionTable(dfa)
+		if err != nil {
+			t.Fatalf("unexpected error occurred: %v", err)
+		}
+		lexer, err := NewLexer(tranTab, strings.NewReader(tt.src))
+		if err != nil {
+			t.Fatalf("unexpected error occurred: %v", err)
+		}
+		for _, eTok := range tt.tokens {
+			tok, err := lexer.Next()
+			if err != nil {
+				t.Log(err)
+				break
+			}
+			testToken(t, eTok, tok)
+			t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
+			if tok.EOF {
+				break
+			}
+		}
+	}
+}
+
+func TestLexer_PeekN(t *testing.T) {
+	dfa, err := compiler.Compile(map[int][]byte{
+		1: []byte("foo"),
+		2: []byte("bar"),
+	})
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+	tranTab, err := compiler.GenTransitionTable(dfa)
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+	lex, err := NewLexer(tranTab, strings.NewReader("foobar"))
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+
+	expectedTokens := []*Token{
+		{
+			ID:    1,
+			Match: []byte("foo"),
+		},
+		{
+			ID:    2,
+			Match: []byte("bar"),
+		},
+		{
+			EOF: true,
+		},
+	}
+
+	tok, err := lex.Peek1()
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+	if tok == nil {
+		t.Fatalf("token is nil")
+	}
+	testToken(t, expectedTokens[0], tok)
+
+	tok, err = lex.Peek2()
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+	if tok == nil {
+		t.Fatalf("token is nil")
+	}
+	testToken(t, expectedTokens[1], tok)
+
+	tok, err = lex.Peek3()
+	if err != nil {
+		t.Fatalf("unexpected error occurred: %v", err)
+	}
+	if tok == nil {
+		t.Fatalf("token is nil")
+	}
+	testToken(t, expectedTokens[2], tok)
+
+	for _, eTok := range expectedTokens {
+		tok, err = lex.Next()
+		if err != nil {
+			t.Fatalf("unexpected error occurred: %v", err)
+		}
+		if tok == nil {
+			t.Fatalf("token is nil")
+		}
+		testToken(t, eTok, tok)
+	}
+}
+
+func testToken(t *testing.T, expected, actual *Token) {
+	t.Helper()
+
+	if actual.ID != expected.ID || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
+		t.Errorf("unexpected token; want: %v, got: %v", expected, actual)
+	}
+}