diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 01:16:29 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 01:16:29 +0900 |
commit | b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c (patch) | |
tree | d991027b9a00fe1a610a1ca3192c64db35481035 /driver/lexer.go | |
parent | Add compiler (diff) | |
download | tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.gz tre-b451c9fb7ef92c7fdbbb4ca4afc3794d7080041c.tar.xz |
Add driver
The driver takes a DFA and an input text and generates a lexer. The lexer tokenizes the input text according to the lexical specification that the DFA expresses.
Diffstat (limited to 'driver/lexer.go')
-rw-r--r-- | driver/lexer.go | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/driver/lexer.go b/driver/lexer.go new file mode 100644 index 0000000..710d54d --- /dev/null +++ b/driver/lexer.go @@ -0,0 +1,162 @@ +package driver + +import ( + "fmt" + "io" + "io/ioutil" + + "github.com/nihei9/maleeni/compiler" +) + +type Token struct { + ID int + Match []byte + EOF bool + Invalid bool +} + +func newToken(id int, match []byte) *Token { + return &Token{ + ID: id, + Match: match, + } +} + +func newEOFToken() *Token { + return &Token{ + ID: 0, + EOF: true, + } +} + +func newInvalidToken(match []byte) *Token { + return &Token{ + ID: 0, + Match: match, + Invalid: true, + } +} + +type lexer struct { + tranTab *compiler.TransitionTable + src []byte + srcPtr int + tokBuf []*Token +} + +func NewLexer(tranTab *compiler.TransitionTable, src io.Reader) (*lexer, error) { + b, err := ioutil.ReadAll(src) + if err != nil { + return nil, err + } + return &lexer{ + tranTab: tranTab, + src: b, + srcPtr: 0, + }, nil +} + +func (l *lexer) Next() (*Token, error) { + if len(l.tokBuf) > 0 { + tok := l.tokBuf[0] + l.tokBuf = l.tokBuf[1:] + return tok, nil + } + + tok, err := l.next() + if err != nil { + return nil, err + } + if !tok.Invalid { + return tok, nil + } + errTok := tok + for { + tok, err = l.next() + if err != nil { + return nil, err + } + if !tok.Invalid { + break + } + errTok.Match = append(errTok.Match, tok.Match...) + } + l.tokBuf = append(l.tokBuf, tok) + return errTok, nil +} + +func (l *lexer) Peek1() (*Token, error) { + return l.peekN(0) +} + +func (l *lexer) Peek2() (*Token, error) { + return l.peekN(1) +} + +func (l *lexer) Peek3() (*Token, error) { + return l.peekN(2) +} + +func (l *lexer) peekN(n int) (*Token, error) { + if n < 0 || n > 2 { + return nil, fmt.Errorf("peekN() can handle only [0..2]") + } + for len(l.tokBuf) < n+1 { + tok, err := l.next() + if err != nil { + return nil, err + } + l.tokBuf = append(l.tokBuf, tok) + } + return l.tokBuf[n], nil +} + +func (l *lexer) next() (*Token, error) { + state := l.tranTab.InitialState + buf := []byte{} + unfixedBufLen := 0 + var tok *Token + for { + v, eof := l.read() + if eof { + if tok != nil { + l.unread(unfixedBufLen) + return tok, nil + } + return newEOFToken(), nil + } + buf = append(buf, v) + unfixedBufLen++ + entry := l.tranTab.Transition[state] + if len(entry) == 0 { + return nil, fmt.Errorf("no transition entry; state: %v", state) + } + nextState := entry[v] + if nextState == 0 { + if tok != nil { + l.unread(unfixedBufLen) + return tok, nil + } + return newInvalidToken(buf), nil + } + state = nextState + id, ok := l.tranTab.AcceptingStates[state] + if ok { + tok = newToken(id, buf) + unfixedBufLen = 0 + } + } +} + +func (l *lexer) read() (byte, bool) { + if l.srcPtr >= len(l.src) { + return 0, true + } + b := l.src[l.srcPtr] + l.srcPtr++ + return b, false +} + +func (l *lexer) unread(n int) { + l.srcPtr -= n +} |