diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 00:47:12 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 00:48:25 +0900 |
commit | a22b3bfd2a6e394855cb1cac3ae67ad6882980cf (patch) | |
tree | 26366f5397df2dcb9fe3fc952d86424b7bac3ecd /compiler/lexer.go | |
parent | Initial commit (diff) | |
download | tre-a22b3bfd2a6e394855cb1cac3ae67ad6882980cf.tar.gz tre-a22b3bfd2a6e394855cb1cac3ae67ad6882980cf.tar.xz |
Add compiler
The compiler takes a lexical specification expressed by regular expressions and generates a DFA accepting the tokens.
Operators that you can use in the regular expressions are concatenation, alternation, repeat, and grouping.
Diffstat (limited to 'compiler/lexer.go')
-rw-r--r-- | compiler/lexer.go | 120 |
1 files changed, 120 insertions, 0 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go new file mode 100644 index 0000000..f78b920 --- /dev/null +++ b/compiler/lexer.go @@ -0,0 +1,120 @@ +package compiler + +import ( + "bufio" + "fmt" + "io" +) + +type tokenKind string + +const ( + tokenKindChar = tokenKind("char") + tokenKindRepeat = tokenKind("*") + tokenKindAlt = tokenKind("|") + tokenKindGroupOpen = tokenKind("(") + tokenKindGroupClose = tokenKind(")") + tokenKindEOF = tokenKind("eof") +) + +type token struct { + kind tokenKind + char rune +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +type lexer struct { + src *bufio.Reader + lastChar rune + prevChar rune + reachedEOF bool +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + lastChar: nullChar, + prevChar: nullChar, + reachedEOF: false, + } +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return nil, &SyntaxError{ + message: "incompleted escape sequence; unexpected EOF follows \\ character", + } + } + switch { + case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')': + return newToken(tokenKindChar, c), nil + default: + return nil, &SyntaxError{ + message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)), + } + } + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar = l.lastChar + l.lastChar = nullChar + l.reachedEOF = true + return nullChar, true, nil + } + return nullChar, false, err + } + l.prevChar = l.lastChar + l.lastChar = c + return c, false, nil +} + +func (l *lexer) restore() error { + if l.reachedEOF { + l.lastChar = l.prevChar + l.prevChar = nullChar + l.reachedEOF = false + return l.src.UnreadRune() + } + if l.lastChar == nullChar { + return fmt.Errorf("the lexer failed to call restore() because the last character is null") + } + l.lastChar = l.prevChar + l.prevChar = nullChar + return l.src.UnreadRune() +} |