aboutsummaryrefslogtreecommitdiff
path: root/compiler/lexer.go
diff options
context:
space:
mode:
authorRyo Nihei <nihei.dev@gmail.com>2021-02-14 00:47:12 +0900
committerRyo Nihei <nihei.dev@gmail.com>2021-02-14 00:48:25 +0900
commita22b3bfd2a6e394855cb1cac3ae67ad6882980cf (patch)
tree26366f5397df2dcb9fe3fc952d86424b7bac3ecd /compiler/lexer.go
parentInitial commit (diff)
downloadtre-a22b3bfd2a6e394855cb1cac3ae67ad6882980cf.tar.gz
tre-a22b3bfd2a6e394855cb1cac3ae67ad6882980cf.tar.xz
Add compiler
The compiler takes a lexical specification expressed by regular expressions and generates a DFA accepting the tokens. Operators that you can use in the regular expressions are concatenation, alternation, repeat, and grouping.
Diffstat (limited to 'compiler/lexer.go')
-rw-r--r--compiler/lexer.go120
1 file changed, 120 insertions, 0 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
new file mode 100644
index 0000000..f78b920
--- /dev/null
+++ b/compiler/lexer.go
@@ -0,0 +1,120 @@
+package compiler
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+)
+
// tokenKind discriminates the lexical categories recognized by the lexer.
type tokenKind string

// The character, operator, and sentinel token kinds emitted by the lexer.
const (
	tokenKindChar       tokenKind = "char"
	tokenKindRepeat     tokenKind = "*"
	tokenKindAlt        tokenKind = "|"
	tokenKindGroupOpen  tokenKind = "("
	tokenKindGroupClose tokenKind = ")"
	tokenKindEOF        tokenKind = "eof"
)

// token is a single lexeme: its kind and, for tokenKindChar, the rune it
// represents (nullChar for every other kind).
type token struct {
	kind tokenKind
	char rune
}

// nullChar is the placeholder rune carried by tokens that have no
// meaningful character payload.
const nullChar = '\u0000'

// newToken builds a token of the given kind carrying char.
func newToken(kind tokenKind, char rune) *token {
	t := token{
		kind: kind,
		char: char,
	}
	return &t
}

// lexer splits a regular-expression source into tokens. lastChar,
// prevChar, and reachedEOF record just enough read history to support a
// single-step restore().
type lexer struct {
	src        *bufio.Reader
	lastChar   rune
	prevChar   rune
	reachedEOF bool
}

// newLexer wraps src in a buffered reader and returns a lexer with an
// empty read history.
func newLexer(src io.Reader) *lexer {
	l := &lexer{
		src: bufio.NewReader(src),
	}
	l.lastChar = nullChar
	l.prevChar = nullChar
	l.reachedEOF = false
	return l
}
+
+func (l *lexer) next() (*token, error) {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return newToken(tokenKindEOF, nullChar), nil
+ }
+
+ switch c {
+ case '*':
+ return newToken(tokenKindRepeat, nullChar), nil
+ case '|':
+ return newToken(tokenKindAlt, nullChar), nil
+ case '(':
+ return newToken(tokenKindGroupOpen, nullChar), nil
+ case ')':
+ return newToken(tokenKindGroupClose, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return nil, &SyntaxError{
+ message: "incompleted escape sequence; unexpected EOF follows \\ character",
+ }
+ }
+ switch {
+ case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')':
+ return newToken(tokenKindChar, c), nil
+ default:
+ return nil, &SyntaxError{
+ message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)),
+ }
+ }
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
+func (l *lexer) read() (rune, bool, error) {
+ c, _, err := l.src.ReadRune()
+ if err != nil {
+ if err == io.EOF {
+ l.prevChar = l.lastChar
+ l.lastChar = nullChar
+ l.reachedEOF = true
+ return nullChar, true, nil
+ }
+ return nullChar, false, err
+ }
+ l.prevChar = l.lastChar
+ l.lastChar = c
+ return c, false, nil
+}
+
+func (l *lexer) restore() error {
+ if l.reachedEOF {
+ l.lastChar = l.prevChar
+ l.prevChar = nullChar
+ l.reachedEOF = false
+ return l.src.UnreadRune()
+ }
+ if l.lastChar == nullChar {
+ return fmt.Errorf("the lexer failed to call restore() because the last character is null")
+ }
+ l.lastChar = l.prevChar
+ l.prevChar = nullChar
+ return l.src.UnreadRune()
+}