aboutsummaryrefslogtreecommitdiff
path: root/compiler/lexer.go
blob: 1c09260b7232092a7d5253a1450014d551262a2b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
package compiler

import (
	"bufio"
	"fmt"
	"io"
)

// tokenKind names the category of a token produced by the lexer.
type tokenKind string

// The token kinds. Apart from tokenKindChar and tokenKindEOF, the string value
// of each kind is the pattern character that next() translates into it.
const (
	tokenKindChar       tokenKind = "char"
	tokenKindAnyChar    tokenKind = "."
	tokenKindRepeat     tokenKind = "*"
	tokenKindAlt        tokenKind = "|"
	tokenKindGroupOpen  tokenKind = "("
	tokenKindGroupClose tokenKind = ")"
	tokenKindEOF        tokenKind = "eof"
)

// token is a single lexical unit produced by the lexer.
type token struct {
	// kind is the category of the token.
	kind tokenKind
	// char is the literal character of a tokenKindChar token; next() sets it
	// to nullChar for every other kind.
	char rune
}

// nullChar is the NUL rune (U+0000). The lexer uses it as a "no character"
// placeholder: it is the char of every token other than tokenKindChar, and the
// initial value of the lexer's lastChar and prevChar fields.
const nullChar = '\u0000'

// newToken allocates a token holding the given kind and character and returns
// a pointer to it.
func newToken(kind tokenKind, char rune) *token {
	t := new(token)
	t.kind = kind
	t.char = char
	return t
}

// lexer reads a pattern one rune at a time from src and turns it into tokens.
type lexer struct {
	// src is the buffered reader the runes are read from.
	src        *bufio.Reader
	// lastChar is the rune returned by the most recent read(); it is nullChar
	// before the first read and after a read that hit EOF.
	lastChar   rune
	// prevChar is the value lastChar held before the most recent read();
	// restore() copies it back into lastChar.
	prevChar   rune
	// reachedEOF is set by read() when it hits the end of src and cleared
	// again by restore().
	reachedEOF bool
}

// newLexer wraps src in a bufio.Reader and returns a lexer in its initial
// state: no character has been read and EOF has not been reached.
func newLexer(src io.Reader) *lexer {
	l := new(lexer)
	l.src = bufio.NewReader(src)
	l.lastChar = nullChar
	l.prevChar = nullChar
	l.reachedEOF = false
	return l
}

// next reads the next token from the source.
//
// End of input yields a tokenKindEOF token. The metacharacters *, ., |, ( and )
// yield their dedicated token kinds with nullChar as the character. A
// backslash escapes the character after it, which must be a backslash or one
// of the metacharacters; the pair yields a tokenKindChar token for the escaped
// character. Any other character yields a tokenKindChar token for itself.
//
// A *SyntaxError is returned when a backslash is followed by EOF or by a
// character that cannot be escaped. Errors from read() are returned unchanged.
func (l *lexer) next() (*token, error) {
	ch, atEOF, err := l.read()
	switch {
	case err != nil:
		return nil, err
	case atEOF:
		return newToken(tokenKindEOF, nullChar), nil
	case ch == '*':
		return newToken(tokenKindRepeat, nullChar), nil
	case ch == '.':
		return newToken(tokenKindAnyChar, nullChar), nil
	case ch == '|':
		return newToken(tokenKindAlt, nullChar), nil
	case ch == '(':
		return newToken(tokenKindGroupOpen, nullChar), nil
	case ch == ')':
		return newToken(tokenKindGroupClose, nullChar), nil
	case ch != '\\':
		return newToken(tokenKindChar, ch), nil
	}

	// ch is a backslash, so the character that follows decides the token.
	escaped, atEOF, err := l.read()
	if err != nil {
		return nil, err
	}
	if atEOF {
		return nil, &SyntaxError{
			message: "incompleted escape sequence; unexpected EOF follows \\ character",
		}
	}

	switch escaped {
	case '\\', '.', '*', '|', '(', ')':
		return newToken(tokenKindChar, escaped), nil
	}
	return nil, &SyntaxError{
		message: fmt.Sprintf("invalid escape sequence '\\%s'", string(escaped)),
	}
}

// read consumes one rune from src and records it in the lexer state.
//
// It returns the rune, a flag that is true when the end of the input was
// reached, and an error for any read failure other than io.EOF. On a
// successful read and on EOF the old lastChar is saved into prevChar; at EOF
// lastChar becomes nullChar and reachedEOF is set. On any other error the
// lexer state is left untouched.
func (l *lexer) read() (rune, bool, error) {
	r, _, err := l.src.ReadRune()
	switch {
	case err == nil:
		l.prevChar, l.lastChar = l.lastChar, r
		return r, false, nil
	case err == io.EOF:
		l.prevChar, l.lastChar = l.lastChar, nullChar
		l.reachedEOF = true
		return nullChar, true, nil
	default:
		return nullChar, false, err
	}
}

// restore undoes the most recent read(), so that the next read() sees the same
// input again. Only one step can be undone: prevChar is reset to nullChar once
// the state has been rolled back.
//
// If the last read() hit EOF, only the lexer state is rolled back. That read
// consumed nothing from src, so there is no rune to push back, and calling
// bufio.Reader.UnreadRune after a failed ReadRune always fails with
// bufio.ErrInvalidUnreadRune.
//
// Otherwise the last rune is pushed back into src. An error is returned when
// lastChar is nullChar (there is nothing to restore) or when UnreadRune fails;
// in both cases the lexer state is left unchanged.
func (l *lexer) restore() error {
	if l.reachedEOF {
		l.lastChar = l.prevChar
		l.prevChar = nullChar
		l.reachedEOF = false
		return nil
	}
	if l.lastChar == nullChar {
		return fmt.Errorf("the lexer failed to call restore() because the last character is null")
	}
	// Push the rune back first so a failure cannot leave the lexer state out
	// of sync with the reader.
	if err := l.src.UnreadRune(); err != nil {
		return err
	}
	l.lastChar = l.prevChar
	l.prevChar = nullChar
	return nil
}