aboutsummaryrefslogtreecommitdiff
path: root/compiler
diff options
context:
space:
mode:
author    Ryo Nihei <nihei.dev@gmail.com>    2021-02-14 20:19:22 +0900
committer Ryo Nihei <nihei.dev@gmail.com>    2021-02-14 20:19:22 +0900
commit467f223668d13ffa42679e6c928d82d5d402d87d (patch)
treeb518082b8af4d79b19b41e8beacc085d6f0cf84f /compiler
parentAdd dot symbol matching any single character (diff)
downloadtre-467f223668d13ffa42679e6c928d82d5d402d87d.tar.gz
tre-467f223668d13ffa42679e6c928d82d5d402d87d.tar.xz
Add bracket expression matching specified character
The bracket expression matches any single character specified in it. In the bracket expression, the special characters like ., *, and so on are also handled as normal characters.
Diffstat (limited to 'compiler')
-rw-r--r--  compiler/lexer.go       | 55
-rw-r--r--  compiler/lexer_test.go  | 32
-rw-r--r--  compiler/parser.go      | 31
3 files changed, 109 insertions, 9 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 1c09260..3e3cf35 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -15,6 +15,8 @@ const (
tokenKindAlt = tokenKind("|")
tokenKindGroupOpen = tokenKind("(")
tokenKindGroupClose = tokenKind(")")
+ tokenKindBExpOpen = tokenKind("[")
+ tokenKindBExpClose = tokenKind("]")
tokenKindEOF = tokenKind("eof")
)
@@ -32,11 +34,19 @@ func newToken(kind tokenKind, char rune) *token {
}
}
+type lexerMode string
+
+const (
+ lexerModeDefault = lexerMode("default")
+ lexerModeBExp = lexerMode("bracket expression")
+)
+
type lexer struct {
src *bufio.Reader
lastChar rune
prevChar rune
reachedEOF bool
+ mode lexerMode
}
func newLexer(src io.Reader) *lexer {
@@ -45,6 +55,7 @@ func newLexer(src io.Reader) *lexer {
lastChar: nullChar,
prevChar: nullChar,
reachedEOF: false,
+ mode: lexerModeDefault,
}
}
@@ -57,6 +68,15 @@ func (l *lexer) next() (*token, error) {
return newToken(tokenKindEOF, nullChar), nil
}
+ switch l.mode {
+ case lexerModeBExp:
+ return l.nextInBExp(c)
+ default:
+ return l.nextInDefault(c)
+ }
+}
+
+func (l *lexer) nextInDefault(c rune) (*token, error) {
switch c {
case '*':
return newToken(tokenKindRepeat, nullChar), nil
@@ -68,6 +88,39 @@ func (l *lexer) next() (*token, error) {
return newToken(tokenKindGroupOpen, nullChar), nil
case ')':
return newToken(tokenKindGroupClose, nullChar), nil
+ case '[':
+ l.mode = lexerModeBExp
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ case ']':
+ return newToken(tokenKindBExpClose, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return nil, &SyntaxError{
+ message: "incompleted escape sequence; unexpected EOF follows \\ character",
+ }
+ }
+ switch {
+ case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']':
+ return newToken(tokenKindChar, c), nil
+ default:
+ return nil, &SyntaxError{
+ message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)),
+ }
+ }
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
+func (l *lexer) nextInBExp(c rune) (*token, error) {
+ switch c {
+ case ']':
+ l.mode = lexerModeDefault
+ return newToken(tokenKindBExpClose, nullChar), nil
case '\\':
c, eof, err := l.read()
if err != nil {
@@ -79,7 +132,7 @@ func (l *lexer) next() (*token, error) {
}
}
switch {
- case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')':
+ case c == '\\' || c == ']':
return newToken(tokenKindChar, c), nil
default:
return nil, &SyntaxError{
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index 75770b0..11cf043 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,19 +31,21 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the special characters",
- src: ".*|()",
+ src: ".*|()[]",
tokens: []*token{
newToken(tokenKindAnyChar, nullChar),
newToken(tokenKindRepeat, nullChar),
newToken(tokenKindAlt, nullChar),
newToken(tokenKindGroupOpen, nullChar),
newToken(tokenKindGroupClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
{
caption: "lexer can recognize the escape sequences",
- src: "\\\\\\.\\*\\|\\(\\)",
+ src: "\\\\\\.\\*\\|\\(\\)\\[\\]",
tokens: []*token{
newToken(tokenKindChar, '\\'),
newToken(tokenKindChar, '.'),
@@ -51,6 +53,32 @@ func TestLexer(t *testing.T) {
newToken(tokenKindChar, '|'),
newToken(tokenKindChar, '('),
newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindChar, ']'),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "in a bracket expression, the special characters are also handled as normal characters",
+ src: "[\\\\.*|()[\\]].*|()][",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
+ newToken(tokenKindChar, '*'),
+ newToken(tokenKindChar, '|'),
+ newToken(tokenKindChar, '('),
+ newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindChar, ']'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindAnyChar, nullChar),
+ newToken(tokenKindRepeat, nullChar),
+ newToken(tokenKindAlt, nullChar),
+ newToken(tokenKindGroupOpen, nullChar),
+ newToken(tokenKindGroupClose, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
diff --git a/compiler/parser.go b/compiler/parser.go
index 03dc198..ede601d 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -155,9 +155,31 @@ func (p *parser) parseGroup() astNode {
defer p.expect(tokenKindGroupClose)
return p.parseAlt()
}
+ return p.parseSingleChar()
+}
+
+func (p *parser) parseSingleChar() astNode {
if p.consume(tokenKindAnyChar) {
return genAnyCharAST(p.lastTok)
}
+ if p.consume(tokenKindBExpOpen) {
+ defer p.expect(tokenKindBExpClose)
+ left := p.parseNormalChar()
+ if left == nil {
+ raiseSyntaxError("bracket expression must include at least one character")
+ }
+ for {
+ right := p.parseNormalChar()
+ if right == nil {
+ break
+ }
+ left = newAltNode(left, right)
+ }
+ return left
+ }
+ return p.parseNormalChar()
+}
+func (p *parser) parseNormalChar() astNode {
if !p.consume(tokenKindChar) {
return nil
}
@@ -303,12 +325,9 @@ func (p *parser) consume(expected tokenKind) bool {
tok = p.peekedTok
p.peekedTok = nil
} else {
- for {
- tok, err = p.lex.next()
- if err != nil {
- panic(err)
- }
- break
+ tok, err = p.lex.next()
+ if err != nil {
+ panic(err)
}
}
p.lastTok = tok