diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 20:19:22 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 20:19:22 +0900 |
commit | 467f223668d13ffa42679e6c928d82d5d402d87d (patch) | |
tree | b518082b8af4d79b19b41e8beacc085d6f0cf84f /compiler | |
parent | Add dot symbol matching any single character (diff) | |
download | tre-467f223668d13ffa42679e6c928d82d5d402d87d.tar.gz tre-467f223668d13ffa42679e6c928d82d5d402d87d.tar.xz |
Add bracket expression matching specified characters
The bracket expression matches any single character specified in it. Inside a bracket expression, special characters such as ., *, and so on are treated as ordinary characters.
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/lexer.go | 55 | ||||
-rw-r--r-- | compiler/lexer_test.go | 32 | ||||
-rw-r--r-- | compiler/parser.go | 31 |
3 files changed, 109 insertions, 9 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go index 1c09260..3e3cf35 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -15,6 +15,8 @@ const ( tokenKindAlt = tokenKind("|") tokenKindGroupOpen = tokenKind("(") tokenKindGroupClose = tokenKind(")") + tokenKindBExpOpen = tokenKind("[") + tokenKindBExpClose = tokenKind("]") tokenKindEOF = tokenKind("eof") ) @@ -32,11 +34,19 @@ func newToken(kind tokenKind, char rune) *token { } } +type lexerMode string + +const ( + lexerModeDefault = lexerMode("default") + lexerModeBExp = lexerMode("bracket expression") +) + type lexer struct { src *bufio.Reader lastChar rune prevChar rune reachedEOF bool + mode lexerMode } func newLexer(src io.Reader) *lexer { @@ -45,6 +55,7 @@ func newLexer(src io.Reader) *lexer { lastChar: nullChar, prevChar: nullChar, reachedEOF: false, + mode: lexerModeDefault, } } @@ -57,6 +68,15 @@ func (l *lexer) next() (*token, error) { return newToken(tokenKindEOF, nullChar), nil } + switch l.mode { + case lexerModeBExp: + return l.nextInBExp(c) + default: + return l.nextInDefault(c) + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { switch c { case '*': return newToken(tokenKindRepeat, nullChar), nil @@ -68,6 +88,39 @@ func (l *lexer) next() (*token, error) { return newToken(tokenKindGroupOpen, nullChar), nil case ')': return newToken(tokenKindGroupClose, nullChar), nil + case '[': + l.mode = lexerModeBExp + return newToken(tokenKindBExpOpen, nullChar), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return nil, &SyntaxError{ + message: "incompleted escape sequence; unexpected EOF follows \\ character", + } + } + switch { + case c == '\\' || c == '.' 
|| c == '*' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']': + return newToken(tokenKindChar, c), nil + default: + return nil, &SyntaxError{ + message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)), + } + } + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case ']': + l.mode = lexerModeDefault + return newToken(tokenKindBExpClose, nullChar), nil case '\\': c, eof, err := l.read() if err != nil { @@ -79,7 +132,7 @@ func (l *lexer) next() (*token, error) { } } switch { - case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')': + case c == '\\' || c == ']': return newToken(tokenKindChar, c), nil default: return nil, &SyntaxError{ diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index 75770b0..11cf043 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -31,19 +31,21 @@ func TestLexer(t *testing.T) { }, { caption: "lexer can recognize the special characters", - src: ".*|()", + src: ".*|()[]", tokens: []*token{ newToken(tokenKindAnyChar, nullChar), newToken(tokenKindRepeat, nullChar), newToken(tokenKindAlt, nullChar), newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the escape sequences", - src: "\\\\\\.\\*\\|\\(\\)", + src: "\\\\\\.\\*\\|\\(\\)\\[\\]", tokens: []*token{ newToken(tokenKindChar, '\\'), newToken(tokenKindChar, '.'), @@ -51,6 +53,32 @@ func TestLexer(t *testing.T) { newToken(tokenKindChar, '|'), newToken(tokenKindChar, '('), newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindChar, ']'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "in a bracket expression, the special characters are also handled as normal characters", + src: "[\\\\.*|()[\\]].*|()][", + tokens: []*token{ + 
newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '.'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindChar, ']'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindAnyChar, nullChar), + newToken(tokenKindRepeat, nullChar), + newToken(tokenKindAlt, nullChar), + newToken(tokenKindGroupOpen, nullChar), + newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindEOF, nullChar), }, }, diff --git a/compiler/parser.go b/compiler/parser.go index 03dc198..ede601d 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -155,9 +155,31 @@ func (p *parser) parseGroup() astNode { defer p.expect(tokenKindGroupClose) return p.parseAlt() } + return p.parseSingleChar() +} + +func (p *parser) parseSingleChar() astNode { if p.consume(tokenKindAnyChar) { return genAnyCharAST(p.lastTok) } + if p.consume(tokenKindBExpOpen) { + defer p.expect(tokenKindBExpClose) + left := p.parseNormalChar() + if left == nil { + raiseSyntaxError("bracket expression must include at least one character") + } + for { + right := p.parseNormalChar() + if right == nil { + break + } + left = newAltNode(left, right) + } + return left + } + return p.parseNormalChar() +} +func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil } @@ -303,12 +325,9 @@ func (p *parser) consume(expected tokenKind) bool { tok = p.peekedTok p.peekedTok = nil } else { - for { - tok, err = p.lex.next() - if err != nil { - panic(err) - } - break + tok, err = p.lex.next() + if err != nil { + panic(err) } } p.lastTok = tok |