aboutsummaryrefslogtreecommitdiff
path: root/compiler/lexer.go
diff options
context:
space:
mode:
author Ryo Nihei <nihei.dev@gmail.com> 2021-05-25 21:55:17 +0900
committer Ryo Nihei <nihei.dev@gmail.com> 2021-05-25 21:57:45 +0900
commit 520bf02582be7ab36b17fd78f8931cfdb702b07f (patch)
tree a1e7ad54915152fce6f96a18312e28f34f256c84 /compiler/lexer.go
parent Fix the initial state number (diff)
download tre-520bf02582be7ab36b17fd78f8931cfdb702b07f.tar.gz
tre-520bf02582be7ab36b17fd78f8931cfdb702b07f.tar.xz
Add fragment expression
A fragment entry is an entry whose `fragment` field is `true`. It is referenced by a fragment expression (`\f{...}`).
Diffstat (limited to 'compiler/lexer.go')
-rw-r--r-- compiler/lexer.go 74
1 files changed, 70 insertions, 4 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 1aadf50..4f6fdac 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -24,19 +24,22 @@ const (
tokenKindCharRange = tokenKind("-")
tokenKindCodePointLeader = tokenKind("\\u")
tokenKindCharPropLeader = tokenKind("\\p")
+ tokenKindFragmentLeader = tokenKind("\\f")
tokenKindLBrace = tokenKind("{")
tokenKindRBrace = tokenKind("}")
tokenKindEqual = tokenKind("=")
tokenKindCodePoint = tokenKind("code point")
tokenKindCharPropSymbol = tokenKind("character property symbol")
+ tokenKindFragmentSymbol = tokenKind("fragment symbol")
tokenKindEOF = tokenKind("eof")
)
type token struct {
- kind tokenKind
- char rune
- propSymbol string
- codePoint string
+ kind tokenKind
+ char rune
+ propSymbol string
+ codePoint string
+ fragmentSymbol string
}
const nullChar = '\u0000'
@@ -62,6 +65,13 @@ func newCharPropSymbolToken(propSymbol string) *token {
}
}
+func newFragmentSymbolToken(fragmentSymbol string) *token {
+ return &token{
+ kind: tokenKindFragmentSymbol,
+ fragmentSymbol: fragmentSymbol,
+ }
+}
+
type lexerMode string
const (
@@ -69,6 +79,7 @@ const (
lexerModeBExp = lexerMode("bracket expression")
lexerModeCPExp = lexerMode("code point expression")
lexerModeCharPropExp = lexerMode("character property expression")
+ lexerModeFragmentExp = lexerMode("fragment expression")
)
type lexerModeStack struct {
@@ -197,6 +208,16 @@ func (l *lexer) next() (*token, error) {
l.modeStack.pop()
}
return tok, nil
+ case lexerModeFragmentExp:
+ tok, err := l.nextInFragment(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
default:
tok, err := l.nextInDefault(c)
if err != nil {
@@ -213,6 +234,8 @@ func (l *lexer) next() (*token, error) {
l.modeStack.push(lexerModeCPExp)
case tokenKindCharPropLeader:
l.modeStack.push(lexerModeCharPropExp)
+ case tokenKindFragmentLeader:
+ l.modeStack.push(lexerModeFragmentExp)
}
return tok, nil
}
@@ -294,6 +317,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
if c == 'p' {
return newToken(tokenKindCharPropLeader, nullChar), nil
}
+ if c == 'f' {
+ return newToken(tokenKindFragmentLeader, nullChar), nil
+ }
if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
return newToken(tokenKindChar, c), nil
}
@@ -455,6 +481,46 @@ func (l *lexer) nextInCharProp(c rune) (*token, error) {
}
}
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+ switch c {
+ case '{':
+ return newToken(tokenKindLBrace, nullChar), nil
+ case '}':
+ return newToken(tokenKindRBrace, nullChar), nil
+ default:
+ var b strings.Builder
+ fmt.Fprint(&b, string(c))
+ n := 1
+ for {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if c == '}' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ fmt.Fprint(&b, string(c))
+ n++
+ }
+ sym := strings.TrimSpace(b.String())
+ if len(sym) == 0 {
+ raiseSyntaxError(SynErrFragmentInvalidSymbol)
+ }
+ return newFragmentSymbolToken(sym), nil
+ }
+}
+
func (l *lexer) read() (rune, bool, error) {
if l.reachedEOF {
return l.lastChar, l.reachedEOF, nil