aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/lexer.go128
-rw-r--r--compiler/lexer_test.go180
-rw-r--r--compiler/parser.go55
-rw-r--r--compiler/parser_test.go127
-rw-r--r--compiler/syntax_error.go3
-rw-r--r--driver/lexer_test.go37
6 files changed, 512 insertions, 18 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index c1aa67e..5987e44 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -4,6 +4,7 @@ import (
"bufio"
"fmt"
"io"
+ "strings"
)
type tokenKind string
@@ -21,12 +22,17 @@ const (
tokenKindInverseBExpOpen = tokenKind("[^")
tokenKindBExpClose = tokenKind("]")
tokenKindCharRange = tokenKind("-")
+ tokenKindCodePointLeader = tokenKind("\\u")
+ tokenKindLBrace = tokenKind("{")
+ tokenKindRBrace = tokenKind("}")
+ tokenKindCodePoint = tokenKind("code point")
tokenKindEOF = tokenKind("eof")
)
type token struct {
- kind tokenKind
- char rune
+ kind tokenKind
+ char rune
+ codePoint string
}
const nullChar = '\u0000'
@@ -38,13 +44,45 @@ func newToken(kind tokenKind, char rune) *token {
}
}
+// newCodePointToken builds a token of kind tokenKindCodePoint. The hex digits
+// of the code point expression are stored verbatim in the codePoint field;
+// the char field is left at its zero value.
+func newCodePointToken(codePoint string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCodePoint
+	tok.codePoint = codePoint
+	return tok
+}
+
type lexerMode string
const (
lexerModeDefault = lexerMode("default")
lexerModeBExp = lexerMode("bracket expression")
+ lexerModeCPExp = lexerMode("code point expression")
)
+// lexerModeStack records the lexer modes that are currently nested inside one
+// another. The last element of stack is the mode in effect.
+type lexerModeStack struct {
+	stack []lexerMode
+}
+
+// newLexerModeStack returns a stack whose only entry is lexerModeDefault.
+func newLexerModeStack() *lexerModeStack {
+	modes := []lexerMode{lexerModeDefault}
+	return &lexerModeStack{stack: modes}
+}
+
+// top returns the mode in effect without removing it. It panics with an
+// index-out-of-range error when the stack is empty.
+func (s *lexerModeStack) top() lexerMode {
+	last := len(s.stack) - 1
+	return s.stack[last]
+}
+
+// push makes m the mode in effect.
+func (s *lexerModeStack) push(m lexerMode) {
+	s.stack = append(s.stack, m)
+}
+
+// pop drops the mode in effect so that the previously pushed one becomes
+// current again. It panics when the stack is empty.
+func (s *lexerModeStack) pop() {
+	last := len(s.stack) - 1
+	s.stack = s.stack[:last]
+}
+
type rangeState string
// [a-z]
@@ -71,7 +109,7 @@ type lexer struct {
prevEOF1 bool
prevChar2 rune
pervEOF2 bool
- mode lexerMode
+ modeStack *lexerModeStack
rangeState rangeState
errMsgDetails string
}
@@ -89,7 +127,7 @@ func newLexer(src io.Reader) *lexer {
prevEOF1: false,
prevChar2: nullChar,
pervEOF2: false,
- mode: lexerModeDefault,
+ modeStack: newLexerModeStack(),
rangeState: rangeStateReady,
}
}
@@ -103,7 +141,7 @@ func (l *lexer) next() (*token, error) {
return newToken(tokenKindEOF, nullChar), nil
}
- switch l.mode {
+ switch l.modeStack.top() {
case lexerModeBExp:
tok, err := l.nextInBExp(c)
if err != nil {
@@ -111,7 +149,7 @@ func (l *lexer) next() (*token, error) {
}
switch tok.kind {
case tokenKindBExpClose:
- l.mode = lexerModeDefault
+ l.modeStack.pop()
case tokenKindCharRange:
l.rangeState = rangeStateExpectRangeTerminator
case tokenKindChar:
@@ -121,6 +159,18 @@ func (l *lexer) next() (*token, error) {
case rangeStateExpectRangeTerminator:
l.rangeState = rangeStateReady
}
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ }
+ return tok, nil
+ case lexerModeCPExp:
+ tok, err := l.nextInCodePoint(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
}
return tok, nil
default:
@@ -130,11 +180,13 @@ func (l *lexer) next() (*token, error) {
}
switch tok.kind {
case tokenKindBExpOpen:
- l.mode = lexerModeBExp
+ l.modeStack.push(lexerModeBExp)
l.rangeState = rangeStateReady
case tokenKindInverseBExpOpen:
- l.mode = lexerModeBExp
+ l.modeStack.push(lexerModeBExp)
l.rangeState = rangeStateReady
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
}
return tok, nil
}
@@ -210,6 +262,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
if eof {
return nil, synErrIncompletedEscSeq
}
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
return newToken(tokenKindChar, c), nil
}
@@ -259,6 +314,9 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
if eof {
return nil, synErrIncompletedEscSeq
}
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
if c == '\\' || c == '^' || c == '-' || c == ']' {
return newToken(tokenKindChar, c), nil
}
@@ -269,6 +327,60 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
}
}
+// nextInCodePoint produces the next token while the lexer is in code point
+// expression mode, that is, after a `\u` leader. c is the character that
+// next() has already read.
+//
+// '{' and '}' yield tokenKindLBrace and tokenKindRBrace. Any other character
+// must begin a run of hex digits, which is collected into a single
+// tokenKindCodePoint token. The run stops at '}' or at EOF; in both cases
+// restore() is called so that the terminator is seen again by the following
+// call. A run that contains a non-hex character, or whose length is not
+// exactly 4 or 6, is rejected with synErrInvalidCodePoint.
+func (l *lexer) nextInCodePoint(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+	if !isHexDigit(c) {
+		return nil, synErrInvalidCodePoint
+	}
+
+	var b strings.Builder
+	b.WriteRune(c)
+	// n counts the digits collected so far; a seventh digit is rejected
+	// immediately instead of being buffered.
+	for n := 1; ; n++ {
+		c, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || c == '}' {
+			// The error returned by restore() must be checked on the EOF path
+			// as well; it used to be dropped there.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		if !isHexDigit(c) || n >= 6 {
+			return nil, synErrInvalidCodePoint
+		}
+		b.WriteRune(c)
+	}
+
+	cp := b.String()
+	if len(cp) != 4 && len(cp) != 6 {
+		return nil, synErrInvalidCodePoint
+	}
+	return newCodePointToken(cp), nil
+}
+
+// isHexDigit reports whether c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
+//
+// The letter ranges must stop at F/f. Accepting the whole alphabet would let
+// strings such as "ghij" through as code points, and they cannot be decoded
+// as hex afterwards.
+func isHexDigit(c rune) bool {
+	switch {
+	case c >= '0' && c <= '9',
+		c >= 'A' && c <= 'F',
+		c >= 'a' && c <= 'f':
+		return true
+	}
+	return false
+}
+
func (l *lexer) read() (rune, bool, error) {
if l.reachedEOF {
return l.lastChar, l.reachedEOF, nil
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index c77d7c7..87e3a81 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -30,7 +30,7 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the special characters in default mode",
- src: ".*+?|()[",
+ src: ".*+?|()[\\u",
tokens: []*token{
newToken(tokenKindAnyChar, nullChar),
newToken(tokenKindRepeat, nullChar),
@@ -40,6 +40,7 @@ func TestLexer(t *testing.T) {
newToken(tokenKindGroupOpen, nullChar),
newToken(tokenKindGroupClose, nullChar),
newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
@@ -60,26 +61,36 @@ func TestLexer(t *testing.T) {
},
},
{
- caption: "] is treated as an ordinary character in default mode",
- src: "]",
+ caption: "], {, and } are treated as an ordinary character in default mode",
+ src: "]{}",
tokens: []*token{
newToken(tokenKindChar, ']'),
+ newToken(tokenKindChar, '{'),
+ newToken(tokenKindChar, '}'),
newToken(tokenKindEOF, nullChar),
},
},
{
caption: "lexer can recognize the special characters in bracket expression mode",
- src: "[a-z][^a-z]",
+ src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
tokens: []*token{
newToken(tokenKindBExpOpen, nullChar),
newToken(tokenKindChar, 'a'),
newToken(tokenKindCharRange, nullChar),
newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09AF"),
+ newToken(tokenKindRBrace, nullChar),
newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindInverseBExpOpen, nullChar),
newToken(tokenKindChar, 'a'),
newToken(tokenKindCharRange, nullChar),
newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09abcf"),
+ newToken(tokenKindRBrace, nullChar),
newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
},
@@ -233,6 +244,163 @@ func TestLexer(t *testing.T) {
},
err: synErrIncompletedEscSeq,
},
+ {
+ caption: "lexer can recognize the special characters and code points in code point expression mode",
+ src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "a one digit hex string isn't a valid code point",
+ src: "\\u{0",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a two digits hex string isn't a valid code point",
+ src: "\\u{01",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a three digits hex string isn't a valid code point",
+ src: "\\u{012",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a four digits hex string is a valid code point",
+ src: "\\u{0123}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a five digits hex string isn't a valid code point",
+ src: "\\u{01234",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a six digits hex string is a valid code point",
+ src: "\\u{012345}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("012345"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a seven digits hex string isn't a valid code point",
+ src: "\\u{0123456",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits",
+ src: "\\u{g",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits",
+ src: "\\u{G",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
}
for _, tt := range tests {
t.Run(tt.caption, func(t *testing.T) {
@@ -268,7 +436,7 @@ func TestLexer(t *testing.T) {
func testToken(t *testing.T, a, e *token) {
t.Helper()
- if e.kind != a.kind || e.char != a.char {
- t.Fatalf("unexpected token; want: %v, got: %v", e, a)
+ if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
+ t.Fatalf("unexpected token; want: %+v, got: %+v", e, a)
}
}
diff --git a/compiler/parser.go b/compiler/parser.go
index 63e5549..ba15cd0 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -2,6 +2,8 @@ package compiler
import (
"bytes"
+ "encoding/binary"
+ "encoding/hex"
"fmt"
"io"
"strings"
@@ -307,6 +309,9 @@ func (p *parser) parseSingleChar() astNode {
p.expect(tokenKindBExpClose)
return inverse
}
+ if p.consume(tokenKindCodePointLeader) {
+ return p.parseCodePoint()
+ }
c := p.parseNormalChar()
if c == nil {
if p.consume(tokenKindBExpClose) {
@@ -318,6 +323,9 @@ func (p *parser) parseSingleChar() astNode {
}
func (p *parser) parseBExpElem() astNode {
+ if p.consume(tokenKindCodePointLeader) {
+ return p.parseCodePoint()
+ }
left := p.parseNormalChar()
if left == nil {
return nil
@@ -338,6 +346,53 @@ func (p *parser) parseBExpElem() astNode {
return genRangeAST(left, right)
}
+// parseCodePoint parses the rest of a code point expression (`{XXXX}` or
+// `{XXXXXX}`) after the `\u` leader has been consumed, and returns an AST that
+// matches the UTF-8 encoding of that code point: a single symbol node for a
+// one-byte encoding, otherwise the bytes joined with genConcatNode in order.
+//
+// It raises synErrCPExpInvalidForm when the braces or the digits are missing
+// and synErrCPExpOutOfRange when the value exceeds U+10FFFF.
+func (p *parser) parseCodePoint() astNode {
+	if !p.consume(tokenKindLBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+	if !p.consume(tokenKindCodePoint) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+
+	var cp []byte
+	{
+		// Although hex.DecodeString method can handle only a hex string that has even length,
+		// `codePoint` always has even length by the lexical specification.
+		b, err := hex.DecodeString(p.lastTok.codePoint)
+		if err != nil {
+			panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err))
+		}
+		// The lexer emits at most 6 hex digits (3 bytes). Anything longer is an
+		// internal invariant violation, not a user error.
+		if len(b) > 4 {
+			panic(fmt.Errorf("a code point (%v) doesn't fit in a 32-bit integer", p.lastTok.codePoint))
+		}
+		// Right-align the decoded bytes in a 4-byte buffer so that they can be
+		// read as a big-endian 32-bit integer.
+		var buf [4]byte
+		copy(buf[4-len(b):], b)
+		n := binary.BigEndian.Uint32(buf[:])
+		// n is unsigned, so only the upper bound needs to be checked.
+		if n > 0x10FFFF {
+			raiseSyntaxError(synErrCPExpOutOfRange)
+		}
+
+		// NOTE(review): surrogate values (U+D800 to U+DFFF) pass the range check,
+		// but the integer-to-string conversion below turns them into U+FFFD.
+		// Confirm whether they should be rejected instead.
+		cp = []byte(string(rune(n)))
+	}
+
+	var concat astNode
+	{
+		concat = newSymbolNode(cp[0])
+		for _, b := range cp[1:] {
+			concat = genConcatNode(
+				concat,
+				newSymbolNode(b),
+			)
+		}
+	}
+
+	if !p.consume(tokenKindRBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+
+	return concat
+}
+
func (p *parser) parseNormalChar() astNode {
if !p.consume(tokenKindChar) {
return nil
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index dcbe924..c636d8b 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -71,6 +71,19 @@ func TestParser_parse(t *testing.T) {
),
},
{
+ pattern: "\\u{3042}?",
+ ast: genConcatNode(
+ newOptionNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xE3, symPos(1)),
+ newSymbolNodeWithPos(0x81, symPos(2)),
+ newSymbolNodeWithPos(0x82, symPos(3)),
+ ),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(4)),
+ ),
+ },
+ {
pattern: "(a)?",
ast: genConcatNode(
newOptionNode(
@@ -160,6 +173,19 @@ func TestParser_parse(t *testing.T) {
),
},
{
+ pattern: "\\u{3042}*",
+ ast: genConcatNode(
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xE3, symPos(1)),
+ newSymbolNodeWithPos(0x81, symPos(2)),
+ newSymbolNodeWithPos(0x82, symPos(3)),
+ ),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(4)),
+ ),
+ },
+ {
pattern: "((a*)*)*",
ast: genConcatNode(
newRepeatNode(
@@ -246,6 +272,24 @@ func TestParser_parse(t *testing.T) {
),
},
{
+ pattern: "\\u{3042}+",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xE3, symPos(1)),
+ newSymbolNodeWithPos(0x81, symPos(2)),
+ newSymbolNodeWithPos(0x82, symPos(3)),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xE3, symPos(4)),
+ newSymbolNodeWithPos(0x81, symPos(5)),
+ newSymbolNodeWithPos(0x82, symPos(6)),
+ ),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(7)),
+ ),
+ },
+ {
pattern: "((a+)+)+",
ast: genConcatNode(
genConcatNode(
@@ -715,6 +759,85 @@ func TestParser_parse(t *testing.T) {
),
},
{
+ pattern: "\\u{006E}",
+ ast: genConcatNode(
+ newSymbolNodeWithPos(0x6E, symPos(1)),
+ newEndMarkerNodeWithPos(1, endPos(2)),
+ ),
+ },
+ {
+ pattern: "\\u{03BD}",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xCE, symPos(1)),
+ newSymbolNodeWithPos(0xBD, symPos(2)),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(3)),
+ ),
+ },
+ {
+ pattern: "\\u{306B}",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xE3, symPos(1)),
+ newSymbolNodeWithPos(0x81, symPos(2)),
+ newSymbolNodeWithPos(0xAB, symPos(3)),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(4)),
+ ),
+ },
+ {
+ pattern: "\\u{01F638}",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xF0, symPos(1)),
+ newSymbolNodeWithPos(0x9F, symPos(2)),
+ newSymbolNodeWithPos(0x98, symPos(3)),
+ newSymbolNodeWithPos(0xB8, symPos(4)),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(5)),
+ ),
+ },
+ {
+ pattern: "\\u{0000}",
+ ast: genConcatNode(
+ newSymbolNodeWithPos(0x00, symPos(1)),
+ newEndMarkerNodeWithPos(1, endPos(2)),
+ ),
+ },
+ {
+ pattern: "\\u{10FFFF}",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNodeWithPos(0xF4, symPos(1)),
+ newSymbolNodeWithPos(0x8F, symPos(2)),
+ newSymbolNodeWithPos(0xBF, symPos(3)),
+ newSymbolNodeWithPos(0xBF, symPos(4)),
+ ),
+ newEndMarkerNodeWithPos(1, endPos(5)),
+ ),
+ },
+ {
+ pattern: "\\u{110000}",
+ syntaxError: synErrCPExpOutOfRange,
+ },
+ {
+ pattern: "\\u",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{03BD",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{}",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
pattern: "(a)",
ast: newConcatNode(
newSymbolNodeWithPos(byte('a'), symPos(1)),
@@ -997,8 +1120,8 @@ func testAST(t *testing.T, expected, actual astNode) {
switch e := expected.(type) {
case *symbolNode:
a := actual.(*symbolNode)
- if a.pos != e.pos {
- t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos)
+ if a.pos != e.pos || a.from != e.from || a.to != e.to {
+ t.Fatalf("unexpected node; want: %+v, got: %+v", e, a)
}
case *endMarkerNode:
a := actual.(*endMarkerNode)
diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go
index be2dc38..df8977d 100644
--- a/compiler/syntax_error.go
+++ b/compiler/syntax_error.go
@@ -20,6 +20,7 @@ var (
// lexical errors
synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\")
synErrInvalidEscSeq = newSyntaxError("invalid escape sequence")
+ synErrInvalidCodePoint = newSyntaxError("code points must consist of just 4 or 6 hex digits")
// syntax errors
synErrUnexpectedToken = newSyntaxError("unexpected token")
@@ -34,4 +35,6 @@ var (
synErrBExpUnclosed = newSyntaxError("unclosed bracket expression")
synErrBExpInvalidForm = newSyntaxError("invalid bracket expression")
synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order")
+ synErrCPExpInvalidForm = newSyntaxError("invalid code point expression")
+ synErrCPExpOutOfRange = newSyntaxError("a code point must be between U+0000 to U+10FFFF")
)
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 68830a5..26b5d49 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -143,12 +143,12 @@ func TestLexer_Next(t *testing.T) {
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- // all 1 byte characters
+ // all 1 byte characters except null character (U+0000)
//
// NOTE:
// maleeni cannot handle the null character in patterns because compiler.lexer,
// specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist.
- // There is room for improvement in this behavior of the lexer.
+ // If a pattern needs a null character, use code point expression \u{0000}.
newLexEntry("1ByteChar", "[\x01-\x7f]"),
},
},
@@ -416,6 +416,39 @@ func TestLexer_Next(t *testing.T) {
newEOFToken(),
},
},
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ newLexEntry("1ByteChar", "\\u{006E}"),
+ newLexEntry("2ByteChar", "\\u{03BD}"),
+ newLexEntry("3ByteChar", "\\u{306B}"),
+ newLexEntry("4ByteChar", "\\u{01F638}"),
+ },
+ },
+ src: "nνに😸",
+ tokens: []*Token{
+ newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+ newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+ newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+ },
+ },
+ src: "nνに😸",
+ tokens: []*Token{
+ newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+ newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+ newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFToken(),
+ },
+ },
}
for i, tt := range test {
t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {