diff options
-rw-r--r-- | compiler/lexer.go | 128 | ||||
-rw-r--r-- | compiler/lexer_test.go | 180 | ||||
-rw-r--r-- | compiler/parser.go | 55 | ||||
-rw-r--r-- | compiler/parser_test.go | 127 | ||||
-rw-r--r-- | compiler/syntax_error.go | 3 | ||||
-rw-r--r-- | driver/lexer_test.go | 37 |
6 files changed, 512 insertions, 18 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go index c1aa67e..5987e44 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -4,6 +4,7 @@ import ( "bufio" "fmt" "io" + "strings" ) type tokenKind string @@ -21,12 +22,17 @@ const ( tokenKindInverseBExpOpen = tokenKind("[^") tokenKindBExpClose = tokenKind("]") tokenKindCharRange = tokenKind("-") + tokenKindCodePointLeader = tokenKind("\\u") + tokenKindLBrace = tokenKind("{") + tokenKindRBrace = tokenKind("}") + tokenKindCodePoint = tokenKind("code point") tokenKindEOF = tokenKind("eof") ) type token struct { - kind tokenKind - char rune + kind tokenKind + char rune + codePoint string } const nullChar = '\u0000' @@ -38,13 +44,45 @@ func newToken(kind tokenKind, char rune) *token { } } +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + type lexerMode string const ( lexerModeDefault = lexerMode("default") lexerModeBExp = lexerMode("bracket expression") + lexerModeCPExp = lexerMode("code point expression") ) +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + type rangeState string // [a-z] @@ -71,7 +109,7 @@ type lexer struct { prevEOF1 bool prevChar2 rune pervEOF2 bool - mode lexerMode + modeStack *lexerModeStack rangeState rangeState errMsgDetails string } @@ -89,7 +127,7 @@ func newLexer(src io.Reader) *lexer { prevEOF1: false, prevChar2: nullChar, pervEOF2: false, - mode: lexerModeDefault, + modeStack: newLexerModeStack(), rangeState: rangeStateReady, } } @@ -103,7 +141,7 @@ func (l *lexer) next() (*token, error) { return newToken(tokenKindEOF, nullChar), nil } - switch l.mode { + switch l.modeStack.top() { case lexerModeBExp: tok, err := l.nextInBExp(c) if err != nil { @@ -111,7 +149,7 @@ func (l *lexer) next() (*token, error) { } switch tok.kind { case tokenKindBExpClose: - l.mode = lexerModeDefault + l.modeStack.pop() case tokenKindCharRange: l.rangeState = rangeStateExpectRangeTerminator case tokenKindChar: @@ -121,6 +159,18 @@ func (l *lexer) next() (*token, error) { case rangeStateExpectRangeTerminator: l.rangeState = rangeStateReady } + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() } return tok, nil default: @@ -130,11 +180,13 @@ func (l *lexer) next() (*token, error) { } switch tok.kind { case tokenKindBExpOpen: - l.mode = lexerModeBExp + l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady case tokenKindInverseBExpOpen: - l.mode = lexerModeBExp + l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) } return tok, nil } @@ -210,6 +262,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { if eof { return nil, synErrIncompletedEscSeq } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -259,6 +314,9 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { if eof { return nil, synErrIncompletedEscSeq } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } if c == '\\' || c == '^' || c == '-' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -269,6 +327,60 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { } } +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + return nil, synErrInvalidCodePoint + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + return nil, synErrInvalidCodePoint + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + return nil, synErrInvalidCodePoint + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + func (l *lexer) read() (rune, bool, error) { if l.reachedEOF { return l.lastChar, l.reachedEOF, nil diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index c77d7c7..87e3a81 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -30,7 +30,7 @@ func TestLexer(t *testing.T) { }, { caption: "lexer can recognize the special characters in default mode", - src: ".*+?|()[", + src: ".*+?|()[\\u", tokens: []*token{ newToken(tokenKindAnyChar, nullChar), newToken(tokenKindRepeat, nullChar), @@ -40,6 +40,7 @@ func TestLexer(t *testing.T) { newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindEOF, nullChar), }, }, @@ -60,26 +61,36 @@ func TestLexer(t *testing.T) { }, }, { - caption: "] is treated as an ordinary character in default mode", - src: "]", + caption: "], {, and } are treated as an ordinary character in default mode", + src: "]{}", tokens: []*token{ newToken(tokenKindChar, ']'), + newToken(tokenKindChar, '{'), + newToken(tokenKindChar, '}'), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the special characters in bracket expression mode", - src: "[a-z][^a-z]", + src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09AF"), + newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09abcf"), + newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, @@ -233,6 +244,163 @@ func TestLexer(t *testing.T) { }, err: synErrIncompletedEscSeq, }, + { + caption: "lexer can recognize the special characters and code points in code point expression mode", + src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a one digit hex string isn't a valid code point", + src: "\\u{0", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a two digits hex string isn't a valid code point", + src: "\\u{01", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a three digits hex string isn't a valid code point", + src: "\\u{012", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a four digits hex string is a valid code point", + src: "\\u{0123}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a five digits hex string isn't a valid code point", + src: "\\u{01234", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a six digits hex string is a valid code point", + src: "\\u{012345}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("012345"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a seven digits hex string isn't a valid code point", + src: "\\u{0123456", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{g", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{G", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, } for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { @@ -268,7 +436,7 @@ func TestLexer(t *testing.T) { func testToken(t *testing.T, a, e *token) { t.Helper() - if e.kind != a.kind || e.char != a.char { - t.Fatalf("unexpected token; want: %v, got: %v", e, a) + if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { + t.Fatalf("unexpected token; want: %+v, got: %+v", e, a) } } diff --git a/compiler/parser.go b/compiler/parser.go index 63e5549..ba15cd0 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -2,6 +2,8 @@ package compiler import ( "bytes" + "encoding/binary" + "encoding/hex" "fmt" "io" "strings" @@ -307,6 +309,9 @@ func (p *parser) parseSingleChar() astNode { p.expect(tokenKindBExpClose) return inverse } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { @@ -318,6 +323,9 @@ func (p *parser) parseSingleChar() astNode { } func (p *parser) parseBExpElem() astNode { + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } left := p.parseNormalChar() if left == nil { return nil @@ -338,6 +346,53 @@ func (p *parser) parseBExpElem() astNode { return genRangeAST(left, right) } +func (p *parser) parseCodePoint() astNode { + if !p.consume(tokenKindLBrace) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + if !p.consume(tokenKindCodePoint) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + + var cp []byte + { + // Although hex.DecodeString method can handle only a hex string that has even length, + // `codePoint` always has even length by the lexical specification. + b, err := hex.DecodeString(p.lastTok.codePoint) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err)) + } + // `b` must be 4 bytes to convert it into a 32-bit integer. + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) + } + n := binary.BigEndian.Uint32(b) + if n < 0x0000 || n > 0x10FFFF { + raiseSyntaxError(synErrCPExpOutOfRange) + } + + cp = []byte(string(rune(n))) + } + + var concat astNode + { + concat = newSymbolNode(cp[0]) + for _, b := range cp[1:] { + concat = genConcatNode( + concat, + newSymbolNode(b), + ) + } + } + + if !p.consume(tokenKindRBrace) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + + return concat +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil diff --git a/compiler/parser_test.go b/compiler/parser_test.go index dcbe924..c636d8b 100644 --- a/compiler/parser_test.go +++ b/compiler/parser_test.go @@ -71,6 +71,19 @@ func TestParser_parse(t *testing.T) { ), }, { + pattern: "\\u{3042}?", + ast: genConcatNode( + newOptionNode( + genConcatNode( + newSymbolNodeWithPos(0xE3, symPos(1)), + newSymbolNodeWithPos(0x81, symPos(2)), + newSymbolNodeWithPos(0x82, symPos(3)), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { pattern: "(a)?", ast: genConcatNode( newOptionNode( @@ -160,6 +173,19 @@ func TestParser_parse(t *testing.T) { ), }, { + pattern: "\\u{3042}*", + ast: genConcatNode( + newRepeatNode( + genConcatNode( + newSymbolNodeWithPos(0xE3, symPos(1)), + newSymbolNodeWithPos(0x81, symPos(2)), + newSymbolNodeWithPos(0x82, symPos(3)), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { pattern: "((a*)*)*", ast: genConcatNode( newRepeatNode( @@ -246,6 +272,24 @@ func TestParser_parse(t *testing.T) { ), }, { + pattern: "\\u{3042}+", + ast: genConcatNode( + genConcatNode( + newSymbolNodeWithPos(0xE3, symPos(1)), + newSymbolNodeWithPos(0x81, symPos(2)), + newSymbolNodeWithPos(0x82, symPos(3)), + ), + newRepeatNode( + genConcatNode( + newSymbolNodeWithPos(0xE3, symPos(4)), + newSymbolNodeWithPos(0x81, symPos(5)), + newSymbolNodeWithPos(0x82, symPos(6)), + ), + ), + newEndMarkerNodeWithPos(1, endPos(7)), + ), + }, + { pattern: "((a+)+)+", ast: genConcatNode( genConcatNode( @@ -715,6 +759,85 @@ func TestParser_parse(t *testing.T) { ), }, { + pattern: "\\u{006E}", + ast: genConcatNode( + newSymbolNodeWithPos(0x6E, symPos(1)), + newEndMarkerNodeWithPos(1, endPos(2)), + ), + }, + { + pattern: "\\u{03BD}", + ast: genConcatNode( + genConcatNode( + newSymbolNodeWithPos(0xCE, symPos(1)), + newSymbolNodeWithPos(0xBD, symPos(2)), + ), + newEndMarkerNodeWithPos(1, endPos(3)), + ), + }, + { + pattern: "\\u{306B}", + ast: genConcatNode( + genConcatNode( + newSymbolNodeWithPos(0xE3, symPos(1)), + newSymbolNodeWithPos(0x81, symPos(2)), + newSymbolNodeWithPos(0xAB, symPos(3)), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { + pattern: "\\u{01F638}", + ast: genConcatNode( + genConcatNode( + newSymbolNodeWithPos(0xF0, symPos(1)), + newSymbolNodeWithPos(0x9F, symPos(2)), + newSymbolNodeWithPos(0x98, symPos(3)), + newSymbolNodeWithPos(0xB8, symPos(4)), + ), + newEndMarkerNodeWithPos(1, endPos(5)), + ), + }, + { + pattern: "\\u{0000}", + ast: genConcatNode( + newSymbolNodeWithPos(0x00, symPos(1)), + newEndMarkerNodeWithPos(1, endPos(2)), + ), + }, + { + pattern: "\\u{10FFFF}", + ast: genConcatNode( + genConcatNode( + newSymbolNodeWithPos(0xF4, symPos(1)), + newSymbolNodeWithPos(0x8F, symPos(2)), + newSymbolNodeWithPos(0xBF, symPos(3)), + newSymbolNodeWithPos(0xBF, symPos(4)), + ), + newEndMarkerNodeWithPos(1, endPos(5)), + ), + }, + { + pattern: "\\u{110000}", + syntaxError: synErrCPExpOutOfRange, + }, + { + pattern: "\\u", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{03BD", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{}", + syntaxError: synErrCPExpInvalidForm, + }, + { pattern: "(a)", ast: newConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -997,8 +1120,8 @@ func testAST(t *testing.T, expected, actual astNode) { switch e := expected.(type) { case *symbolNode: a := actual.(*symbolNode) - if a.pos != e.pos { - t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos) + if a.pos != e.pos || a.from != e.from || a.to != e.to { + t.Fatalf("unexpected node; want: %+v, got: %+v", e, a) } case *endMarkerNode: a := actual.(*endMarkerNode) diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go index be2dc38..df8977d 100644 --- a/compiler/syntax_error.go +++ b/compiler/syntax_error.go @@ -20,6 +20,7 @@ var ( // lexical errors synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\") synErrInvalidEscSeq = newSyntaxError("invalid escape sequence") + synErrInvalidCodePoint = newSyntaxError("code points must consist of just 4 or 6 hex digits") // syntax errors synErrUnexpectedToken = newSyntaxError("unexpected token") @@ -34,4 +35,6 @@ var ( synErrBExpUnclosed = newSyntaxError("unclosed bracket expression") synErrBExpInvalidForm = newSyntaxError("invalid bracket expression") synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order") + synErrCPExpInvalidForm = newSyntaxError("invalid code point expression") + synErrCPExpOutOfRange = newSyntaxError("a code point must be between U+0000 to U+10FFFF") ) diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 68830a5..26b5d49 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -143,12 +143,12 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - // all 1 byte characters + // all 1 byte characters except null character (U+0000) // // NOTE: // maleeni cannot handle the null character in patterns because compiler.lexer, // specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist. - // There is room for improvement in this behavior of the lexer. + // If a pattern needs a null character, use code point expression \u{0000}. newLexEntry("1ByteChar", "[\x01-\x7f]"), }, }, @@ -416,6 +416,39 @@ func TestLexer_Next(t *testing.T) { newEOFToken(), }, }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry("1ByteChar", "\\u{006E}"), + newLexEntry("2ByteChar", "\\u{03BD}"), + newLexEntry("3ByteChar", "\\u{306B}"), + newLexEntry("4ByteChar", "\\u{01F638}"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})), + newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), + newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFToken(), + }, + }, } for i, tt := range test { t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { |