diff options
-rw-r--r-- | compiler/lexer.go | 5 | ||||
-rw-r--r-- | compiler/lexer_test.go | 11 | ||||
-rw-r--r-- | compiler/parser.go | 709 | ||||
-rw-r--r-- | driver/lexer_test.go | 261 |
4 files changed, 977 insertions, 9 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go index 91cd209..acc9658 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -19,6 +19,7 @@ const ( tokenKindGroupClose = tokenKind(")") tokenKindBExpOpen = tokenKind("[") tokenKindBExpClose = tokenKind("]") + tokenKindCharRange = tokenKind("-") tokenKindEOF = tokenKind("eof") ) @@ -124,6 +125,8 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { func (l *lexer) nextInBExp(c rune) (*token, error) { switch c { + case '-': + return newToken(tokenKindCharRange, nullChar), nil case ']': l.mode = lexerModeDefault return newToken(tokenKindBExpClose, nullChar), nil @@ -138,7 +141,7 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { } } switch { - case c == '\\' || c == ']': + case c == '\\' || c == '-' || c == ']': return newToken(tokenKindChar, c), nil default: return nil, &SyntaxError{ diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index e2e7d7a..be00317 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -31,7 +31,7 @@ func TestLexer(t *testing.T) { }, { caption: "lexer can recognize the special characters", - src: ".*+?|()[]", + src: ".*+?|()[-]", tokens: []*token{ newToken(tokenKindAnyChar, nullChar), newToken(tokenKindRepeat, nullChar), @@ -41,13 +41,14 @@ func TestLexer(t *testing.T) { newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCharRange, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the escape sequences", - src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\]", + src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\-]", tokens: []*token{ newToken(tokenKindChar, '\\'), newToken(tokenKindChar, '.'), @@ -59,12 +60,15 @@ func TestLexer(t *testing.T) { newToken(tokenKindChar, ')'), newToken(tokenKindChar, '['), newToken(tokenKindChar, ']'), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "in a bracket expression, the special characters are also handled as normal characters", - src: "[\\\\.*+?|()[\\]].*|()][", + src: "[\\\\.*+?|()[\\]].*|()-][", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '\\'), @@ -83,6 +87,7 @@ func TestLexer(t *testing.T) { newToken(tokenKindAlt, nullChar), newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindEOF, nullChar), diff --git a/compiler/parser.go b/compiler/parser.go index 04cd8ad..bae4e17 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -15,9 +15,9 @@ func (err *SyntaxError) Error() string { return fmt.Sprintf("Syntax Error: %v", err.message) } -func raiseSyntaxError(message string) { +func raiseSyntaxError(format string, a ...interface{}) { panic(&SyntaxError{ - message: message, + message: fmt.Sprintf(format, a...), }) } @@ -170,12 +170,12 @@ func (p *parser) parseSingleChar() astNode { } if p.consume(tokenKindBExpOpen) { defer p.expect(tokenKindBExpClose) - left := p.parseNormalChar() + left := p.parseBExpElem() if left == nil { raiseSyntaxError("bracket expression must include at least one character") } for { - right := p.parseNormalChar() + right := p.parseBExpElem() if right == nil { break } @@ -185,6 +185,19 @@ func (p *parser) parseSingleChar() astNode { } return p.parseNormalChar() } + +func (p *parser) parseBExpElem() astNode { + left := p.parseNormalChar() + if !p.consume(tokenKindCharRange) { + return left + } + right := p.parseNormalChar() + if right == nil { + raiseSyntaxError("invalid range expression") + } + return genRangeAST(left, right) +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil @@ -316,6 +329,694 @@ func genAnyCharAST(tok *token) astNode { ) } +func genRangeAST(fromNode, toNode astNode) astNode { + from := genByteSeq(fromNode) + to := genByteSeq(toNode) + if !isValidOrder(from, to) { + raiseSyntaxError("range expression with invalid order: [%s-%s] ([%v-%v])", string(from), string(to), from, to) + } + switch len(from) { + case 1: + switch len(to) { + case 1: + return gen1ByteCharRangeAST(from, to) + case 2: + return newAltNode( + gen1ByteCharRangeAST(from, []byte{0x7f}), + gen2ByteCharRangeAST([]byte{0xc2, 0x80}, to), + ) + case 3: + return newAltNode( + newAltNode( + gen1ByteCharRangeAST(from, []byte{0x7f}), + gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}), + ), + gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, to), + ) + case 4: + return newAltNode( + newAltNode( + newAltNode( + gen1ByteCharRangeAST(from, []byte{0x7f}), + gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}), + ), + gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, []byte{0xef, 0xbf, 0xbf}), + ), + gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), + ) + } + case 2: + switch len(to) { + case 2: + return gen2ByteCharRangeAST(from, to) + case 3: + return newAltNode( + gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}), + gen3ByteCharRangeAST([]byte{0xc2, 0x80}, to), + ) + case 4: + return newAltNode( + newAltNode( + gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}), + gen3ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xef, 0xbf, 0xbf}), + ), + gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), + ) + } + case 3: + switch len(to) { + case 3: + return gen3ByteCharRangeAST(from, to) + case 4: + return newAltNode( + gen3ByteCharRangeAST(from, []byte{0xef, 0xbf, 0xbf}), + gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), + ) + } + case 4: + return gen4ByteCharRangeAST(from, to) + } + panic(fmt.Sprintf("invalid range; from: %v, to: %v", from, to)) +} + +func genByteSeq(node astNode) []byte { + switch n := node.(type) { + case *symbolNode: + return []byte{n.from} + case *concatNode: + seq := genByteSeq(n.left) + seq = append(seq, genByteSeq(n.right)...) + return seq + } + panic(fmt.Sprintf("genByteSeq() cannot handle %T: %v", node, node)) +} + +func isValidOrder(from, to []byte) bool { + if len(from) > len(to) { + return false + } + if len(from) < len(to) { + return true + } + for i, f := range from { + t := to[i] + if f > t { + return false + } + if f < t { + return true + } + } + return true +} + +func gen1ByteCharRangeAST(from, to []byte) astNode { + return newRangeSymbolNode(nil, from[0], to[0], symbolPositionNil) +} + +func gen2ByteCharRangeAST(from, to []byte) astNode { + from0 := from[0] + from1 := from[1] + to0 := to[0] + to1 := to[1] + switch { + case from0 == to0 && from1 == to1: + return newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ) + case from0 == to0: + return newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1, to1, symbolPositionNil), + ) + default: + alt1 := newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1, 0xbf, symbolPositionNil), + ) + alt2 := newConcatNode( + newRangeSymbolNode(nil, from0+1, to0, symbolPositionNil), + newRangeSymbolNode(nil, 0x80, to1, symbolPositionNil), + ) + return newAltNode(alt1, alt2) + } +} + +type byteBoundsEntry struct { + min byte + max byte +} + +var ( + bounds3 = [][]byteBoundsEntry{ + nil, + {{min: 0xe0, max: 0xe0}, {min: 0xa0, max: 0xbf}, {min: 0x80, max: 0xbf}}, + {{min: 0xe1, max: 0xec}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, + {{min: 0xed, max: 0xed}, {min: 0x80, max: 0x9f}, {min: 0x80, max: 0xbf}}, + {{min: 0xee, max: 0xef}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, + } + + bounds4 = [][]byteBoundsEntry{ + nil, + {{min: 0xf0, max: 0xf0}, {min: 0x90, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, + {{min: 0xf1, max: 0xf3}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, + {{min: 0xf4, max: 0xf4}, {min: 0x80, max: 0x8f}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, + } +) + +func get3ByteCharRangeNum(seq []byte) int { + head := seq[0] + switch { + case head == 0xe0: + return 1 + case head >= 0xe1 && head <= 0xec: + return 2 + case head == 0xed: + return 3 + case head >= 0xee && head <= 0xef: + return 4 + } + return 0 +} + +func get4ByteCharRangeNum(seq []byte) int { + head := seq[0] + switch { + case head == 0xf0: + return 1 + case head >= 0xf1 && head <= 0xf3: + return 2 + case head == 0xf4: + return 3 + } + return 0 +} + +func gen3ByteCharRangeAST(from, to []byte) astNode { + from0 := from[0] + from1 := from[1] + from2 := from[2] + to0 := to[0] + to1 := to[1] + to2 := to[2] + switch { + case from0 == to0 && from1 == to1 && from2 == to2: + return newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ) + case from0 == to0 && from1 == to1: + return newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2, to2, symbolPositionNil), + ) + case from0 == to0: + rangeNum := get3ByteCharRangeNum(from) + bounds := bounds3[rangeNum] + var alt astNode + alt = newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2, bounds[2].max, symbolPositionNil), + ) + if from1+1 < to1 { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1+1, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil), + ), + ) + return alt + default: + fromRangeNum := get3ByteCharRangeNum(from) + toRangeNum := get3ByteCharRangeNum(to) + bounds := bounds3[fromRangeNum] + var alt astNode + alt = newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2, bounds[2].max, symbolPositionNil), + ) + if from1 < bounds[1].max { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1+1, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + if fromRangeNum == toRangeNum { + if from0+1 < to0 { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, from0+1, to0-1, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + if to1 > bounds[1].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil), + ), + ) + return alt + } + for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ { + bounds := bounds3[rangeNum] + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, bounds[0].min, bounds[0].max, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + bounds = bounds3[toRangeNum] + if to0 > bounds[0].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, bounds[0].min, to0-1, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + if to1 > bounds[1].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil), + ), + ) + return alt + } +} + +func gen4ByteCharRangeAST(from, to []byte) astNode { + from0 := from[0] + from1 := from[1] + from2 := from[2] + from3 := from[3] + to0 := to[0] + to1 := to[1] + to2 := to[2] + to3 := to[3] + switch { + case from0 == to0 && from1 == to1 && from2 == to2 && from3 == to3: + return newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ), + newSymbolNode(nil, from3, symbolPositionNil), + ) + case from0 == to0 && from1 == to1 && from2 == to2: + return newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ), + newRangeSymbolNode(nil, from3, to3, symbolPositionNil), + ) + case from0 == to0 && from1 == to1: + rangeNum := get4ByteCharRangeNum(from) + bounds := bounds4[rangeNum] + var alt astNode + alt = newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ), + newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil), + ) + if from2+1 < to2 { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2+1, to2-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, to2, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil), + ), + ) + return alt + case from0 == to0: + rangeNum := get4ByteCharRangeNum(from) + bounds := bounds4[rangeNum] + var alt astNode + alt = newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ), + newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil), + ) + if from2 < bounds[2].max { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2+1, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if from1+1 < to1 { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1+1, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if to2 > bounds[2].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newSymbolNode(nil, to2, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil), + ), + ) + return alt + default: + fromRangeNum := get4ByteCharRangeNum(from) + toRangeNum := get4ByteCharRangeNum(to) + bounds := bounds4[fromRangeNum] + var alt astNode + alt = newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newSymbolNode(nil, from2, symbolPositionNil), + ), + newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil), + ) + if from2 < bounds[2].max { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newSymbolNode(nil, from1, symbolPositionNil), + ), + newRangeSymbolNode(nil, from2+1, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if from1 < bounds[1].max { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, from0, symbolPositionNil), + newRangeSymbolNode(nil, from1+1, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if fromRangeNum == toRangeNum { + if from0+1 < to0 { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, from0+1, to0-1, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if to1 > bounds[1].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if to2 > bounds[2].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newSymbolNode(nil, to2, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil), + ), + ) + return alt + } + for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ { + bounds := bounds4[rangeNum] + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, bounds[0].min, bounds[0].max, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + bounds = bounds4[toRangeNum] + if to0 > bounds[0].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newRangeSymbolNode(nil, bounds[0].min, to0-1, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if to1 > bounds[1].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + if to2 > bounds[2].min { + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil), + ), + ) + } + alt = newAltNode( + alt, + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(nil, to0, symbolPositionNil), + newSymbolNode(nil, to1, symbolPositionNil), + ), + newSymbolNode(nil, to2, symbolPositionNil), + ), + newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil), + ), + ) + return alt + } +} + func (p *parser) expect(expected tokenKind) { if !p.consume(expected) { tok := p.peekedTok diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 283d5fe..bdf5f03 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -132,6 +132,265 @@ func TestLexer_Next(t *testing.T) { newEOFToken(), }, }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // all 1 byte characters + spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"), + }, + }, + src: string([]byte{ + 0x00, + 0x01, + 0x7e, + 0x7f, + }), + tokens: []*Token{ + newToken(1, "1ByteChar", []byte{0x00}), + newToken(1, "1ByteChar", []byte{0x01}), + newToken(1, "1ByteChar", []byte{0x7e}), + newToken(1, "1ByteChar", []byte{0x7f}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // all 2 byte characters + spec.NewLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"), + }, + }, + src: string([]byte{ + 0xc2, 0x80, + 0xc2, 0x81, + 0xdf, 0xbe, + 0xdf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "2ByteChar", []byte{0xc2, 0x80}), + newToken(1, "2ByteChar", []byte{0xc2, 0x81}), + newToken(1, "2ByteChar", []byte{0xdf, 0xbe}), + newToken(1, "2ByteChar", []byte{0xdf, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // All bytes are the same. + spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + }), + tokens: []*Token{ + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // The first two bytes are the same. + spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xa0, 0xbe, + 0xe0, 0xa0, 0xbf, + }), + tokens: []*Token{ + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbe}), + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // The first byte are the same. + spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xbf, 0xbe, + 0xe0, 0xbf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}), + newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // all 3 byte characters + spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xe0, 0xa0, 0x80, + 0xe0, 0xa0, 0x81, + 0xe0, 0xbf, 0xbe, + 0xe0, 0xbf, 0xbf, + 0xe1, 0x80, 0x80, + 0xe1, 0x80, 0x81, + 0xec, 0xbf, 0xbe, + 0xec, 0xbf, 0xbf, + 0xed, 0x80, 0x80, + 0xed, 0x80, 0x81, + 0xed, 0x9f, 0xbe, + 0xed, 0x9f, 0xbf, + 0xee, 0x80, 0x80, + 0xee, 0x80, 0x81, + 0xef, 0xbf, 0xbe, + 0xef, 0xbf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}), + newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}), + newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x80}), + newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x81}), + newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbe}), + newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbf}), + newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x80}), + newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x81}), + newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbe}), + newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbf}), + newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x80}), + newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x81}), + newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbe}), + newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // All bytes are the same. + spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + }), + tokens: []*Token{ + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // The first 3 bytes are the same. + spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0x90, 0x80, 0xbe, + 0xf0, 0x90, 0x80, 0xbf, + }), + tokens: []*Token{ + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // The first 2 bytes are the same. + spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0x90, 0xbf, 0xbe, + 0xf0, 0x90, 0xbf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // The first byte are the same. + spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0xbf, 0xbf, 0xbe, + 0xf0, 0xbf, 0xbf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // all 4 byte characters + spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), + }, + }, + src: string([]byte{ + 0xf0, 0x90, 0x80, 0x80, + 0xf0, 0x90, 0x80, 0x81, + 0xf0, 0xbf, 0xbf, 0xbe, + 0xf0, 0xbf, 0xbf, 0xbf, + 0xf1, 0x80, 0x80, 0x80, + 0xf1, 0x80, 0x80, 0x81, + 0xf3, 0xbf, 0xbf, 0xbe, + 0xf3, 0xbf, 0xbf, 0xbf, + 0xf4, 0x80, 0x80, 0x80, + 0xf4, 0x80, 0x80, 0x81, + 0xf4, 0x8f, 0xbf, 0xbe, + 0xf4, 0x8f, 0xbf, 0xbf, + }), + tokens: []*Token{ + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}), + newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbf}), + newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x80}), + newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x81}), + newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbe}), + newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbf}), + newEOFToken(), + }, + }, } for _, tt := range test { clspec, err := compiler.Compile(tt.lspec) @@ -149,7 +408,7 @@ func TestLexer_Next(t *testing.T) { break } testToken(t, eTok, tok) - t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid) + // t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid) if tok.EOF { break } |