aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/lexer.go5
-rw-r--r--compiler/lexer_test.go11
-rw-r--r--compiler/parser.go709
-rw-r--r--driver/lexer_test.go261
4 files changed, 977 insertions, 9 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 91cd209..acc9658 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -19,6 +19,7 @@ const (
tokenKindGroupClose = tokenKind(")")
tokenKindBExpOpen = tokenKind("[")
tokenKindBExpClose = tokenKind("]")
+ tokenKindCharRange = tokenKind("-")
tokenKindEOF = tokenKind("eof")
)
@@ -124,6 +125,8 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
func (l *lexer) nextInBExp(c rune) (*token, error) {
switch c {
+ case '-':
+ return newToken(tokenKindCharRange, nullChar), nil
case ']':
l.mode = lexerModeDefault
return newToken(tokenKindBExpClose, nullChar), nil
@@ -138,7 +141,7 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
}
}
switch {
- case c == '\\' || c == ']':
+ case c == '\\' || c == '-' || c == ']':
return newToken(tokenKindChar, c), nil
default:
return nil, &SyntaxError{
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index e2e7d7a..be00317 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,7 +31,7 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the special characters",
- src: ".*+?|()[]",
+ src: ".*+?|()[-]",
tokens: []*token{
newToken(tokenKindAnyChar, nullChar),
newToken(tokenKindRepeat, nullChar),
@@ -41,13 +41,14 @@ func TestLexer(t *testing.T) {
newToken(tokenKindGroupOpen, nullChar),
newToken(tokenKindGroupClose, nullChar),
newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCharRange, nullChar),
newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
{
caption: "lexer can recognize the escape sequences",
- src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\]",
+ src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\-]",
tokens: []*token{
newToken(tokenKindChar, '\\'),
newToken(tokenKindChar, '.'),
@@ -59,12 +60,15 @@ func TestLexer(t *testing.T) {
newToken(tokenKindChar, ')'),
newToken(tokenKindChar, '['),
newToken(tokenKindChar, ']'),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
{
caption: "in a bracket expression, the special characters are also handled as normal characters",
- src: "[\\\\.*+?|()[\\]].*|()][",
+ src: "[\\\\.*+?|()[\\]].*|()-][",
tokens: []*token{
newToken(tokenKindBExpOpen, nullChar),
newToken(tokenKindChar, '\\'),
@@ -83,6 +87,7 @@ func TestLexer(t *testing.T) {
newToken(tokenKindAlt, nullChar),
newToken(tokenKindGroupOpen, nullChar),
newToken(tokenKindGroupClose, nullChar),
+ newToken(tokenKindChar, '-'),
newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindBExpOpen, nullChar),
newToken(tokenKindEOF, nullChar),
diff --git a/compiler/parser.go b/compiler/parser.go
index 04cd8ad..bae4e17 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -15,9 +15,9 @@ func (err *SyntaxError) Error() string {
return fmt.Sprintf("Syntax Error: %v", err.message)
}
-func raiseSyntaxError(message string) {
+func raiseSyntaxError(format string, a ...interface{}) {
panic(&SyntaxError{
- message: message,
+ message: fmt.Sprintf(format, a...),
})
}
@@ -170,12 +170,12 @@ func (p *parser) parseSingleChar() astNode {
}
if p.consume(tokenKindBExpOpen) {
defer p.expect(tokenKindBExpClose)
- left := p.parseNormalChar()
+ left := p.parseBExpElem()
if left == nil {
raiseSyntaxError("bracket expression must include at least one character")
}
for {
- right := p.parseNormalChar()
+ right := p.parseBExpElem()
if right == nil {
break
}
@@ -185,6 +185,19 @@ func (p *parser) parseSingleChar() astNode {
}
return p.parseNormalChar()
}
+
+func (p *parser) parseBExpElem() astNode {
+ left := p.parseNormalChar()
+ if !p.consume(tokenKindCharRange) {
+ return left
+ }
+ right := p.parseNormalChar()
+ if right == nil {
+ raiseSyntaxError("invalid range expression")
+ }
+ return genRangeAST(left, right)
+}
+
func (p *parser) parseNormalChar() astNode {
if !p.consume(tokenKindChar) {
return nil
@@ -316,6 +329,694 @@ func genAnyCharAST(tok *token) astNode {
)
}
+func genRangeAST(fromNode, toNode astNode) astNode {
+ from := genByteSeq(fromNode)
+ to := genByteSeq(toNode)
+ if !isValidOrder(from, to) {
+ raiseSyntaxError("range expression with invalid order: [%s-%s] ([%v-%v])", string(from), string(to), from, to)
+ }
+ switch len(from) {
+ case 1:
+ switch len(to) {
+ case 1:
+ return gen1ByteCharRangeAST(from, to)
+ case 2:
+ return newAltNode(
+ gen1ByteCharRangeAST(from, []byte{0x7f}),
+ gen2ByteCharRangeAST([]byte{0xc2, 0x80}, to),
+ )
+ case 3:
+ return newAltNode(
+ newAltNode(
+ gen1ByteCharRangeAST(from, []byte{0x7f}),
+ gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}),
+ ),
+ gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, to),
+ )
+ case 4:
+ return newAltNode(
+ newAltNode(
+ newAltNode(
+ gen1ByteCharRangeAST(from, []byte{0x7f}),
+ gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}),
+ ),
+ gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, []byte{0xef, 0xbf, 0xbf}),
+ ),
+ gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80, 0x80}, to),
+ )
+ }
+ case 2:
+ switch len(to) {
+ case 2:
+ return gen2ByteCharRangeAST(from, to)
+ case 3:
+ return newAltNode(
+ gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}),
+ gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, to),
+ )
+ case 4:
+ return newAltNode(
+ newAltNode(
+ gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}),
+ gen3ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xef, 0xbf, 0xbf}),
+ ),
+ gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80, 0x80}, to),
+ )
+ }
+ case 3:
+ switch len(to) {
+ case 3:
+ return gen3ByteCharRangeAST(from, to)
+ case 4:
+ return newAltNode(
+ gen3ByteCharRangeAST(from, []byte{0xef, 0xbf, 0xbf}),
+ gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80, 0x80}, to),
+ )
+ }
+ case 4:
+ return gen4ByteCharRangeAST(from, to)
+ }
+ panic(fmt.Sprintf("invalid range; from: %v, to: %v", from, to))
+}
+
+func genByteSeq(node astNode) []byte {
+ switch n := node.(type) {
+ case *symbolNode:
+ return []byte{n.from}
+ case *concatNode:
+ seq := genByteSeq(n.left)
+ seq = append(seq, genByteSeq(n.right)...)
+ return seq
+ }
+ panic(fmt.Sprintf("genByteSeq() cannot handle %T: %v", node, node))
+}
+
+func isValidOrder(from, to []byte) bool {
+ if len(from) > len(to) {
+ return false
+ }
+ if len(from) < len(to) {
+ return true
+ }
+ for i, f := range from {
+ t := to[i]
+ if f > t {
+ return false
+ }
+ if f < t {
+ return true
+ }
+ }
+ return true
+}
+
+func gen1ByteCharRangeAST(from, to []byte) astNode {
+ return newRangeSymbolNode(nil, from[0], to[0], symbolPositionNil)
+}
+
+func gen2ByteCharRangeAST(from, to []byte) astNode {
+ from0 := from[0]
+ from1 := from[1]
+ to0 := to[0]
+ to1 := to[1]
+ switch {
+ case from0 == to0 && from1 == to1:
+ return newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ )
+ case from0 == to0:
+ return newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1, to1, symbolPositionNil),
+ )
+ default:
+ alt1 := newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1, 0xbf, symbolPositionNil),
+ )
+ alt2 := newConcatNode(
+ newRangeSymbolNode(nil, from0+1, to0, symbolPositionNil),
+ newRangeSymbolNode(nil, 0x80, to1, symbolPositionNil),
+ )
+ return newAltNode(alt1, alt2)
+ }
+}
+
+type byteBoundsEntry struct {
+ min byte
+ max byte
+}
+
+var (
+ bounds3 = [][]byteBoundsEntry{
+ nil,
+ {{min: 0xe0, max: 0xe0}, {min: 0xa0, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ {{min: 0xe1, max: 0xec}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ {{min: 0xed, max: 0xed}, {min: 0x80, max: 0x9f}, {min: 0x80, max: 0xbf}},
+ {{min: 0xee, max: 0xef}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ }
+
+ bounds4 = [][]byteBoundsEntry{
+ nil,
+ {{min: 0xf0, max: 0xf0}, {min: 0x90, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ {{min: 0xf1, max: 0xf3}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ {{min: 0xf4, max: 0xf4}, {min: 0x80, max: 0x8f}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}},
+ }
+)
+
+func get3ByteCharRangeNum(seq []byte) int {
+ head := seq[0]
+ switch {
+ case head == 0xe0:
+ return 1
+ case head >= 0xe1 && head <= 0xec:
+ return 2
+ case head == 0xed:
+ return 3
+ case head >= 0xee && head <= 0xef:
+ return 4
+ }
+ return 0
+}
+
+func get4ByteCharRangeNum(seq []byte) int {
+ head := seq[0]
+ switch {
+ case head == 0xf0:
+ return 1
+ case head >= 0xf1 && head <= 0xf3:
+ return 2
+ case head == 0xf4:
+ return 3
+ }
+ return 0
+}
+
+func gen3ByteCharRangeAST(from, to []byte) astNode {
+ from0 := from[0]
+ from1 := from[1]
+ from2 := from[2]
+ to0 := to[0]
+ to1 := to[1]
+ to2 := to[2]
+ switch {
+ case from0 == to0 && from1 == to1 && from2 == to2:
+ return newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ )
+ case from0 == to0 && from1 == to1:
+ return newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2, to2, symbolPositionNil),
+ )
+ case from0 == to0:
+ rangeNum := get3ByteCharRangeNum(from)
+ bounds := bounds3[rangeNum]
+ var alt astNode
+ alt = newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2, bounds[2].max, symbolPositionNil),
+ )
+ if from1+1 < to1 {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1+1, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil),
+ ),
+ )
+ return alt
+ default:
+ fromRangeNum := get3ByteCharRangeNum(from)
+ toRangeNum := get3ByteCharRangeNum(to)
+ bounds := bounds3[fromRangeNum]
+ var alt astNode
+ alt = newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2, bounds[2].max, symbolPositionNil),
+ )
+ if from1 < bounds[1].max {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1+1, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ if fromRangeNum == toRangeNum {
+ if from0+1 < to0 {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, from0+1, to0-1, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to1 > bounds[1].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil),
+ ),
+ )
+ return alt
+ }
+ for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ {
+ bounds := bounds3[rangeNum]
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, bounds[0].min, bounds[0].max, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ bounds = bounds3[toRangeNum]
+ if to0 > bounds[0].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, bounds[0].min, to0-1, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to1 > bounds[1].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2, symbolPositionNil),
+ ),
+ )
+ return alt
+ }
+}
+
+func gen4ByteCharRangeAST(from, to []byte) astNode {
+ from0 := from[0]
+ from1 := from[1]
+ from2 := from[2]
+ from3 := from[3]
+ to0 := to[0]
+ to1 := to[1]
+ to2 := to[2]
+ to3 := to[3]
+ switch {
+ case from0 == to0 && from1 == to1 && from2 == to2 && from3 == to3:
+ return newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from3, symbolPositionNil),
+ )
+ case from0 == to0 && from1 == to1 && from2 == to2:
+ return newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from3, to3, symbolPositionNil),
+ )
+ case from0 == to0 && from1 == to1:
+ rangeNum := get4ByteCharRangeNum(from)
+ bounds := bounds4[rangeNum]
+ var alt astNode
+ alt = newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil),
+ )
+ if from2+1 < to2 {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2+1, to2-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, to2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil),
+ ),
+ )
+ return alt
+ case from0 == to0:
+ rangeNum := get4ByteCharRangeNum(from)
+ bounds := bounds4[rangeNum]
+ var alt astNode
+ alt = newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil),
+ )
+ if from2 < bounds[2].max {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2+1, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if from1+1 < to1 {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1+1, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to2 > bounds[2].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, to2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil),
+ ),
+ )
+ return alt
+ default:
+ fromRangeNum := get4ByteCharRangeNum(from)
+ toRangeNum := get4ByteCharRangeNum(to)
+ bounds := bounds4[fromRangeNum]
+ var alt astNode
+ alt = newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, from2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from3, bounds[3].max, symbolPositionNil),
+ )
+ if from2 < bounds[2].max {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newSymbolNode(nil, from1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, from2+1, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if from1 < bounds[1].max {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, from0, symbolPositionNil),
+ newRangeSymbolNode(nil, from1+1, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if fromRangeNum == toRangeNum {
+ if from0+1 < to0 {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, from0+1, to0-1, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to1 > bounds[1].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to2 > bounds[2].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, to2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil),
+ ),
+ )
+ return alt
+ }
+ for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ {
+ bounds := bounds4[rangeNum]
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, bounds[0].min, bounds[0].max, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ bounds = bounds4[toRangeNum]
+ if to0 > bounds[0].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(nil, bounds[0].min, to0-1, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, bounds[1].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to1 > bounds[1].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newRangeSymbolNode(nil, bounds[1].min, to1-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, bounds[2].max, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ if to2 > bounds[2].min {
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[2].min, to2-1, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, bounds[3].max, symbolPositionNil),
+ ),
+ )
+ }
+ alt = newAltNode(
+ alt,
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(nil, to0, symbolPositionNil),
+ newSymbolNode(nil, to1, symbolPositionNil),
+ ),
+ newSymbolNode(nil, to2, symbolPositionNil),
+ ),
+ newRangeSymbolNode(nil, bounds[3].min, to3, symbolPositionNil),
+ ),
+ )
+ return alt
+ }
+}
+
func (p *parser) expect(expected tokenKind) {
if !p.consume(expected) {
tok := p.peekedTok
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 283d5fe..bdf5f03 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -132,6 +132,265 @@ func TestLexer_Next(t *testing.T) {
newEOFToken(),
},
},
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // all 1 byte characters
+ spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"),
+ },
+ },
+ src: string([]byte{
+ 0x00,
+ 0x01,
+ 0x7e,
+ 0x7f,
+ }),
+ tokens: []*Token{
+ newToken(1, "1ByteChar", []byte{0x00}),
+ newToken(1, "1ByteChar", []byte{0x01}),
+ newToken(1, "1ByteChar", []byte{0x7e}),
+ newToken(1, "1ByteChar", []byte{0x7f}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // all 2 byte characters
+ spec.NewLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xc2, 0x80,
+ 0xc2, 0x81,
+ 0xdf, 0xbe,
+ 0xdf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "2ByteChar", []byte{0xc2, 0x80}),
+ newToken(1, "2ByteChar", []byte{0xc2, 0x81}),
+ newToken(1, "2ByteChar", []byte{0xdf, 0xbe}),
+ newToken(1, "2ByteChar", []byte{0xdf, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // All bytes are the same.
+ spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+ },
+ },
+ src: string([]byte{
+ 0xe0, 0xa0, 0x80,
+ }),
+ tokens: []*Token{
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // The first two bytes are the same.
+ spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xe0, 0xa0, 0x80,
+ 0xe0, 0xa0, 0x81,
+ 0xe0, 0xa0, 0xbe,
+ 0xe0, 0xa0, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // The first byte is the same.
+ spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xe0, 0xa0, 0x80,
+ 0xe0, 0xa0, 0x81,
+ 0xe0, 0xbf, 0xbe,
+ 0xe0, 0xbf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // all 3 byte characters
+ spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xe0, 0xa0, 0x80,
+ 0xe0, 0xa0, 0x81,
+ 0xe0, 0xbf, 0xbe,
+ 0xe0, 0xbf, 0xbf,
+ 0xe1, 0x80, 0x80,
+ 0xe1, 0x80, 0x81,
+ 0xec, 0xbf, 0xbe,
+ 0xec, 0xbf, 0xbf,
+ 0xed, 0x80, 0x80,
+ 0xed, 0x80, 0x81,
+ 0xed, 0x9f, 0xbe,
+ 0xed, 0x9f, 0xbf,
+ 0xee, 0x80, 0x80,
+ 0xee, 0x80, 0x81,
+ 0xef, 0xbf, 0xbe,
+ 0xef, 0xbf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+ newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbf}),
+ newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbf}),
+ newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x80}),
+ newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x81}),
+ newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbe}),
+ newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // All bytes are the same.
+ spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+ },
+ },
+ src: string([]byte{
+ 0xf0, 0x90, 0x80, 0x80,
+ }),
+ tokens: []*Token{
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // The first 3 bytes are the same.
+ spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xf0, 0x90, 0x80, 0x80,
+ 0xf0, 0x90, 0x80, 0x81,
+ 0xf0, 0x90, 0x80, 0xbe,
+ 0xf0, 0x90, 0x80, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // The first 2 bytes are the same.
+ spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xf0, 0x90, 0x80, 0x80,
+ 0xf0, 0x90, 0x80, 0x81,
+ 0xf0, 0x90, 0xbf, 0xbe,
+ 0xf0, 0x90, 0xbf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // The first byte is the same.
+ spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xf0, 0x90, 0x80, 0x80,
+ 0xf0, 0x90, 0x80, 0x81,
+ 0xf0, 0xbf, 0xbf, 0xbe,
+ 0xf0, 0xbf, 0xbf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+ newEOFToken(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // all 4 byte characters
+ spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+ },
+ },
+ src: string([]byte{
+ 0xf0, 0x90, 0x80, 0x80,
+ 0xf0, 0x90, 0x80, 0x81,
+ 0xf0, 0xbf, 0xbf, 0xbe,
+ 0xf0, 0xbf, 0xbf, 0xbf,
+ 0xf1, 0x80, 0x80, 0x80,
+ 0xf1, 0x80, 0x80, 0x81,
+ 0xf3, 0xbf, 0xbf, 0xbe,
+ 0xf3, 0xbf, 0xbf, 0xbf,
+ 0xf4, 0x80, 0x80, 0x80,
+ 0xf4, 0x80, 0x80, 0x81,
+ 0xf4, 0x8f, 0xbf, 0xbe,
+ 0xf4, 0x8f, 0xbf, 0xbf,
+ }),
+ tokens: []*Token{
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+ newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbf}),
+ newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x80}),
+ newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x81}),
+ newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbe}),
+ newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbf}),
+ newEOFToken(),
+ },
+ },
}
for _, tt := range test {
clspec, err := compiler.Compile(tt.lspec)
@@ -149,7 +408,7 @@ func TestLexer_Next(t *testing.T) {
break
}
testToken(t, eTok, tok)
- t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
+ // t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
if tok.EOF {
break
}