aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/ast.go9
-rw-r--r--compiler/byte.go281
-rw-r--r--compiler/byte_test.go263
-rw-r--r--compiler/lexer.go118
-rw-r--r--compiler/lexer_test.go32
-rw-r--r--compiler/parser.go97
-rw-r--r--driver/lexer_test.go23
7 files changed, 786 insertions, 37 deletions
diff --git a/compiler/ast.go b/compiler/ast.go
index f0181f5..0b72f8d 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -119,15 +119,6 @@ func (s symbolPositionSet) sort() []symbolPosition {
return sorted
}
-type byteRange struct {
- from byte
- to byte
-}
-
-func (r byteRange) String() string {
- return fmt.Sprintf("%v - %v", r.from, r.to)
-}
-
type astNode interface {
fmt.Stringer
children() (astNode, astNode)
diff --git a/compiler/byte.go b/compiler/byte.go
new file mode 100644
index 0000000..464887e
--- /dev/null
+++ b/compiler/byte.go
@@ -0,0 +1,281 @@
+package compiler
+
+import "fmt"
+
+type byteRange struct {
+ from byte
+ to byte
+}
+
+func newByteRange(from, to byte) byteRange {
+ return byteRange{
+ from: from,
+ to: to,
+ }
+}
+
+func (r byteRange) String() string {
+ return fmt.Sprintf("%X-%X (%v-%v)", r.from, r.to, r.from, r.to)
+}
+
+func newByteRangeSequence(from, to []byte) ([]byteRange, error) {
+ if len(from) != len(to) {
+ return nil, fmt.Errorf("length of `from` and `to` are mismatched; from: %+v, to: %+v", from, to)
+ }
+ seq := []byteRange{}
+ for i := 0; i < len(from); i++ {
+ seq = append(seq, newByteRange(from[i], to[i]))
+ }
+ return seq, nil
+}
+
+func excludeByteRangeSequence(b1, b2 []byteRange) [][]byteRange {
+ if len(b1) != len(b2) {
+ // no overlapping
+ return [][]byteRange{b2}
+ }
+ switch len(b1) {
+ case 1:
+ return exclude1Byte(b1, b2)
+ case 2:
+ return exclude2Byte(b1, b2)
+ case 3:
+ return exclude3Byte(b1, b2)
+ case 4:
+ return exclude4Byte(b1, b2)
+ }
+ panic(fmt.Errorf("excludeByteRangeSequence can only handle sequences up to 4 bytes in size; b1: %+v, b2: %+v", b1, b2))
+}
+
+func exclude1Byte(b1, b2 []byteRange) [][]byteRange {
+ r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+ if !overlapping0 {
+ // no overlapping
+ return [][]byteRange{b2}
+ }
+ result := [][]byteRange{}
+ {
+ if r01 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r01.from, r01.to),
+ })
+ }
+ if r02 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r02.from, r02.to),
+ })
+ }
+ }
+ return result
+}
+
+func exclude2Byte(b1, b2 []byteRange) [][]byteRange {
+ r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+ r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+ if !overlapping0 || !overlapping1 {
+ // no overlapping
+ return [][]byteRange{b2}
+ }
+ result := [][]byteRange{}
+ {
+ if r11 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r11.from, r11.to),
+ })
+ }
+ if r12 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r12.from, r12.to),
+ })
+ }
+ if r01 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r01.from, r01.to),
+ newByteRange(b2[1].from, b2[1].to),
+ })
+ }
+ if r02 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r02.from, r02.to),
+ newByteRange(b2[1].from, b2[1].to),
+ })
+ }
+ }
+ return result
+}
+
+func exclude3Byte(b1, b2 []byteRange) [][]byteRange {
+ r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+ r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+ r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
+ if !overlapping0 || !overlapping1 || !overlapping2 {
+ // no overlapping
+ return [][]byteRange{b2}
+ }
+ result := [][]byteRange{}
+ {
+ if r21 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(r21.from, r21.to),
+ })
+ }
+ if r22 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(r22.from, r22.to),
+ })
+ }
+ if r11 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r11.from, r11.to),
+ newByteRange(b2[2].from, b2[2].to),
+ })
+ }
+ if r12 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r12.from, r12.to),
+ newByteRange(b2[2].from, b2[2].to),
+ })
+ }
+ if r01 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r01.from, r01.to),
+ newByteRange(b2[1].from, b2[1].to),
+ newByteRange(b2[2].from, b2[2].to),
+ })
+ }
+ if r02 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r02.from, r02.to),
+ newByteRange(b2[1].from, b2[1].to),
+ newByteRange(b2[2].from, b2[2].to),
+ })
+ }
+ }
+ return result
+}
+
+func exclude4Byte(b1, b2 []byteRange) [][]byteRange {
+ r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+ r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+ r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
+ r31, r32, overlapping3 := excludeByteRange(&b1[3], &b2[3])
+ if !overlapping0 || !overlapping1 || !overlapping2 || !overlapping3 {
+ // no overlapping
+ return [][]byteRange{b2}
+ }
+ result := [][]byteRange{}
+ {
+ if r31 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(b1[2].from, b1[2].to),
+ newByteRange(r31.from, r31.to),
+ })
+ }
+ if r32 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(b1[2].from, b1[2].to),
+ newByteRange(r32.from, r32.to),
+ })
+ }
+ if r21 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(r21.from, r21.to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ if r22 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(b1[1].from, b1[1].to),
+ newByteRange(r22.from, r22.to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ if r11 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r11.from, r11.to),
+ newByteRange(b2[2].from, b2[2].to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ if r12 != nil {
+ result = append(result, []byteRange{
+ newByteRange(b1[0].from, b1[0].to),
+ newByteRange(r12.from, r12.to),
+ newByteRange(b2[2].from, b2[2].to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ if r01 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r01.from, r01.to),
+ newByteRange(b2[1].from, b2[1].to),
+ newByteRange(b2[2].from, b2[2].to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ if r02 != nil {
+ result = append(result, []byteRange{
+ newByteRange(r02.from, r02.to),
+ newByteRange(b2[1].from, b2[1].to),
+ newByteRange(b2[2].from, b2[2].to),
+ newByteRange(b2[3].from, b2[3].to),
+ })
+ }
+ }
+ return result
+}
+
+// excludeByteRange excludes `r1` from `r2`.
+func excludeByteRange(r1, r2 *byteRange) (*byteRange, *byteRange, bool) {
+ if r1.from <= r2.from {
+ if r1.to < r2.from {
+ // no overlapping
+ return nil, nil, false
+ }
+ if r1.to < r2.to {
+ // The beginning of `r2` overlaps with `r1`.
+ return &byteRange{
+ from: r1.to + 1,
+ to: r2.to,
+ }, nil, true
+ }
+ // `r1` covers `r2` entirely.
+ return nil, nil, true
+ }
+ if r1.from > r2.to {
+ // no overlapping
+ return nil, nil, false
+ }
+ if r1.to >= r2.to {
+ // The end of `r2` overlaps with `r1`.
+ return &byteRange{
+ from: r2.from,
+ to: r1.from - 1,
+ }, nil, true
+ }
+ // `r2` overlaps with `r1` entirely.
+ return &byteRange{
+ from: r2.from,
+ to: r1.from - 1,
+ },
+ &byteRange{
+ from: r1.to + 1,
+ to: r2.to,
+ }, true
+}
diff --git a/compiler/byte_test.go b/compiler/byte_test.go
new file mode 100644
index 0000000..c298501
--- /dev/null
+++ b/compiler/byte_test.go
@@ -0,0 +1,263 @@
+package compiler
+
+import "testing"
+
+func TestExcludeByteRangeSequence(t *testing.T) {
+ tests := []struct {
+ b1 []byteRange
+ b2 []byteRange
+ excluded [][]byteRange
+ }{
+ // 1 Byte
+ {
+ b1: newByteRangeSeq([]byte{0x00}, []byte{0x00}),
+ b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x01}, []byte{0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xff}, []byte{0xff}),
+ b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00}, []byte{0xfe}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
+ b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00}, []byte{0xbf}),
+ newByteRangeSeq([]byte{0xd0}, []byte{0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ b2: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
+ excluded: nil,
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+ excluded: nil,
+ },
+
+ // 2 Byte
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0x00, 0x00}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x01}, []byte{0x00, 0xff}),
+ newByteRangeSeq([]byte{0x01, 0x00}, []byte{0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xff, 0xff}, []byte{0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xfe, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0x00}, []byte{0xff, 0xfe}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xbf}),
+ newByteRangeSeq([]byte{0xc0, 0xc1}, []byte{0xc0, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
+ excluded: nil,
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+ excluded: nil,
+ },
+
+ // 3 Byte
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0xff}),
+ newByteRangeSeq([]byte{0x00, 0x01, 0x00}, []byte{0x00, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0x01, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xfe}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xbf}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
+ excluded: nil,
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+ excluded: nil,
+ },
+
+ // 4 Byte
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00, 0x00}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0x00, 0xff}),
+ newByteRangeSeq([]byte{0x00, 0x00, 0x01, 0x00}, []byte{0x00, 0x00, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0x00, 0x01, 0x00, 0x00}, []byte{0x00, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0x01, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0x00, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0xff, 0x00, 0x00}, []byte{0xff, 0xff, 0xfe, 0xff}),
+ newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xff, 0xfe}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xbf}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: [][]byteRange{
+ newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+ newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ },
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
+ excluded: nil,
+ },
+ {
+ b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+ excluded: nil,
+ },
+ }
+ for _, tt := range tests {
+ excluded := excludeByteRangeSequence(tt.b1, tt.b2)
+ t.Logf("b1: %v, b2: %v", tt.b1, tt.b2)
+ t.Logf("excluded: %+v", excluded)
+ if len(excluded) != len(tt.excluded) {
+ t.Errorf("unexpected results; expected: %+v, actual: %+v", tt.excluded, excluded)
+ }
+ for _, expectedSeq := range tt.excluded {
+ found := false
+ for _, actualSeq := range excluded {
+ mismatched := false
+ for i := 0; i < len(expectedSeq); i++ {
+ if actualSeq[i] != expectedSeq[i] {
+ mismatched = true
+ break
+ }
+ }
+ if mismatched {
+ continue
+ }
+ found = true
+ break
+ }
+ if !found {
+ t.Errorf("an expected byte range sequence was not found: %+v", expectedSeq)
+ }
+ }
+ }
+}
+
+func newByteRangeSeq(from, to []byte) []byteRange {
+ seq, err := newByteRangeSequence(from, to)
+ if err != nil {
+ panic(err)
+ }
+ return seq
+}
diff --git a/compiler/lexer.go b/compiler/lexer.go
index acc9658..dba667f 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -18,6 +18,7 @@ const (
tokenKindGroupOpen = tokenKind("(")
tokenKindGroupClose = tokenKind(")")
tokenKindBExpOpen = tokenKind("[")
+ tokenKindInverseBExpOpen = tokenKind("[^")
tokenKindBExpClose = tokenKind("]")
tokenKindCharRange = tokenKind("-")
tokenKindEOF = tokenKind("eof")
@@ -46,18 +47,32 @@ const (
type lexer struct {
src *bufio.Reader
+ peekChar2 rune
+ peekEOF2 bool
+ peekChar1 rune
+ peekEOF1 bool
lastChar rune
- prevChar rune
reachedEOF bool
+ prevChar1 rune
+ prevEOF1 bool
+ prevChar2 rune
+ prevEOF2 bool
 mode lexerMode
 }
 func newLexer(src io.Reader) *lexer {
 return &lexer{
 src: bufio.NewReader(src),
+ peekChar2: nullChar,
+ peekEOF2: false,
+ peekChar1: nullChar,
+ peekEOF1: false,
 lastChar: nullChar,
- prevChar: nullChar,
 reachedEOF: false,
+ prevChar1: nullChar,
+ prevEOF1: false,
+ prevChar2: nullChar,
+ prevEOF2: false,
 mode: lexerModeDefault,
 }
 }
@@ -97,6 +112,50 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
 return newToken(tokenKindGroupClose, nullChar), nil
 case '[':
 l.mode = lexerModeBExp
+ c1, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ if c1 != '^' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ c2, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ if c2 != ']' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
 return newToken(tokenKindBExpOpen, nullChar), nil
 case ']':
 return newToken(tokenKindBExpClose, nullChar), nil
@@ -141,7 +200,7 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 }
 }
 switch {
- case c == '\\' || c == '-' || c == ']':
+ case c == '\\' || c == '^' || c == '-' || c == ']':
 return newToken(tokenKindChar, c), nil
 default:
 return nil, &SyntaxError{
@@ -154,32 +213,57 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 }
 func (l *lexer) read() (rune, bool, error) {
+ if l.reachedEOF {
+ return l.lastChar, l.reachedEOF, nil
+ }
+ if l.peekChar1 != nullChar || l.peekEOF1 {
+ l.prevChar2 = l.prevChar1
+ l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = l.peekChar1
+ l.reachedEOF = l.peekEOF1
+ l.peekChar1 = l.peekChar2
+ l.peekEOF1 = l.peekEOF2
+ l.peekChar2 = nullChar
+ l.peekEOF2 = false
+ return l.lastChar, l.reachedEOF, nil
+ }
 c, _, err := l.src.ReadRune()
 if err != nil {
 if err == io.EOF {
- l.prevChar = l.lastChar
+ l.prevChar2 = l.prevChar1
+ l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
 l.lastChar = nullChar
 l.reachedEOF = true
- return nullChar, true, nil
+ return l.lastChar, l.reachedEOF, nil
 }
 return nullChar, false, err
 }
- l.prevChar = l.lastChar
+ l.prevChar2 = l.prevChar1
+ l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
 l.lastChar = c
- return c, false, nil
+ l.reachedEOF = false
+ return l.lastChar, l.reachedEOF, nil
 }
 func (l *lexer) restore() error {
- if l.reachedEOF {
- l.lastChar = l.prevChar
- l.prevChar = nullChar
- l.reachedEOF = false
- return l.src.UnreadRune()
- }
- if l.lastChar == nullChar {
+ if l.lastChar == nullChar && !l.reachedEOF {
 return fmt.Errorf("the lexer failed to call restore() because the last character is null")
 }
- l.lastChar = l.prevChar
- l.prevChar = nullChar
- return l.src.UnreadRune()
+ l.peekChar2 = l.peekChar1
+ l.peekEOF2 = l.peekEOF1
+ l.peekChar1 = l.lastChar
+ l.peekEOF1 = l.reachedEOF
+ l.lastChar = l.prevChar1
+ l.reachedEOF = l.prevEOF1
+ l.prevChar1 = l.prevChar2
+ l.prevEOF1 = l.prevEOF2
+ l.prevChar2 = nullChar
+ l.prevEOF2 = false
+ return nil
}
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index be00317..0dfe2f7 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,7 +31,7 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the special characters",
- src: ".*+?|()[-]",
+ src: ".*+?|()[-][^^]",
tokens: []*token{
newToken(tokenKindAnyChar, nullChar),
newToken(tokenKindRepeat, nullChar),
@@ -43,12 +43,15 @@ func TestLexer(t *testing.T) {
newToken(tokenKindBExpOpen, nullChar),
newToken(tokenKindCharRange, nullChar),
newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
},
},
{
caption: "lexer can recognize the escape sequences",
- src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\-]",
+ src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\^\\-]",
tokens: []*token{
newToken(tokenKindChar, '\\'),
newToken(tokenKindChar, '.'),
@@ -61,6 +64,7 @@ func TestLexer(t *testing.T) {
newToken(tokenKindChar, '['),
newToken(tokenKindChar, ']'),
newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
newToken(tokenKindChar, '-'),
newToken(tokenKindBExpClose, nullChar),
newToken(tokenKindEOF, nullChar),
@@ -94,6 +98,30 @@ func TestLexer(t *testing.T) {
},
},
{
+ caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
+ // [^...^...][^]
+ // ~~ ~ ~~
+ // ^ ^ ^^
+ // | | |`-- Ordinary Character (c)
+ // | | `-- Bracket Expression
+ // | `-- Ordinary Character (b)
+ // `-- Inverse Bracket Expression (a)
+ //
+ // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
+ // b. Caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
+ // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
+ src: "[^^][^]",
+ tokens: []*token{
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
caption: "lexer raises an error when an invalid escape sequence appears",
src: "\\@",
err: &SyntaxError{},
diff --git a/compiler/parser.go b/compiler/parser.go
index cdc4817..d5c7b3d 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -183,6 +183,28 @@ func (p *parser) parseSingleChar() astNode {
}
return left
}
+ if p.consume(tokenKindInverseBExpOpen) {
+ defer p.expect(tokenKindBExpClose)
+ elem := p.parseBExpElem()
+ if elem == nil {
+ raiseSyntaxError("bracket expression must include at least one character")
+ }
+ inverse := exclude(elem, genAnyCharAST())
+ if inverse == nil {
+ panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ }
+ for {
+ elem := p.parseBExpElem()
+ if elem == nil {
+ break
+ }
+ inverse = exclude(elem, inverse)
+ if inverse == nil {
+ panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ }
+ }
+ return inverse
+ }
return p.parseNormalChar()
}
@@ -228,6 +250,38 @@ func (p *parser) parseNormalChar() astNode {
}
}
+func exclude(symbol, base astNode) astNode {
+ switch base.(type) {
+ case *altNode:
+ left, right := base.children()
+ return genAltNode(
+ exclude(symbol, left),
+ exclude(symbol, right),
+ )
+ case *symbolNode:
+ baseSeq := genByteRangeSeq(base)
+ symSeq := genByteRangeSeq(symbol)
+ excluded := excludeByteRangeSequence(symSeq, baseSeq)
+ if len(excluded) <= 0 {
+ return nil
+ }
+ return convertByteRangeSeqsToAST(excluded)
+ }
+ return nil
+}
+
+func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode {
+ concats := []astNode{}
+ for _, seq := range seqs {
+ syms := []astNode{}
+ for _, elem := range seq {
+ syms = append(syms, newRangeSymbolNode(elem.from, elem.to))
+ }
+ concats = append(concats, genConcatNode(syms...))
+ }
+ return genAltNode(concats...)
+}
+
// Refelences:
// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
// * Table 3-6. UTF-8 Bit Distribution
@@ -363,6 +417,18 @@ func genByteSeq(node astNode) []byte {
panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node))
}
+func genByteRangeSeq(node astNode) []byteRange {
+ switch n := node.(type) {
+ case *symbolNode:
+ return []byteRange{{from: n.from, to: n.to}}
+ case *concatNode:
+ seq := genByteRangeSeq(n.left)
+ seq = append(seq, genByteRangeSeq(n.right)...)
+ return seq
+ }
+ panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node))
+}
+
func isValidOrder(from, to []byte) bool {
if len(from) > len(to) {
return false
@@ -853,17 +919,36 @@ func gen4ByteCharRangeAST(from, to []byte) astNode {
}
}
-func genConcatNode(c1, c2 astNode, cn ...astNode) *concatNode {
- concat := newConcatNode(c1, c2)
- for _, c := range cn {
+func genConcatNode(cs ...astNode) astNode {
+ if len(cs) <= 0 {
+ return nil
+ }
+ if len(cs) == 1 {
+ return cs[0]
+ }
+ concat := newConcatNode(cs[0], cs[1])
+ for _, c := range cs[2:] {
concat = newConcatNode(concat, c)
}
return concat
}
-func genAltNode(c1, c2 astNode, cn ...astNode) *altNode {
- alt := newAltNode(c1, c2)
- for _, c := range cn {
+func genAltNode(cs ...astNode) astNode {
+ nonNilNodes := []astNode{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
alt = newAltNode(alt, c)
}
return alt
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index bdf5f03..0c9f720 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -136,18 +136,23 @@ func TestLexer_Next(t *testing.T) {
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 1 byte characters
- spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"),
+ //
+ // NOTE:
+ // maleeni cannot handle the null character in patterns because compiler.lexer,
+ // specifically read() and restore(), treats the null character as meaning that no symbol exists.
+ // There is room for improvement in this behavior of the lexer.
+ spec.NewLexEntry("1ByteChar", "[\x01-\x7f]"),
},
},
src: string([]byte{
- 0x00,
0x01,
+ 0x02,
0x7e,
0x7f,
}),
tokens: []*Token{
- newToken(1, "1ByteChar", []byte{0x00}),
newToken(1, "1ByteChar", []byte{0x01}),
+ newToken(1, "1ByteChar", []byte{0x02}),
newToken(1, "1ByteChar", []byte{0x7e}),
newToken(1, "1ByteChar", []byte{0x7f}),
newEOFToken(),
@@ -391,6 +396,18 @@ func TestLexer_Next(t *testing.T) {
newEOFToken(),
},
},
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ spec.NewLexEntry("NonNumber", "[^0-9]+[0-9]"),
+ },
+ },
+ src: "foo9",
+ tokens: []*Token{
+ newToken(1, "NonNumber", []byte("foo9")),
+ newEOFToken(),
+ },
+ },
}
for _, tt := range test {
clspec, err := compiler.Compile(tt.lspec)