-rw-r--r--  compiler/ast.go          9
-rw-r--r--  compiler/byte.go       281
-rw-r--r--  compiler/byte_test.go  263
-rw-r--r--  compiler/lexer.go      118
-rw-r--r--  compiler/lexer_test.go  32
-rw-r--r--  compiler/parser.go      97
-rw-r--r--  driver/lexer_test.go    23
7 files changed, 786 insertions(+), 37 deletions(-)
diff --git a/compiler/ast.go b/compiler/ast.go
index f0181f5..0b72f8d 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -119,15 +119,6 @@ func (s symbolPositionSet) sort() []symbolPosition {
 	return sorted
 }
 
-type byteRange struct {
-	from byte
-	to   byte
-}
-
-func (r byteRange) String() string {
-	return fmt.Sprintf("%v - %v", r.from, r.to)
-}
-
 type astNode interface {
 	fmt.Stringer
 	children() (astNode, astNode)
diff --git a/compiler/byte.go b/compiler/byte.go
new file mode 100644
index 0000000..464887e
--- /dev/null
+++ b/compiler/byte.go
@@ -0,0 +1,281 @@
+package compiler
+
+import "fmt"
+
+type byteRange struct {
+	from byte
+	to   byte
+}
+
+func newByteRange(from, to byte) byteRange {
+	return byteRange{
+		from: from,
+		to:   to,
+	}
+}
+
+func (r byteRange) String() string {
+	return fmt.Sprintf("%X-%X (%v-%v)", r.from, r.to, r.from, r.to)
+}
+
+func newByteRangeSequence(from, to []byte) ([]byteRange, error) {
+	if len(from) != len(to) {
+		return nil, fmt.Errorf("lengths of `from` and `to` are mismatched; from: %+v, to: %+v", from, to)
+	}
+	seq := []byteRange{}
+	for i := 0; i < len(from); i++ {
+		seq = append(seq, newByteRange(from[i], to[i]))
+	}
+	return seq, nil
+}
+
+func excludeByteRangeSequence(b1, b2 []byteRange) [][]byteRange {
+	if len(b1) != len(b2) {
+		// no overlapping
+		return [][]byteRange{b2}
+	}
+	switch len(b1) {
+	case 1:
+		return exclude1Byte(b1, b2)
+	case 2:
+		return exclude2Byte(b1, b2)
+	case 3:
+		return exclude3Byte(b1, b2)
+	case 4:
+		return exclude4Byte(b1, b2)
+	}
+	panic(fmt.Errorf("excludeByteRangeSequence can only handle sequences up to 4 bytes in size; b1: %+v, b2: %+v", b1, b2))
+}
+
+func exclude1Byte(b1, b2 []byteRange) [][]byteRange {
+	r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+	if !overlapping0 {
+		// no overlapping
+		return [][]byteRange{b2}
+	}
+	result := [][]byteRange{}
+	{
+		if r01 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r01.from, r01.to),
+			})
+		}
+		if r02 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r02.from, r02.to),
+			})
+		}
+	}
+	return result
+}
+
+func exclude2Byte(b1, b2 []byteRange) [][]byteRange {
+	r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+	r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+	if !overlapping0 || !overlapping1 {
+		// no overlapping
+		return [][]byteRange{b2}
+	}
+	result := [][]byteRange{}
+	{
+		if r11 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r11.from, r11.to),
+			})
+		}
+		if r12 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r12.from, r12.to),
+			})
+		}
+		if r01 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r01.from, r01.to),
+				newByteRange(b2[1].from, b2[1].to),
+			})
+		}
+		if r02 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r02.from, r02.to),
+				newByteRange(b2[1].from, b2[1].to),
+			})
+		}
+	}
+	return result
+}
+
+func exclude3Byte(b1, b2 []byteRange) [][]byteRange {
+	r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+	r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+	r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
+	if !overlapping0 || !overlapping1 || !overlapping2 {
+		// no overlapping
+		return [][]byteRange{b2}
+	}
+	result := [][]byteRange{}
+	{
+		if r21 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(r21.from, r21.to),
+			})
+		}
+		if r22 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(r22.from, r22.to),
+			})
+		}
+		if r11 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r11.from, r11.to),
+				newByteRange(b2[2].from, b2[2].to),
+			})
+		}
+		if r12 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r12.from, r12.to),
+				newByteRange(b2[2].from, b2[2].to),
+			})
+		}
+		if r01 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r01.from, r01.to),
+				newByteRange(b2[1].from, b2[1].to),
+				newByteRange(b2[2].from, b2[2].to),
+			})
+		}
+		if r02 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r02.from, r02.to),
+				newByteRange(b2[1].from, b2[1].to),
+				newByteRange(b2[2].from, b2[2].to),
+			})
+		}
+	}
+	return result
+}
+
+func exclude4Byte(b1, b2 []byteRange) [][]byteRange {
+	r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
+	r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
+	r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
+	r31, r32, overlapping3 := excludeByteRange(&b1[3], &b2[3])
+	if !overlapping0 || !overlapping1 || !overlapping2 || !overlapping3 {
+		// no overlapping
+		return [][]byteRange{b2}
+	}
+	result := [][]byteRange{}
+	{
+		if r31 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(b1[2].from, b1[2].to),
+				newByteRange(r31.from, r31.to),
+			})
+		}
+		if r32 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(b1[2].from, b1[2].to),
+				newByteRange(r32.from, r32.to),
+			})
+		}
+		if r21 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(r21.from, r21.to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+		if r22 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(b1[1].from, b1[1].to),
+				newByteRange(r22.from, r22.to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+		if r11 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r11.from, r11.to),
+				newByteRange(b2[2].from, b2[2].to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+		if r12 != nil {
+			result = append(result, []byteRange{
+				newByteRange(b1[0].from, b1[0].to),
+				newByteRange(r12.from, r12.to),
+				newByteRange(b2[2].from, b2[2].to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+		if r01 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r01.from, r01.to),
+				newByteRange(b2[1].from, b2[1].to),
+				newByteRange(b2[2].from, b2[2].to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+		if r02 != nil {
+			result = append(result, []byteRange{
+				newByteRange(r02.from, r02.to),
+				newByteRange(b2[1].from, b2[1].to),
+				newByteRange(b2[2].from, b2[2].to),
+				newByteRange(b2[3].from, b2[3].to),
+			})
+		}
+	}
+	return result
+}
+
+// excludeByteRange excludes `r1` from `r2`.
+func excludeByteRange(r1, r2 *byteRange) (*byteRange, *byteRange, bool) {
+	if r1.from <= r2.from {
+		if r1.to < r2.from {
+			// no overlapping
+			return nil, nil, false
+		}
+		if r1.to < r2.to {
+			// The beginning of `r2` overlaps with `r1`.
+			return &byteRange{
+				from: r1.to + 1,
+				to:   r2.to,
+			}, nil, true
+		}
+		// `r1` covers `r2` entirely.
+		return nil, nil, true
+	}
+	if r1.from > r2.to {
+		// no overlapping
+		return nil, nil, false
+	}
+	if r1.to >= r2.to {
+		// The end of `r2` overlaps with `r1`.
+		return &byteRange{
+			from: r2.from,
+			to:   r1.from - 1,
+		}, nil, true
+	}
+	// `r1` is contained in `r2`, splitting it into two ranges.
+	return &byteRange{
+			from: r2.from,
+			to:   r1.from - 1,
+		},
+		&byteRange{
+			from: r1.to + 1,
+			to:   r2.to,
+		}, true
+}
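For intuition, excludeByteRangeSequence(b1, b2) subtracts b1 from b2 position by position, so the complement of a byte range comes out as at most two new ranges per position. A minimal sketch of the 1-byte case (hypothetical demo code, not part of the change; it would have to sit inside package compiler because the helpers are unexported):

package compiler

import "fmt"

// demoExclude1Byte shows that excluding the digit bytes 0x30-0x39 from the
// full range 0x00-0xff leaves exactly two ranges: 0x00-0x2f and 0x3a-0xff.
func demoExclude1Byte() {
	digits, _ := newByteRangeSequence([]byte{0x30}, []byte{0x39})
	all, _ := newByteRangeSequence([]byte{0x00}, []byte{0xff})
	for _, seq := range excludeByteRangeSequence(digits, all) {
		// byteRange.String prints hex and decimal, so this emits:
		//   [0-2F (0-47)]
		//   [3A-FF (58-255)]
		fmt.Println(seq)
	}
}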
diff --git a/compiler/byte_test.go b/compiler/byte_test.go
new file mode 100644
index 0000000..c298501
--- /dev/null
+++ b/compiler/byte_test.go
@@ -0,0 +1,263 @@
+package compiler
+
+import "testing"
+
+func TestExcludeByteRangeSequence(t *testing.T) {
+	tests := []struct {
+		b1       []byteRange
+		b2       []byteRange
+		excluded [][]byteRange
+	}{
+		// 1 Byte
+		{
+			b1: newByteRangeSeq([]byte{0x00}, []byte{0x00}),
+			b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x01}, []byte{0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xff}, []byte{0xff}),
+			b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00}, []byte{0xfe}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
+			b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00}, []byte{0xbf}),
+				newByteRangeSeq([]byte{0xd0}, []byte{0xff}),
+			},
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			b2:       newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
+			excluded: nil,
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			b2:       newByteRangeSeq([]byte{0x00}, []byte{0xff}),
+			excluded: nil,
+		},
+
+		// 2 Byte
+		{
+			b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0x00, 0x00}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x01}, []byte{0x00, 0xff}),
+				newByteRangeSeq([]byte{0x01, 0x00}, []byte{0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xff, 0xff}, []byte{0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xfe, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0x00}, []byte{0xff, 0xfe}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xbf}),
+				newByteRangeSeq([]byte{0xc0, 0xc1}, []byte{0xc0, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
+			},
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
+			excluded: nil,
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
+			excluded: nil,
+		},
+
+		// 3 Byte
+		{
+			b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0xff}),
+				newByteRangeSeq([]byte{0x00, 0x01, 0x00}, []byte{0x00, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0x01, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xfe}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xbf}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
+			excluded: nil,
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
+			excluded: nil,
+		},
+
+		// 4 Byte
+		{
+			b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00, 0x00}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0x00, 0xff}),
+				newByteRangeSeq([]byte{0x00, 0x00, 0x01, 0x00}, []byte{0x00, 0x00, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0x00, 0x01, 0x00, 0x00}, []byte{0x00, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0x01, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0x00, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0xff, 0x00, 0x00}, []byte{0xff, 0xff, 0xfe, 0xff}),
+				newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xff, 0xfe}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xbf}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
+			b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: [][]byteRange{
+				newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
+				newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			},
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
+			excluded: nil,
+		},
+		{
+			b1:       newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			b2:       newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
+			excluded: nil,
+		},
+	}
+	for _, tt := range tests {
+		excluded := excludeByteRangeSequence(tt.b1, tt.b2)
+		t.Logf("b1: %v, b2: %v", tt.b1, tt.b2)
+		t.Logf("excluded: %+v", excluded)
+		if len(excluded) != len(tt.excluded) {
+			t.Errorf("unexpected results; expected: %+v, actual: %+v", tt.excluded, excluded)
+		}
+		for _, expectedSeq := range tt.excluded {
+			found := false
+			for _, actualSeq := range excluded {
+				mismatched := false
+				for i := 0; i < len(expectedSeq); i++ {
+					if actualSeq[i] != expectedSeq[i] {
+						mismatched = true
+						break
+					}
+				}
+				if mismatched {
+					continue
+				}
+				found = true
+				break
+			}
+			if !found {
+				t.Errorf("an expected byte range sequence was not found: %+v", expectedSeq)
+			}
+		}
+	}
+}
+
+func newByteRangeSeq(from, to []byte) []byteRange {
+	seq, err := newByteRangeSequence(from, to)
+	if err != nil {
+		panic(err)
+	}
+	return seq
+}
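A hedged reading of the multi-byte cases above: the complement of a sequence is enumerated by fixing an ever-longer prefix of b1 and taking the one-byte complement at the next position, while every later position keeps b2's full range. For example, removing the single sequence C0 C0 from the plane {00-FF, 00-FF} yields exactly the four sequences the 2-byte test expects:

    00-BF  00-FF   (first byte differs, below)
    C0     00-BF   (first byte matches, second below)
    C0     C1-FF   (first byte matches, second above)
    C1-FF  00-FF   (first byte differs, above)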
diff --git a/compiler/lexer.go b/compiler/lexer.go
index acc9658..dba667f 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -18,6 +18,7 @@ const (
 	tokenKindGroupOpen       = tokenKind("(")
 	tokenKindGroupClose      = tokenKind(")")
 	tokenKindBExpOpen        = tokenKind("[")
+	tokenKindInverseBExpOpen = tokenKind("[^")
 	tokenKindBExpClose       = tokenKind("]")
 	tokenKindCharRange       = tokenKind("-")
 	tokenKindEOF             = tokenKind("eof")
@@ -46,18 +47,32 @@ const (
 
 type lexer struct {
 	src        *bufio.Reader
+	peekChar2  rune
+	peekEOF2   bool
+	peekChar1  rune
+	peekEOF1   bool
 	lastChar   rune
-	prevChar   rune
 	reachedEOF bool
+	prevChar1  rune
+	prevEOF1   bool
+	prevChar2  rune
+	prevEOF2   bool
 	mode       lexerMode
 }
 
 func newLexer(src io.Reader) *lexer {
 	return &lexer{
 		src:        bufio.NewReader(src),
+		peekChar2:  nullChar,
+		peekEOF2:   false,
+		peekChar1:  nullChar,
+		peekEOF1:   false,
 		lastChar:   nullChar,
-		prevChar:   nullChar,
 		reachedEOF: false,
+		prevChar1:  nullChar,
+		prevEOF1:   false,
+		prevChar2:  nullChar,
+		prevEOF2:   false,
 		mode:       lexerModeDefault,
 	}
 }
@@ -97,6 +112,50 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
 		return newToken(tokenKindGroupClose, nullChar), nil
 	case '[':
 		l.mode = lexerModeBExp
+		c1, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof {
+			err := l.restore()
+			if err != nil {
+				return nil, err
+			}
+			return newToken(tokenKindBExpOpen, nullChar), nil
+		}
+		if c1 != '^' {
+			err := l.restore()
+			if err != nil {
+				return nil, err
+			}
+			return newToken(tokenKindBExpOpen, nullChar), nil
+		}
+		c2, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof {
+			err := l.restore()
+			if err != nil {
+				return nil, err
+			}
+			return newToken(tokenKindInverseBExpOpen, nullChar), nil
+		}
+		if c2 != ']' {
+			err := l.restore()
+			if err != nil {
+				return nil, err
+			}
+			return newToken(tokenKindInverseBExpOpen, nullChar), nil
+		}
+		err = l.restore()
+		if err != nil {
+			return nil, err
+		}
+		err = l.restore()
+		if err != nil {
+			return nil, err
+		}
 		return newToken(tokenKindBExpOpen, nullChar), nil
 	case ']':
 		return newToken(tokenKindBExpClose, nullChar), nil
@@ -141,7 +200,7 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 		}
 	}
 	switch {
-	case c == '\\' || c == '-' || c == ']':
+	case c == '\\' || c == '^' || c == '-' || c == ']':
 		return newToken(tokenKindChar, c), nil
 	default:
 		return nil, &SyntaxError{
@@ -154,32 +213,57 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 }
 
 func (l *lexer) read() (rune, bool, error) {
+	if l.reachedEOF {
+		return l.lastChar, l.reachedEOF, nil
+	}
+	if l.peekChar1 != nullChar || l.peekEOF1 {
+		l.prevChar2 = l.prevChar1
+		l.prevEOF2 = l.prevEOF1
+		l.prevChar1 = l.lastChar
+		l.prevEOF1 = l.reachedEOF
+		l.lastChar = l.peekChar1
+		l.reachedEOF = l.peekEOF1
+		l.peekChar1 = l.peekChar2
+		l.peekEOF1 = l.peekEOF2
+		l.peekChar2 = nullChar
+		l.peekEOF2 = false
+		return l.lastChar, l.reachedEOF, nil
+	}
 	c, _, err := l.src.ReadRune()
 	if err != nil {
 		if err == io.EOF {
-			l.prevChar = l.lastChar
+			l.prevChar2 = l.prevChar1
+			l.prevEOF2 = l.prevEOF1
+			l.prevChar1 = l.lastChar
+			l.prevEOF1 = l.reachedEOF
 			l.lastChar = nullChar
 			l.reachedEOF = true
-			return nullChar, true, nil
+			return l.lastChar, l.reachedEOF, nil
 		}
 		return nullChar, false, err
 	}
-	l.prevChar = l.lastChar
+	l.prevChar2 = l.prevChar1
+	l.prevEOF2 = l.prevEOF1
+	l.prevChar1 = l.lastChar
+	l.prevEOF1 = l.reachedEOF
 	l.lastChar = c
-	return c, false, nil
+	l.reachedEOF = false
+	return l.lastChar, l.reachedEOF, nil
 }
 
 func (l *lexer) restore() error {
-	if l.reachedEOF {
-		l.lastChar = l.prevChar
-		l.prevChar = nullChar
-		l.reachedEOF = false
-		return l.src.UnreadRune()
-	}
-	if l.lastChar == nullChar {
+	if l.lastChar == nullChar && !l.reachedEOF {
 		return fmt.Errorf("the lexer failed to call restore() because the last character is null")
 	}
-	l.lastChar = l.prevChar
-	l.prevChar = nullChar
-	return l.src.UnreadRune()
+	l.peekChar2 = l.peekChar1
+	l.peekEOF2 = l.peekEOF1
+	l.peekChar1 = l.lastChar
+	l.peekEOF1 = l.reachedEOF
+	l.lastChar = l.prevChar1
+	l.reachedEOF = l.prevEOF1
+	l.prevChar1 = l.prevChar2
+	l.prevEOF1 = l.prevEOF2
+	l.prevChar2 = nullChar
+	l.prevEOF2 = false
+	return nil
 }
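The new peek/prev fields give the lexer two runes of lookahead with two runes of pushback: after reading '[', it must inspect both the next character (is it '^'?) and the one after that (is the '^' immediately followed by ']'?) before choosing between tokenKindBExpOpen and tokenKindInverseBExpOpen. A sketch of exercising this, hypothetical and in-package since everything is unexported (it assumes the lexer's next() method and the token's kind field, as driven by lexer_test.go):

package compiler

import (
	"fmt"
	"strings"
)

// demoCaretLookahead tokenizes two tricky inputs: "[^a]" opens an inverse
// bracket expression, while "[^]" falls back to an ordinary one because the
// caret is the expression's only symbol.
func demoCaretLookahead() {
	for _, src := range []string{"[^a]", "[^]"} {
		l := newLexer(strings.NewReader(src))
		for {
			tok, err := l.next()
			if err != nil || tok.kind == tokenKindEOF {
				break
			}
			fmt.Printf("%q -> %v\n", src, tok.kind)
		}
	}
}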
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index be00317..0dfe2f7 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,7 +31,7 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters",
-			src:     ".*+?|()[-]",
+			src:     ".*+?|()[-][^^]",
 			tokens: []*token{
 				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
@@ -43,12 +43,15 @@ func TestLexer(t *testing.T) {
 				newToken(tokenKindBExpOpen, nullChar),
 				newToken(tokenKindCharRange, nullChar),
 				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindEOF, nullChar),
 			},
 		},
 		{
 			caption: "lexer can recognize the escape sequences",
-			src:     "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\-]",
+			src:     "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\^\\-]",
 			tokens: []*token{
 				newToken(tokenKindChar, '\\'),
 				newToken(tokenKindChar, '.'),
@@ -61,6 +64,7 @@ func TestLexer(t *testing.T) {
 				newToken(tokenKindChar, '['),
 				newToken(tokenKindChar, ']'),
 				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
 				newToken(tokenKindChar, '-'),
 				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindEOF, nullChar),
@@ -94,6 +98,30 @@ func TestLexer(t *testing.T) {
 			},
 		},
 		{
+			caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
+			// [^...^...][^]
+			// ~~   ~    ~~
+			// ^    ^    ^^
+			// |    |    |`-- Ordinary Character (c)
+			// |    |    `-- Bracket Expression
+			// |    `-- Ordinary Character (b)
+			// `-- Inverse Bracket Expression (a)
+			//
+			// a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
+			// b. Caret symbols that appear in the second or any subsequent position are handled as ordinary characters.
+			// c. When a bracket expression contains just one symbol, a caret symbol at the beginning is handled as an ordinary character.
+			src: "[^^][^]",
+			tokens: []*token{
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
 			caption: "lexer raises an error when an invalid escape sequence appears",
 			src:     "\\@",
 			err:     &SyntaxError{},
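In short, the three caret cases these tests pin down tokenize as follows (a summary of the expectations above, not additional test data):

    [^a] -> tokenKindInverseBExpOpen, tokenKindChar('a'), tokenKindBExpClose
    [a^] -> tokenKindBExpOpen, tokenKindChar('a'), tokenKindChar('^'), tokenKindBExpClose
    [^]  -> tokenKindBExpOpen, tokenKindChar('^'), tokenKindBExpClose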
diff --git a/compiler/parser.go b/compiler/parser.go
index cdc4817..d5c7b3d 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -183,6 +183,28 @@ func (p *parser) parseSingleChar() astNode {
 		}
 		return left
 	}
+	if p.consume(tokenKindInverseBExpOpen) {
+		defer p.expect(tokenKindBExpClose)
+		elem := p.parseBExpElem()
+		if elem == nil {
+			raiseSyntaxError("bracket expression must include at least one character")
+		}
+		inverse := exclude(elem, genAnyCharAST())
+		if inverse == nil {
+			panic(fmt.Errorf("a pattern that doesn't match any symbols"))
+		}
+		for {
+			elem := p.parseBExpElem()
+			if elem == nil {
+				break
+			}
+			inverse = exclude(elem, inverse)
+			if inverse == nil {
+				panic(fmt.Errorf("a pattern that doesn't match any symbols"))
+			}
+		}
+		return inverse
+	}
 	return p.parseNormalChar()
 }
 
@@ -228,6 +250,38 @@ func (p *parser) parseNormalChar() astNode {
 	}
 }
 
+func exclude(symbol, base astNode) astNode {
+	switch base.(type) {
+	case *altNode:
+		left, right := base.children()
+		return genAltNode(
+			exclude(symbol, left),
+			exclude(symbol, right),
+		)
+	case *symbolNode:
+		baseSeq := genByteRangeSeq(base)
+		symSeq := genByteRangeSeq(symbol)
+		excluded := excludeByteRangeSequence(symSeq, baseSeq)
+		if len(excluded) <= 0 {
+			return nil
+		}
+		return convertByteRangeSeqsToAST(excluded)
+	}
+	return nil
+}
+
+func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode {
+	concats := []astNode{}
+	for _, seq := range seqs {
+		syms := []astNode{}
+		for _, elem := range seq {
+			syms = append(syms, newRangeSymbolNode(elem.from, elem.to))
+		}
+		concats = append(concats, genConcatNode(syms...))
+	}
+	return genAltNode(concats...)
+}
+
 // References:
 // * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
 // * Table 3-6. UTF-8 Bit Distribution
@@ -363,6 +417,18 @@ func genByteSeq(node astNode) []byte {
 	panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node))
 }
 
+func genByteRangeSeq(node astNode) []byteRange {
+	switch n := node.(type) {
+	case *symbolNode:
+		return []byteRange{{from: n.from, to: n.to}}
+	case *concatNode:
+		seq := genByteRangeSeq(n.left)
+		seq = append(seq, genByteRangeSeq(n.right)...)
+		return seq
+	}
+	panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node))
+}
+
 func isValidOrder(from, to []byte) bool {
 	if len(from) > len(to) {
 		return false
@@ -853,17 +919,36 @@ func gen4ByteCharRangeAST(from, to []byte) astNode {
 	}
 }
 
-func genConcatNode(c1, c2 astNode, cn ...astNode) *concatNode {
-	concat := newConcatNode(c1, c2)
-	for _, c := range cn {
+func genConcatNode(cs ...astNode) astNode {
+	if len(cs) <= 0 {
+		return nil
+	}
+	if len(cs) == 1 {
+		return cs[0]
+	}
+	concat := newConcatNode(cs[0], cs[1])
+	for _, c := range cs[2:] {
 		concat = newConcatNode(concat, c)
 	}
 	return concat
 }
 
-func genAltNode(c1, c2 astNode, cn ...astNode) *altNode {
-	alt := newAltNode(c1, c2)
-	for _, c := range cn {
+func genAltNode(cs ...astNode) astNode {
+	nonNilNodes := []astNode{}
+	for _, c := range cs {
+		if c == nil {
+			continue
+		}
+		nonNilNodes = append(nonNilNodes, c)
+	}
+	if len(nonNilNodes) <= 0 {
+		return nil
+	}
+	if len(nonNilNodes) == 1 {
+		return nonNilNodes[0]
+	}
+	alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+	for _, c := range nonNilNodes[2:] {
 		alt = newAltNode(alt, c)
 	}
 	return alt
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index bdf5f03..0c9f720 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -136,18 +136,23 @@ func TestLexer_Next(t *testing.T) {
 		lspec: &spec.LexSpec{
 			Entries: []*spec.LexEntry{
 				// all 1 byte characters
-				spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"),
+				//
+				// NOTE:
+				// maleeni cannot handle the null character in patterns because compiler.lexer,
+				// specifically read() and restore(), treats the null character as the absence of a symbol.
+				// There is room for improvement in this behavior of the lexer.
+				spec.NewLexEntry("1ByteChar", "[\x01-\x7f]"),
 			},
 		},
 		src: string([]byte{
-			0x00,
 			0x01,
+			0x02,
 			0x7e,
 			0x7f,
 		}),
 		tokens: []*Token{
-			newToken(1, "1ByteChar", []byte{0x00}),
 			newToken(1, "1ByteChar", []byte{0x01}),
+			newToken(1, "1ByteChar", []byte{0x02}),
 			newToken(1, "1ByteChar", []byte{0x7e}),
 			newToken(1, "1ByteChar", []byte{0x7f}),
 			newEOFToken(),
@@ -391,6 +396,18 @@ func TestLexer_Next(t *testing.T) {
 			newEOFToken(),
 		},
 	},
+	{
+		lspec: &spec.LexSpec{
+			Entries: []*spec.LexEntry{
+				spec.NewLexEntry("NonNumber", "[^0-9]+[0-9]"),
+			},
+		},
+		src: "foo9",
+		tokens: []*Token{
+			newToken(1, "NonNumber", []byte("foo9")),
+			newEOFToken(),
+		},
+	},
 }
 for _, tt := range test {
 	clspec, err := compiler.Compile(tt.lspec)
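Putting the parser change together: an inverse bracket expression is compiled by subtracting each element from the "any character" AST. A hypothetical in-package sketch of what exclude() produces for a pattern like [^0-9] (it assumes genAnyCharAST() builds an alternation over the four UTF-8 byte-sequence forms, as the surrounding code suggests):

package compiler

import "fmt"

// demoInverseBExp subtracts the one-byte element 0-9 from the any-char AST.
// On the 1-byte branch (0x00-0x7f) the subtraction leaves
// 0x00-0x2f | 0x3a-0x7f; the 2-4 byte branches pass through unchanged,
// because excludeByteRangeSequence treats sequences of different lengths
// as non-overlapping.
func demoInverseBExp() {
	digits := newRangeSymbolNode(0x30, 0x39)
	inverse := exclude(digits, genAnyCharAST())
	fmt.Println(inverse) // astNode embeds fmt.Stringer, so this prints the tree
}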