aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/dfa/tree.go2
-rw-r--r--utf8/utf8.go454
-rw-r--r--utf8/utf8_test.go181
3 files changed, 266 insertions, 371 deletions
diff --git a/compiler/dfa/tree.go b/compiler/dfa/tree.go
index ab75eae..a3442f8 100644
--- a/compiler/dfa/tree.go
+++ b/compiler/dfa/tree.go
@@ -512,7 +512,7 @@ func ConvertCPTreeToByteTree(cpTrees map[spec.LexModeKindID]parser.CPTree) (byte
func convCPTreeToByteTree(cpTree parser.CPTree) (byteTree, error) {
if from, to, ok := cpTree.Range(); ok {
- bs, err := utf8.GenCharBlocks([]byte(string(from)), []byte(string(to)))
+ bs, err := utf8.GenCharBlocks(from, to)
if err != nil {
return nil, err
}
diff --git a/utf8/utf8.go b/utf8/utf8.go
index 16b3746..4f52bd4 100644
--- a/utf8/utf8.go
+++ b/utf8/utf8.go
@@ -1,398 +1,112 @@
package utf8
-import "fmt"
+import (
+ "fmt"
+ "strings"
+)
type CharBlock struct {
From []byte
To []byte
}
-// Refelences:
-// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
-// * Table 3-6. UTF-8 Bit Distribution
-// * Table 3-7. Well-Formed UTF-8 Byte Sequences
-var (
- // 1 byte character:
- // * <00..7F>
- cBlks1 = []*CharBlock{
- {
- From: []byte{0x00},
- To: []byte{0x7f},
- },
+func (b *CharBlock) String() string {
+ var s strings.Builder
+ fmt.Fprint(&s, "<")
+ fmt.Fprintf(&s, "%X", b.From[0])
+ for i := 1; i < len(b.From); i++ {
+ fmt.Fprintf(&s, " %X", b.From[i])
}
-
- // 2 bytes character:
- // * <C2..DF 80..BF>
- cBlks2 = []*CharBlock{
- {
- From: []byte{0xc2, 0x80},
- To: []byte{0xdf, 0xbf},
- },
- }
-
- // 3 bytes character:
- // * <E0 A0..BF 80..BF>
- // * <E1..EC 80..BF 80..BF>
- // * <ED 80..9F 80..BF>
- // * <EE..EF 80..BF 80..BF>
- cBlks3 = []*CharBlock{
- {
- From: []byte{0xe0, 0xa0, 0x80},
- To: []byte{0xe0, 0xbf, 0xbf},
- },
- {
- From: []byte{0xe1, 0x80, 0x80},
- To: []byte{0xec, 0xbf, 0xbf},
- },
- {
- From: []byte{0xed, 0x80, 0x80},
- To: []byte{0xed, 0x9f, 0xbf},
- },
- {
- From: []byte{0xee, 0x80, 0x80},
- To: []byte{0xef, 0xbf, 0xbf},
- },
+ fmt.Fprint(&s, "..")
+ fmt.Fprintf(&s, "%X", b.To[0])
+ for i := 1; i < len(b.To); i++ {
+ fmt.Fprintf(&s, " %X", b.To[i])
}
+ fmt.Fprint(&s, ">")
+ return s.String()
+}
- // 4 bytes character:
- // * <F0 90..BF 80..BF 80..BF>
- // * <F1..F3 80..BF 80..BF 80..BF>
- // * <F4 80..8F 80..BF 80..BF>
- cBlks4 = []*CharBlock{
- {
- From: []byte{0xf0, 0x90, 0x80, 0x80},
- To: []byte{0xf0, 0xbf, 0xbf, 0xbf},
- },
- {
- From: []byte{0xf1, 0x80, 0x80, 0x80},
- To: []byte{0xf3, 0xbf, 0xbf, 0xbf},
- },
- {
- From: []byte{0xf4, 0x80, 0x80, 0x80},
- To: []byte{0xf4, 0x8f, 0xbf, 0xbf},
- },
+func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
+ rs, err := splitCodePoint(from, to)
+ if err != nil {
+ return nil, err
}
- cBlk1Head = cBlks1[0]
- cBlk1Last = cBlks1[len(cBlks1)-1]
- cBlk2Head = cBlks2[0]
- cBlk2Last = cBlks2[len(cBlks2)-1]
- cBlk3Head = cBlks3[0]
- cBlk3Last = cBlks3[len(cBlks3)-1]
- cBlk4Head = cBlks4[0]
- cBlk4Last = cBlks4[len(cBlks4)-1]
-)
-
-func GenCharBlocks(from, to []byte) ([]*CharBlock, error) {
- switch len(from) {
- case 1:
- switch len(to) {
- case 1:
- return genCharBlocks1(from, to), nil
- case 2:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...)
- alt = append(alt, genCharBlocks2(cBlk2Head.From, to)...)
- return alt, nil
- case 3:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...)
- alt = append(alt, genCharBlocks2(cBlk2Head.From, cBlk2Last.To)...)
- alt = append(alt, genCharBlocks3(cBlk3Head.From, to)...)
- return alt, nil
- case 4:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...)
- alt = append(alt, genCharBlocks2(cBlk2Head.From, cBlk2Last.To)...)
- alt = append(alt, genCharBlocks3(cBlk3Head.From, cBlk3Last.To)...)
- alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...)
- return alt, nil
- }
- case 2:
- switch len(to) {
- case 2:
- return genCharBlocks2(from, to), nil
- case 3:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks2(from, cBlk2Last.To)...)
- alt = append(alt, genCharBlocks3(cBlk3Head.From, to)...)
- return alt, nil
- case 4:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks2(from, cBlk2Last.To)...)
- alt = append(alt, genCharBlocks3(cBlk3Head.From, cBlk3Last.To)...)
- alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...)
- return alt, nil
+ blks := make([]*CharBlock, len(rs))
+ for i, r := range rs {
+ blks[i] = &CharBlock{
+ From: []byte(string(r.from)),
+ To: []byte(string(r.to)),
}
- case 3:
- switch len(to) {
- case 3:
- return genCharBlocks3(from, to), nil
- case 4:
- var alt []*CharBlock
- alt = append(alt, genCharBlocks3(from, cBlk3Last.To)...)
- alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...)
- return alt, nil
- }
- case 4:
- return genCharBlocks4(from, to), nil
}
- return nil, fmt.Errorf("invalid range; From: %v, To: %v", from, to)
-}
-func genCharBlocks1(from, to []byte) []*CharBlock {
- return []*CharBlock{
- {From: from, To: to},
- }
+ return blks, nil
}
-func genCharBlocks2(from, to []byte) []*CharBlock {
- switch {
- case from[0] == to[0]:
- return []*CharBlock{
- {From: from, To: to},
- }
- default:
- return []*CharBlock{
- {From: from, To: []byte{from[0], cBlks2[0].To[1]}},
- {From: []byte{from[0] + 1, cBlks2[0].From[1]}, To: to},
- }
- }
+type cpRange struct {
+ from rune
+ to rune
}
-func genCharBlocks3(from, to []byte) []*CharBlock {
- switch {
- case from[0] == to[0] && from[1] == to[1]:
- return []*CharBlock{
- {From: from, To: to},
- }
- case from[0] == to[0]:
- _, fromBlk := findCharBlock3(from)
- var alt []*CharBlock
- alt = append(alt, &CharBlock{
- From: from,
- To: []byte{from[0], from[1], fromBlk.To[2]},
- })
- if from[1]+1 < to[1] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1] + 1, fromBlk.From[2]},
- To: []byte{from[0], to[1] - 1, fromBlk.To[2]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{from[0], to[1], fromBlk.From[2]},
- To: to,
- })
- return alt
- default:
- fromBlkNum, fromBlk := findCharBlock3(from)
- toBlkNum, toBlk := findCharBlock3(to)
- var alt []*CharBlock
- alt = append(alt, &CharBlock{
- From: from,
- To: []byte{from[0], from[1], fromBlk.To[2]},
- })
- if from[1] < fromBlk.To[1] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1] + 1, fromBlk.From[2]},
- To: []byte{from[0], fromBlk.To[1], fromBlk.To[2]},
- })
- }
- if fromBlkNum == toBlkNum {
- if from[0]+1 < to[0] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2]},
- To: []byte{to[0] - 1, fromBlk.To[1], fromBlk.To[2]},
- })
- }
- if to[1] > fromBlk.From[1] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], fromBlk.From[1], fromBlk.From[2]},
- To: []byte{to[0], to[1] - 1, fromBlk.To[2]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], fromBlk.From[2]},
- To: to,
- })
- return alt
- }
- for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ {
- fromBlk := cBlks3[blkNum]
- alt = append(alt, &CharBlock{
- From: fromBlk.From,
- To: fromBlk.To,
- })
- }
- if to[0] > toBlk.From[0] {
- alt = append(alt, &CharBlock{
- From: toBlk.From,
- To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2]},
- })
- }
- if to[1] > toBlk.From[1] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], toBlk.From[1], toBlk.From[2]},
- To: []byte{to[0], to[1] - 1, toBlk.To[2]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], toBlk.From[2]},
- To: to,
- })
- return alt
+// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that
+// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function
+// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on
+// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>,
+// and <U+0080..U+07FF> is encoded <C2 80..DF BF>).
+//
+// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are
+// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
+// However, when `from` or `to` itself is the surrogate code point, this function returns an error.
+func splitCodePoint(from, to rune) ([]*cpRange, error) {
+ if from > to {
+ return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to)
}
-}
-
-func genCharBlocks4(from, to []byte) []*CharBlock {
- switch {
- case from[0] == to[0] && from[1] == to[1] && from[2] == to[2]:
- return []*CharBlock{
- {
- From: from,
- To: to,
- },
- }
- case from[0] == to[0] && from[1] == to[1]:
- _, fromBlk := findCharBlock4(from)
- var alt []*CharBlock
- alt = append(alt, &CharBlock{
- From: from,
- To: []byte{to[0], to[1], from[2], fromBlk.To[3]},
- })
- if from[2]+1 < to[2] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1], from[2] + 2, fromBlk.From[3]},
- To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1], to[2], fromBlk.From[3]},
- To: []byte{to[0], to[1], to[2], to[3]},
- })
- return alt
- case from[0] == to[0]:
- _, fromBlk := findCharBlock4(from)
- var alt []*CharBlock
- alt = append(alt, &CharBlock{
- From: from,
- To: []byte{to[0], from[1], from[2], fromBlk.To[3]},
- })
- if from[2] < fromBlk.To[2] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]},
- To: []byte{to[0], from[1], fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if from[1]+1 < to[1] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]},
- To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if to[2] > fromBlk.From[2] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], to[1], fromBlk.From[2], fromBlk.From[3]},
- To: []byte{from[0], to[1], to[2] - 1, fromBlk.To[3]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{from[0], to[1], to[2], fromBlk.From[3]},
- To: to,
- })
- return alt
- default:
- fromBlkNum, fromBlk := findCharBlock4(from)
- toBlkNum, toBlk := findCharBlock4(to)
- var alt []*CharBlock
- alt = append(alt, &CharBlock{
- From: from,
- To: []byte{from[0], from[1], from[2], fromBlk.To[3]},
- })
- if from[2] < fromBlk.To[2] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]},
- To: []byte{from[0], from[1], fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if from[1] < fromBlk.To[1] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]},
- To: []byte{from[0], fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if fromBlkNum == toBlkNum {
- if from[0]+1 < to[0] {
- alt = append(alt, &CharBlock{
- From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]},
- To: []byte{from[0] - 1, fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if to[1] > fromBlk.From[1] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]},
- To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]},
- })
- }
- if to[2] > fromBlk.From[2] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], fromBlk.From[2], fromBlk.From[3]},
- To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], to[2], fromBlk.From[3]},
- To: to,
- })
- return alt
- }
- for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ {
- blk := cBlks4[blkNum]
- alt = append(alt, &CharBlock{
- From: blk.From,
- To: blk.To,
- })
- }
- if to[0] > toBlk.From[0] {
- alt = append(alt, &CharBlock{
- From: toBlk.From,
- To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2], toBlk.To[3]},
- })
- }
- if to[1] > toBlk.From[1] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], toBlk.From[1], toBlk.From[2], toBlk.From[3]},
- To: []byte{to[0], to[1] - 1, toBlk.To[2], toBlk.To[3]},
- })
- }
- if to[2] > toBlk.From[2] {
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], toBlk.From[2], toBlk.From[3]},
- To: []byte{to[0], to[1], to[2] - 1, toBlk.To[3]},
- })
- }
- alt = append(alt, &CharBlock{
- From: []byte{to[0], to[1], to[2], toBlk.From[3]},
- To: to,
- })
- return alt
+ if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
+ return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to)
+ }
+ // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92
+ // > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise
+ // > map to code points U+D800..U+DFFF is ill-formed.
+ if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
+ return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to)
}
-}
-func findCharBlock3(c []byte) (int, *CharBlock) {
- for i, blk := range cBlks3 {
- if c[0] >= blk.From[0] && c[0] <= blk.To[0] {
- return i, blk
- }
+ in := &cpRange{
+ from: from,
+ to: to,
}
- return 0, nil
-}
+ var rs []*cpRange
+ for in.from <= in.to {
+ r := &cpRange{
+ from: in.from,
+ to: in.to,
+ }
+ // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7. Well-Formed UTF-8 Byte Sequences
+ switch {
+ case in.from <= 0x007f && in.to > 0x007f:
+ r.to = 0x007f
+ case in.from <= 0x07ff && in.to > 0x07ff:
+ r.to = 0x07ff
+ case in.from <= 0x0fff && in.to > 0x0fff:
+ r.to = 0x0fff
+ case in.from <= 0xcfff && in.to > 0xcfff:
+ r.to = 0xcfff
+ case in.from <= 0xd7ff && in.to > 0xd7ff:
+ r.to = 0xd7ff
+ case in.from <= 0xffff && in.to > 0xffff:
+ r.to = 0xffff
+ case in.from <= 0x3ffff && in.to > 0x3ffff:
+ r.to = 0x3ffff
+ case in.from <= 0xfffff && in.to > 0xfffff:
+ r.to = 0xfffff
+ }
+ rs = append(rs, r)
+ in.from = r.to + 1
-func findCharBlock4(c []byte) (int, *CharBlock) {
- for i, blk := range cBlks4 {
- if c[0] >= blk.From[0] && c[0] <= blk.To[0] {
- return i, blk
+ // Skip surrogate code points U+D800..U+DFFF.
+ if in.from >= 0xd800 && in.from <= 0xdfff {
+ in.from = 0xe000
}
}
- return 0, nil
+ return rs, nil
}
diff --git a/utf8/utf8_test.go b/utf8/utf8_test.go
new file mode 100644
index 0000000..2dc8093
--- /dev/null
+++ b/utf8/utf8_test.go
@@ -0,0 +1,181 @@
+package utf8
+
+import (
+ "fmt"
+ "testing"
+)
+
+func TestGenCharBlocks_WellFormed(t *testing.T) {
+ cBlk := func(from []byte, to []byte) *CharBlock {
+ return &CharBlock{
+ From: from,
+ To: to,
+ }
+ }
+
+ seq := func(b ...byte) []byte {
+ return b
+ }
+
+ tests := []struct {
+ from rune
+ to rune
+ blocks []*CharBlock
+ }{
+ {
+ from: '\u0000',
+ to: '\u007f',
+ blocks: []*CharBlock{
+ cBlk(seq(0x00), seq(0x7f)),
+ },
+ },
+ {
+ from: '\u0080',
+ to: '\u07ff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+ },
+ },
+ {
+ from: '\u0800',
+ to: '\u0fff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\u1000',
+ to: '\ucfff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\ud000',
+ to: '\ud7ff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
+ },
+ },
+ {
+ from: '\ue000',
+ to: '\uffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\U00010000',
+ to: '\U0003ffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\U00040000',
+ to: '\U000fffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\U00100000',
+ to: '\U0010ffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
+ },
+ },
+ {
+ from: '\u0000',
+ to: '\U0010ffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0x00), seq(0x7f)),
+ cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+ cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
+ cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
+ cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
+ cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
+ cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
+ cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
+ cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
+ blks, err := GenCharBlocks(tt.from, tt.to)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(blks) != len(tt.blocks) {
+ t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
+ }
+ for i, blk := range blks {
+ if len(blk.From) != len(tt.blocks[i].From) || len(blk.To) != len(tt.blocks[i].To) {
+ t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
+ }
+ for j := 0; j < len(blk.From); j++ {
+ if blk.From[j] != tt.blocks[i].From[j] || blk.To[j] != tt.blocks[i].To[j] {
+ t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
+ }
+ }
+ }
+ })
+ }
+}
+
+func TestGenCharBlocks_IllFormed(t *testing.T) {
+ tests := []struct {
+ from rune
+ to rune
+ }{
+ {
+ // from > to
+ from: '\u0001',
+ to: '\u0000',
+ },
+ {
+ from: -1, // <U+0000
+ to: '\u0000',
+ },
+ {
+ from: '\u0000',
+ to: -1, // <U+0000
+ },
+ {
+ from: 0x110000, // >U+10FFFF
+ to: '\u0000',
+ },
+ {
+ from: '\u0000',
+ to: 0x110000, // >U+10FFFF
+ },
+ {
+ from: 0xd800, // U+D800 (surrogate code point)
+ to: '\ue000',
+ },
+ {
+ from: 0xdfff, // U+DFFF (surrogate code point)
+ to: '\ue000',
+ },
+ {
+ from: '\ucfff',
+ to: 0xd800, // U+D800 (surrogate code point)
+ },
+ {
+ from: '\ucfff',
+ to: 0xdfff, // U+DFFF (surrogate code point)
+ },
+ }
+ for _, tt := range tests {
+ t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
+ blks, err := GenCharBlocks(tt.from, tt.to)
+ if err == nil {
+ t.Fatal("expected error didn't occur")
+ }
+ if blks != nil {
+ t.Fatal("character blocks must be nil")
+ }
+ })
+ }
+}