aboutsummaryrefslogtreecommitdiff
path: root/utf8
diff options
context:
space:
mode:
Diffstat (limited to 'utf8')
-rw-r--r--utf8/utf8.go407
1 files changed, 407 insertions, 0 deletions
diff --git a/utf8/utf8.go b/utf8/utf8.go
new file mode 100644
index 0000000..79ca1de
--- /dev/null
+++ b/utf8/utf8.go
@@ -0,0 +1,407 @@
+package utf8
+
+import "fmt"
+
+// CharBlock describes a set of equal-length byte sequences by its two corner
+// sequences. In the tables of this package each byte position is an
+// independent inclusive range, e.g. From = {0xc2, 0x80} and To = {0xdf, 0xbf}
+// stand for <C2..DF 80..BF>.
+type CharBlock struct {
+	// From holds the lowest byte of every position and To the highest.
+	From, To []byte
+}
+
+// References:
+// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
+//   * Table 3-6. UTF-8 Bit Distribution
+//   * Table 3-7. Well-Formed UTF-8 Byte Sequences
+var (
+	// 1 byte character:
+	// * <00..7F>
+	cBlks1 = []*CharBlock{
+		{From: []byte{0x00}, To: []byte{0x7f}},
+	}
+
+	// 2 bytes character:
+	// * <C2..DF 80..BF>
+	cBlks2 = []*CharBlock{
+		{From: []byte{0xc2, 0x80}, To: []byte{0xdf, 0xbf}},
+	}
+
+	// 3 bytes character:
+	// * <E0 A0..BF 80..BF>
+	// * <E1..EC 80..BF 80..BF>
+	// * <ED 80..9F 80..BF>
+	// * <EE..EF 80..BF 80..BF>
+	cBlks3 = []*CharBlock{
+		{From: []byte{0xe0, 0xa0, 0x80}, To: []byte{0xe0, 0xbf, 0xbf}},
+		{From: []byte{0xe1, 0x80, 0x80}, To: []byte{0xec, 0xbf, 0xbf}},
+		{From: []byte{0xed, 0x80, 0x80}, To: []byte{0xed, 0x9f, 0xbf}},
+		{From: []byte{0xee, 0x80, 0x80}, To: []byte{0xef, 0xbf, 0xbf}},
+	}
+
+	// 4 bytes character:
+	// * <F0 90..BF 80..BF 80..BF>
+	// * <F1..F3 80..BF 80..BF 80..BF>
+	// * <F4 80..8F 80..BF 80..BF>
+	cBlks4 = []*CharBlock{
+		{From: []byte{0xf0, 0x90, 0x80, 0x80}, To: []byte{0xf0, 0xbf, 0xbf, 0xbf}},
+		{From: []byte{0xf1, 0x80, 0x80, 0x80}, To: []byte{0xf3, 0xbf, 0xbf, 0xbf}},
+		{From: []byte{0xf4, 0x80, 0x80, 0x80}, To: []byte{0xf4, 0x8f, 0xbf, 0xbf}},
+	}
+
+	// First and last entry of each table; they supply the lowest and highest
+	// sequence of each byte length when a range spans several lengths.
+	cBlk1Head, cBlk1Last = cBlks1[0], cBlks1[len(cBlks1)-1]
+	cBlk2Head, cBlk2Last = cBlks2[0], cBlks2[len(cBlks2)-1]
+	cBlk3Head, cBlk3Last = cBlks3[0], cBlks3[len(cBlks3)-1]
+	cBlk4Head, cBlk4Last = cBlks4[0], cBlks4[len(cBlks4)-1]
+)
+
+// AllCharBlocks returns a newly allocated slice holding the entries of the
+// 1-, 2-, 3- and 4-byte tables, in that order. The slice is fresh, but the
+// *CharBlock elements are the table entries themselves, not copies.
+func AllCharBlocks() []*CharBlock {
+	var all []*CharBlock
+	for _, tbl := range [][]*CharBlock{cBlks1, cBlks2, cBlks3, cBlks4} {
+		all = append(all, tbl...)
+	}
+	return all
+}
+
+// GenCharBlocks returns the CharBlocks that together cover the byte sequences
+// from from to to, both inclusive.
+//
+// from and to must each be 1 to 4 bytes long and from must not be longer than
+// to; otherwise an error is returned. When the two lengths differ, the result
+// is stitched together from the rest of from's byte length, every byte length
+// strictly in between in full, and the start of to's byte length.
+//
+// Only the lengths are validated. The byte values are passed on unchecked, so
+// the caller is expected to supply well-formed sequences with from <= to.
+func GenCharBlocks(from, to []byte) ([]*CharBlock, error) {
+	// Checking both lengths up front also rejects a 4-byte from paired with a
+	// shorter or longer to, which would otherwise reach genCharBlocks4 and
+	// index past the end of to.
+	if len(from) < 1 || len(to) > 4 || len(from) > len(to) {
+		return nil, fmt.Errorf("invalid range; From: %v, To: %v", from, to)
+	}
+
+	// Per byte length: the lowest and highest table entries and the generator.
+	classes := [...]struct {
+		head, last *CharBlock
+		gen        func(from, to []byte) []*CharBlock
+	}{
+		{cBlk1Head, cBlk1Last, genCharBlocks1},
+		{cBlk2Head, cBlk2Last, genCharBlocks2},
+		{cBlk3Head, cBlk3Last, genCharBlocks3},
+		{cBlk4Head, cBlk4Last, genCharBlocks4},
+	}
+
+	var blks []*CharBlock
+	for n := len(from); n <= len(to); n++ {
+		c := classes[n-1]
+		// Default to the whole byte length, then clamp the ends that belong
+		// to the requested range.
+		lo, hi := c.head.From, c.last.To
+		if n == len(from) {
+			lo = from
+		}
+		if n == len(to) {
+			hi = to
+		}
+		blks = append(blks, c.gen(lo, hi)...)
+	}
+	return blks, nil
+}
+
+// genCharBlocks1 wraps from and to in a single CharBlock. The slices are
+// stored as given, without copying or validation.
+func genCharBlocks1(from, to []byte) []*CharBlock {
+	blk := &CharBlock{From: from, To: to}
+	return []*CharBlock{blk}
+}
+
+// genCharBlocks2 splits the range of 2-byte sequences from..to (inclusive)
+// into CharBlocks whose byte positions can each be read as an independent
+// range.
+//
+// from and to must both be 2 bytes long and from must not sort after to;
+// neither is checked here.
+func genCharBlocks2(from, to []byte) []*CharBlock {
+	if from[0] == to[0] {
+		// Same lead byte: only the trailing byte varies.
+		return []*CharBlock{
+			{From: from, To: to},
+		}
+	}
+
+	// Full range of the trailing byte, taken from the 2-byte table.
+	lo, hi := cBlks2[0].From[1], cBlks2[0].To[1]
+
+	// Rest of from's lead byte.
+	blks := []*CharBlock{
+		{From: from, To: []byte{from[0], hi}},
+	}
+	// Lead bytes strictly between from's and to's, each with the full
+	// trailing range. Emitting these together with to's lead byte as one block
+	// would cap their trailing byte at to[1].
+	if from[0]+1 < to[0] {
+		blks = append(blks, &CharBlock{
+			From: []byte{from[0] + 1, lo},
+			To:   []byte{to[0] - 1, hi},
+		})
+	}
+	// to's lead byte, up to to itself.
+	blks = append(blks, &CharBlock{
+		From: []byte{to[0], lo},
+		To:   to,
+	})
+	return blks
+}
+
+func genCharBlocks3(from, to []byte) []*CharBlock {
+ switch {
+ case from[0] == to[0] && from[1] == to[1]:
+ return []*CharBlock{
+ {From: from, To: to},
+ }
+ case from[0] == to[0]:
+ _, fromBlk := findCharBlock3(from)
+ var alt []*CharBlock
+ alt = append(alt, &CharBlock{
+ From: from,
+ To: []byte{from[0], from[1], fromBlk.To[2]},
+ })
+ if from[1]+1 < to[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1] + 1, fromBlk.From[2]},
+ To: []byte{from[0], to[1] - 1, fromBlk.To[2]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], to[1], fromBlk.From[2]},
+ To: to,
+ })
+ return alt
+ default:
+ fromBlkNum, fromBlk := findCharBlock3(from)
+ toBlkNum, toBlk := findCharBlock3(to)
+ var alt []*CharBlock
+ alt = append(alt, &CharBlock{
+ From: from,
+ To: []byte{from[0], from[1], fromBlk.To[2]},
+ })
+ if from[1] < fromBlk.To[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1] + 1, fromBlk.From[2]},
+ To: []byte{from[0], fromBlk.To[1], fromBlk.To[2]},
+ })
+ }
+ if fromBlkNum == toBlkNum {
+ if from[0]+1 < to[0] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2]},
+ To: []byte{to[0] - 1, fromBlk.To[1], fromBlk.To[2]},
+ })
+ }
+ if to[1] > fromBlk.From[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], fromBlk.From[1], fromBlk.From[2]},
+ To: []byte{to[0], to[1] - 1, fromBlk.To[2]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], fromBlk.From[2]},
+ To: to,
+ })
+ return alt
+ }
+ for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ {
+ fromBlk := cBlks3[blkNum]
+ alt = append(alt, &CharBlock{
+ From: fromBlk.From,
+ To: fromBlk.To,
+ })
+ }
+ if to[0] > toBlk.From[0] {
+ alt = append(alt, &CharBlock{
+ From: toBlk.From,
+ To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2]},
+ })
+ }
+ if to[1] > toBlk.From[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], toBlk.From[1], toBlk.From[2]},
+ To: []byte{to[0], to[1] - 1, toBlk.To[2]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], toBlk.From[2]},
+ To: to,
+ })
+ return alt
+ }
+}
+
+func genCharBlocks4(from, to []byte) []*CharBlock {
+ switch {
+ case from[0] == to[0] && from[1] == to[1] && from[2] == to[2]:
+ return []*CharBlock{
+ {
+ From: from,
+ To: to,
+ },
+ }
+ case from[0] == to[0] && from[1] == to[1]:
+ _, fromBlk := findCharBlock4(from)
+ var alt []*CharBlock
+ alt = append(alt, &CharBlock{
+ From: from,
+ To: []byte{to[0], to[1], from[2], fromBlk.To[3]},
+ })
+ if from[2]+1 < to[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1], from[2] + 2, fromBlk.From[3]},
+ To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1], to[2], fromBlk.From[3]},
+ To: []byte{to[0], to[1], to[2], to[3]},
+ })
+ return alt
+ case from[0] == to[0]:
+ _, fromBlk := findCharBlock4(from)
+ var alt []*CharBlock
+ alt = append(alt, &CharBlock{
+ From: from,
+ To: []byte{to[0], from[1], from[2], fromBlk.To[3]},
+ })
+ if from[2] < fromBlk.To[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]},
+ To: []byte{to[0], from[1], fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if from[1]+1 < to[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if to[2] > fromBlk.From[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], to[1], fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{from[0], to[1], to[2] - 1, fromBlk.To[3]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], to[1], to[2], fromBlk.From[3]},
+ To: to,
+ })
+ return alt
+ default:
+ fromBlkNum, fromBlk := findCharBlock4(from)
+ toBlkNum, toBlk := findCharBlock4(to)
+ var alt []*CharBlock
+ alt = append(alt, &CharBlock{
+ From: from,
+ To: []byte{from[0], from[1], from[2], fromBlk.To[3]},
+ })
+ if from[2] < fromBlk.To[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]},
+ To: []byte{from[0], from[1], fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if from[1] < fromBlk.To[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{from[0], fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if fromBlkNum == toBlkNum {
+ if from[0]+1 < to[0] {
+ alt = append(alt, &CharBlock{
+ From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{from[0] - 1, fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if to[1] > fromBlk.From[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]},
+ })
+ }
+ if to[2] > fromBlk.From[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], fromBlk.From[2], fromBlk.From[3]},
+ To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], to[2], fromBlk.From[3]},
+ To: to,
+ })
+ return alt
+ }
+ for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ {
+ blk := cBlks4[blkNum]
+ alt = append(alt, &CharBlock{
+ From: blk.From,
+ To: blk.To,
+ })
+ }
+ if to[0] > toBlk.From[0] {
+ alt = append(alt, &CharBlock{
+ From: toBlk.From,
+ To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2], toBlk.To[3]},
+ })
+ }
+ if to[1] > toBlk.From[1] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], toBlk.From[1], toBlk.From[2], toBlk.From[3]},
+ To: []byte{to[0], to[1] - 1, toBlk.To[2], toBlk.To[3]},
+ })
+ }
+ if to[2] > toBlk.From[2] {
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], toBlk.From[2], toBlk.From[3]},
+ To: []byte{to[0], to[1], to[2] - 1, toBlk.To[3]},
+ })
+ }
+ alt = append(alt, &CharBlock{
+ From: []byte{to[0], to[1], to[2], toBlk.From[3]},
+ To: to,
+ })
+ return alt
+ }
+}
+
+// findCharBlock3 looks up the cBlks3 entry whose lead-byte range contains
+// c[0] and returns its index together with the entry. Only the first byte of
+// c is inspected. When no entry matches it returns (0, nil), so the block
+// pointer, not the index, tells callers whether the lookup succeeded.
+func findCharBlock3(c []byte) (int, *CharBlock) {
+	lead := c[0]
+	for i := 0; i < len(cBlks3); i++ {
+		if blk := cBlks3[i]; blk.From[0] <= lead && lead <= blk.To[0] {
+			return i, blk
+		}
+	}
+	return 0, nil
+}
+
+// findCharBlock4 looks up the cBlks4 entry whose lead-byte range contains
+// c[0] and returns its index together with the entry. Only the first byte of
+// c is inspected. When no entry matches it returns (0, nil), so the block
+// pointer, not the index, tells callers whether the lookup succeeded.
+func findCharBlock4(c []byte) (int, *CharBlock) {
+	lead := c[0]
+	for i := 0; i < len(cBlks4); i++ {
+		if blk := cBlks4[i]; blk.From[0] <= lead && lead <= blk.To[0] {
+			return i, blk
+		}
+	}
+	return 0, nil
+}