aboutsummaryrefslogtreecommitdiff
path: root/utf8/utf8.go
diff options
context:
space:
mode:
Diffstat (limited to 'utf8/utf8.go')
-rw-r--r--utf8/utf8.go112
1 files changed, 112 insertions, 0 deletions
diff --git a/utf8/utf8.go b/utf8/utf8.go
new file mode 100644
index 0000000..4f52bd4
--- /dev/null
+++ b/utf8/utf8.go
@@ -0,0 +1,112 @@
+package utf8
+
+import (
+ "fmt"
+ "strings"
+)
+
+type CharBlock struct {
+ From []byte
+ To []byte
+}
+
+func (b *CharBlock) String() string {
+ var s strings.Builder
+ fmt.Fprint(&s, "<")
+ fmt.Fprintf(&s, "%X", b.From[0])
+ for i := 1; i < len(b.From); i++ {
+ fmt.Fprintf(&s, " %X", b.From[i])
+ }
+ fmt.Fprint(&s, "..")
+ fmt.Fprintf(&s, "%X", b.To[0])
+ for i := 1; i < len(b.To); i++ {
+ fmt.Fprintf(&s, " %X", b.To[i])
+ }
+ fmt.Fprint(&s, ">")
+ return s.String()
+}
+
+func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
+ rs, err := splitCodePoint(from, to)
+ if err != nil {
+ return nil, err
+ }
+
+ blks := make([]*CharBlock, len(rs))
+ for i, r := range rs {
+ blks[i] = &CharBlock{
+ From: []byte(string(r.from)),
+ To: []byte(string(r.to)),
+ }
+ }
+
+ return blks, nil
+}
+
+type cpRange struct {
+ from rune
+ to rune
+}
+
+// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that
+// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function
+// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on
+// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>,
+// and <U+0080..U+07FF> is encoded <C2 80..DF BF>).
+//
+// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are
+// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
+// However, when `from` or `to` itself is the surrogate code point, this function returns an error.
+func splitCodePoint(from, to rune) ([]*cpRange, error) {
+ if from > to {
+ return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to)
+ }
+ if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
+ return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to)
+ }
+ // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92
+ // > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise
+ // > map to code points U+D800..U+DFFF is ill-formed.
+ if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
+ return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to)
+ }
+
+ in := &cpRange{
+ from: from,
+ to: to,
+ }
+ var rs []*cpRange
+ for in.from <= in.to {
+ r := &cpRange{
+ from: in.from,
+ to: in.to,
+ }
+ // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7. Well-Formed UTF-8 Byte Sequences
+ switch {
+ case in.from <= 0x007f && in.to > 0x007f:
+ r.to = 0x007f
+ case in.from <= 0x07ff && in.to > 0x07ff:
+ r.to = 0x07ff
+ case in.from <= 0x0fff && in.to > 0x0fff:
+ r.to = 0x0fff
+ case in.from <= 0xcfff && in.to > 0xcfff:
+ r.to = 0xcfff
+ case in.from <= 0xd7ff && in.to > 0xd7ff:
+ r.to = 0xd7ff
+ case in.from <= 0xffff && in.to > 0xffff:
+ r.to = 0xffff
+ case in.from <= 0x3ffff && in.to > 0x3ffff:
+ r.to = 0x3ffff
+ case in.from <= 0xfffff && in.to > 0xfffff:
+ r.to = 0xfffff
+ }
+ rs = append(rs, r)
+ in.from = r.to + 1
+
+ // Skip surrogate code points U+D800..U+DFFF.
+ if in.from >= 0xd800 && in.from <= 0xdfff {
+ in.from = 0xe000
+ }
+ }
+ return rs, nil
+}