aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/tre.go 139
-rw-r--r-- tests/tre.go 251
-rw-r--r-- utf8/utf8.go 112
-rw-r--r-- utf8/utf8_test.go 181
4 files changed, 390 insertions, 293 deletions
diff --git a/src/tre.go b/src/tre.go
index a24178a..204019a 100644
--- a/src/tre.go
+++ b/src/tre.go
@@ -1,5 +1,144 @@
package tre
+import (
+ "fmt"
+ "strings"
+)
+
+
+
+/// CharBlock is a pair of byte sequences, From and To, marking the two ends
+/// of a block. GenCharBlocks fills them with the UTF-8 encodings of the first
+/// and last code points of a range.
+type CharBlock struct {
+ From []byte
+ To []byte
+}
+
+/// cpRange is a range of code points running from `from` to `to`;
+/// splitCodePoint returns its pieces as values of this type.
+type cpRange struct {
+ from rune
+ to rune
+}
+
+
+
+/// String renders the block as "<FROM..TO>", where each side is the bytes of
+/// From or To in uppercase hexadecimal separated by single spaces, e.g.
+/// "<C2 80..DF BF>". An empty side renders as nothing, so a zero-value
+/// block yields "<..>" instead of panicking on an out-of-range index.
+func (b *CharBlock) String() string {
+	var s strings.Builder
+	// Each byte is written with %X (no zero padding), so the output for
+	// non-empty blocks is the same as it has always been.
+	writeHex := func(bs []byte) {
+		for i, c := range bs {
+			if i > 0 {
+				s.WriteByte(' ')
+			}
+			fmt.Fprintf(&s, "%X", c)
+		}
+	}
+	s.WriteByte('<')
+	writeHex(b.From)
+	s.WriteString("..")
+	writeHex(b.To)
+	s.WriteByte('>')
+	return s.String()
+}
+
+/// GenCharBlocks splits the code point range <from..to> with splitCodePoint
+/// and returns one CharBlock per resulting piece, whose From and To hold the
+/// UTF-8 encodings of that piece's first and last code points. When the
+/// range is rejected, the error is passed through and the slice is nil.
+func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
+	ranges, err := splitCodePoint(from, to)
+	if err != nil {
+		return nil, err
+	}
+
+	blocks := make([]*CharBlock, 0, len(ranges))
+	for _, cp := range ranges {
+		// Converting a rune to a string yields its UTF-8 encoding.
+		first := []byte(string(cp.from))
+		last := []byte(string(cp.to))
+		blocks = append(blocks, &CharBlock{From: first, To: last})
+	}
+	return blocks, nil
+}
+
+/// `splitCodePoint` splits a code point range represented by <from..to> into
+/// some blocks. The code points that the block contains will be a continuous
+/// byte sequence when encoded into UTF-8. For instance, this function splits
+/// <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because
+/// <U+0000..U+07FF> is continuous on the code point but non-continuous in the
+/// UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>, and
+/// <U+0080..U+07FF> is encoded <C2 80..DF BF>).
+///
+/// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte
+/// sequences encoding them are ill-formed in UTF-8. For instance,
+/// <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
+/// However, when `from` or `to` itself is the surrogate code point, this
+/// function returns an error.
+///
+/// An error is also returned when `from > to`, or when either bound lies
+/// outside <U+0000..U+10FFFF>. On error the returned slice is nil.
+func splitCodePoint(from, to rune) ([]*cpRange, error) {
+	if from > to {
+		return nil, fmt.Errorf(
+			"code point range must be from <= to: U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+	if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
+		return nil, fmt.Errorf(
+			// The first fragment ends with a space so the joined
+			// message reads "...<=U+10FFFF: U+..".
+			"code point must be >=U+0000 and <=U+10FFFF: "+
+				"U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+	// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+	// > 3.9 Unicode Encoding Forms
+	// > UTF-8 D92
+	// > Because surrogate code points are not Unicode scalar values,
+	// > any UTF-8 byte sequence that would otherwise
+	// > map to code points U+D800..U+DFFF is ill-formed.
+	if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
+		return nil, fmt.Errorf(
+			"surrogate code points U+D800..U+DFFF "+
+				"are not allowed in UTF-8: U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+
+	// `in` is the part of the input range that has not been emitted yet.
+	in := &cpRange{
+		from: from,
+		to:   to,
+	}
+	var rs []*cpRange
+	for in.from <= in.to {
+		r := &cpRange{
+			from: in.from,
+			to:   in.to,
+		}
+		// Cut the piece at the first boundary that the remaining range
+		// crosses; if it crosses none, the piece is the whole remainder.
+		//
+		// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+		// > 3.9 Unicode Encoding Forms
+		// > UTF-8 Table 3-7.
+		// > Well-Formed UTF-8 Byte Sequences
+		switch {
+		case in.from <= 0x007f && in.to > 0x007f:
+			r.to = 0x007f
+		case in.from <= 0x07ff && in.to > 0x07ff:
+			r.to = 0x07ff
+		case in.from <= 0x0fff && in.to > 0x0fff:
+			r.to = 0x0fff
+		case in.from <= 0xcfff && in.to > 0xcfff:
+			r.to = 0xcfff
+		case in.from <= 0xd7ff && in.to > 0xd7ff:
+			r.to = 0xd7ff
+		case in.from <= 0xffff && in.to > 0xffff:
+			r.to = 0xffff
+		case in.from <= 0x3ffff && in.to > 0x3ffff:
+			r.to = 0x3ffff
+		case in.from <= 0xfffff && in.to > 0xfffff:
+			r.to = 0xfffff
+		}
+		rs = append(rs, r)
+		in.from = r.to + 1
+
+		// Skip surrogate code points U+D800..U+DFFF.
+		if in.from >= 0xd800 && in.from <= 0xdfff {
+			in.from = 0xe000
+		}
+	}
+	return rs, nil
+}
+
func Main() {
diff --git a/tests/tre.go b/tests/tre.go
index 7495c4b..1f3cfed 100644
--- a/tests/tre.go
+++ b/tests/tre.go
@@ -1,6 +1,257 @@
package tre
+import (
+ "fmt"
+ "os"
+ "testing"
+ "testing/internal/testdeps"
+)
+
+
+
+// TestGenCharBlocksWellFormed checks that GenCharBlocks returns the expected
+// From/To byte sequences for every code point range in the table below. A
+// case fails on an error, on a different number of blocks, or on any
+// differing byte.
+func TestGenCharBlocksWellFormed(t *testing.T) {
+ // cBlk builds an expected CharBlock from its two byte sequences.
+ cBlk := func(from []byte, to []byte) *CharBlock {
+ return &CharBlock{
+ From: from,
+ To: to,
+ }
+ }
+
+ // seq turns its byte arguments into a []byte.
+ seq := func(b ...byte) []byte {
+ return b
+ }
+
+ tests := []struct {
+ from rune
+ to rune
+ blocks []*CharBlock
+ }{
+ {
+ from: '\u0000',
+ to: '\u007f',
+ blocks: []*CharBlock{
+ cBlk(seq(0x00), seq(0x7f)),
+ },
+ },
+ {
+ from: '\u0080',
+ to: '\u07ff',
+ blocks: []*CharBlock{
+ cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+ },
+ },
+ {
+ from: '\u0800',
+ to: '\u0fff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xe0, 0xa0, 0x80),
+ seq(0xe0, 0xbf, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\u1000',
+ to: '\ucfff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xe1, 0x80, 0x80),
+ seq(0xec, 0xbf, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\ud000',
+ to: '\ud7ff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xed, 0x80, 0x80),
+ seq(0xed, 0x9f, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\ue000',
+ to: '\uffff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xee, 0x80, 0x80),
+ seq(0xef, 0xbf, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\U00010000',
+ to: '\U0003ffff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xf0, 0x90, 0x80, 0x80),
+ seq(0xf0, 0xbf, 0xbf, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\U00040000',
+ to: '\U000fffff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xf1, 0x80, 0x80, 0x80),
+ seq(0xf3, 0xbf, 0xbf, 0xbf),
+ ),
+ },
+ },
+ {
+ from: '\U00100000',
+ to: '\U0010ffff',
+ blocks: []*CharBlock{
+ cBlk(
+ seq(0xf4, 0x80, 0x80, 0x80),
+ seq(0xf4, 0x8f, 0xbf, 0xbf),
+ ),
+ },
+ },
+ // The full range U+0000..U+10FFFF is expected to yield the same nine
+ // blocks as the nine single-block cases above, in order.
+ {
+ from: '\u0000',
+ to: '\U0010ffff',
+ blocks: []*CharBlock{
+ cBlk(seq(0x00), seq(0x7f)),
+ cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+ cBlk(
+ seq(0xe0, 0xa0, 0x80),
+ seq(0xe0, 0xbf, 0xbf),
+ ),
+ cBlk(
+ seq(0xe1, 0x80, 0x80),
+ seq(0xec, 0xbf, 0xbf),
+ ),
+ cBlk(
+ seq(0xed, 0x80, 0x80),
+ seq(0xed, 0x9f, 0xbf),
+ ),
+ cBlk(
+ seq(0xee, 0x80, 0x80),
+ seq(0xef, 0xbf, 0xbf),
+ ),
+ cBlk(
+ seq(0xf0, 0x90, 0x80, 0x80),
+ seq(0xf0, 0xbf, 0xbf, 0xbf),
+ ),
+ cBlk(
+ seq(0xf1, 0x80, 0x80, 0x80),
+ seq(0xf3, 0xbf, 0xbf, 0xbf),
+ ),
+ cBlk(
+ seq(0xf4, 0x80, 0x80, 0x80),
+ seq(0xf4, 0x8f, 0xbf, 0xbf),
+ ),
+ },
+ },
+ }
+ for _, tt := range tests {
+ const errmsg = "unexpected character block: want: %+v, got: %+v"
+ // Each subtest is named after the two range bounds formatted with %v.
+ tts := fmt.Sprintf("%v..%v", tt.from, tt.to)
+ t.Run(tts, func(t *testing.T) {
+ blks, err := GenCharBlocks(tt.from, tt.to)
+ if err != nil {
+ t.Fatal(err)
+ }
+ // The number of blocks must match before comparing them one by one.
+ if len(blks) != len(tt.blocks) {
+ t.Fatalf(errmsg, tt.blocks, blks)
+ }
+ for i, blk := range blks {
+ expected := tt.blocks[i]
+ neqFrom := len(blk.From) != len(expected.From)
+ neqTo := len(blk.To) != len(expected.To)
+ if neqFrom || neqTo {
+ t.Fatalf(errmsg, tt.blocks, blks)
+ }
+ // From and To are compared byte by byte; the index j is bounded
+ // by len(blk.From) and is used for both slices.
+ for j := 0; j < len(blk.From); j++ {
+ neqFrom := blk.From[j] !=
+ expected.From[j]
+ neqTo := blk.To[j] !=
+ expected.To[j]
+ if neqFrom || neqTo {
+ t.Fatalf(
+ errmsg,
+ tt.blocks,
+ blks,
+ )
+ }
+ }
+ }
+ })
+ }
+}
+
+// TestGenCharBlocksIllFormed checks that GenCharBlocks rejects invalid
+// ranges: every case must produce an error together with a nil slice.
+func TestGenCharBlocksIllFormed(t *testing.T) {
+	type badRange struct {
+		from rune
+		to   rune
+	}
+	cases := []badRange{
+		// from > to
+		{from: '\u0001', to: '\u0000'},
+		// from is <U+0000
+		{from: -1, to: '\u0000'},
+		// to is <U+0000
+		{from: '\u0000', to: -1},
+		// from is >U+10FFFF
+		{from: 0x110000, to: '\u0000'},
+		// to is >U+10FFFF
+		{from: '\u0000', to: 0x110000},
+		// from is U+D800 (surrogate code point)
+		{from: 0xd800, to: '\ue000'},
+		// from is U+DFFF (surrogate code point)
+		{from: 0xdfff, to: '\ue000'},
+		// to is U+D800 (surrogate code point)
+		{from: '\ucfff', to: 0xd800},
+		// to is U+DFFF (surrogate code point)
+		{from: '\ucfff', to: 0xdfff},
+	}
+	for _, c := range cases {
+		name := fmt.Sprintf("%v..%v", c.from, c.to)
+		t.Run(name, func(t *testing.T) {
+			blks, err := GenCharBlocks(c.from, c.to)
+			switch {
+			case err == nil:
+				t.Fatal("expected error didn't occur")
+			case blks != nil:
+				t.Fatal("character blocks must be nil")
+			}
+		})
+	}
+}
+
+// MainTest registers the two GenCharBlocks tests, runs them through
+// testing.MainStart, and exits the process with the status from m.Run().
func MainTest() {
+ tests := []testing.InternalTest{
+ { "TestGenCharBlocksWellFormed", TestGenCharBlocksWellFormed },
+ { "TestGenCharBlocksIllFormed", TestGenCharBlocksIllFormed },
+ }
+
+ deps := testdeps.TestDeps{}
+ // No benchmarks, fuzz targets or examples are registered.
+ benchmarks := []testing.InternalBenchmark {}
+ fuzzTargets := []testing.InternalFuzzTarget{}
+ examples := []testing.InternalExample {}
+ m := testing.MainStart(deps, tests, benchmarks, fuzzTargets, examples)
+ os.Exit(m.Run())
}
diff --git a/utf8/utf8.go b/utf8/utf8.go
deleted file mode 100644
index 4f52bd4..0000000
--- a/utf8/utf8.go
+++ /dev/null
@@ -1,112 +0,0 @@
-package utf8
-
-import (
- "fmt"
- "strings"
-)
-
-type CharBlock struct {
- From []byte
- To []byte
-}
-
-func (b *CharBlock) String() string {
- var s strings.Builder
- fmt.Fprint(&s, "<")
- fmt.Fprintf(&s, "%X", b.From[0])
- for i := 1; i < len(b.From); i++ {
- fmt.Fprintf(&s, " %X", b.From[i])
- }
- fmt.Fprint(&s, "..")
- fmt.Fprintf(&s, "%X", b.To[0])
- for i := 1; i < len(b.To); i++ {
- fmt.Fprintf(&s, " %X", b.To[i])
- }
- fmt.Fprint(&s, ">")
- return s.String()
-}
-
-func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
- rs, err := splitCodePoint(from, to)
- if err != nil {
- return nil, err
- }
-
- blks := make([]*CharBlock, len(rs))
- for i, r := range rs {
- blks[i] = &CharBlock{
- From: []byte(string(r.from)),
- To: []byte(string(r.to)),
- }
- }
-
- return blks, nil
-}
-
-type cpRange struct {
- from rune
- to rune
-}
-
-// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that
-// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function
-// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on
-// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>,
-// and <U+0080..U+07FF> is encoded <C2 80..DF BF>).
-//
-// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are
-// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
-// However, when `from` or `to` itself is the surrogate code point, this function returns an error.
-func splitCodePoint(from, to rune) ([]*cpRange, error) {
- if from > to {
- return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to)
- }
- if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
- return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to)
- }
- // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92
- // > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise
- // > map to code points U+D800..U+DFFF is ill-formed.
- if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
- return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to)
- }
-
- in := &cpRange{
- from: from,
- to: to,
- }
- var rs []*cpRange
- for in.from <= in.to {
- r := &cpRange{
- from: in.from,
- to: in.to,
- }
- // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7. Well-Formed UTF-8 Byte Sequences
- switch {
- case in.from <= 0x007f && in.to > 0x007f:
- r.to = 0x007f
- case in.from <= 0x07ff && in.to > 0x07ff:
- r.to = 0x07ff
- case in.from <= 0x0fff && in.to > 0x0fff:
- r.to = 0x0fff
- case in.from <= 0xcfff && in.to > 0xcfff:
- r.to = 0xcfff
- case in.from <= 0xd7ff && in.to > 0xd7ff:
- r.to = 0xd7ff
- case in.from <= 0xffff && in.to > 0xffff:
- r.to = 0xffff
- case in.from <= 0x3ffff && in.to > 0x3ffff:
- r.to = 0x3ffff
- case in.from <= 0xfffff && in.to > 0xfffff:
- r.to = 0xfffff
- }
- rs = append(rs, r)
- in.from = r.to + 1
-
- // Skip surrogate code points U+D800..U+DFFF.
- if in.from >= 0xd800 && in.from <= 0xdfff {
- in.from = 0xe000
- }
- }
- return rs, nil
-}
diff --git a/utf8/utf8_test.go b/utf8/utf8_test.go
deleted file mode 100644
index 2dc8093..0000000
--- a/utf8/utf8_test.go
+++ /dev/null
@@ -1,181 +0,0 @@
-package utf8
-
-import (
- "fmt"
- "testing"
-)
-
-func TestGenCharBlocks_WellFormed(t *testing.T) {
- cBlk := func(from []byte, to []byte) *CharBlock {
- return &CharBlock{
- From: from,
- To: to,
- }
- }
-
- seq := func(b ...byte) []byte {
- return b
- }
-
- tests := []struct {
- from rune
- to rune
- blocks []*CharBlock
- }{
- {
- from: '\u0000',
- to: '\u007f',
- blocks: []*CharBlock{
- cBlk(seq(0x00), seq(0x7f)),
- },
- },
- {
- from: '\u0080',
- to: '\u07ff',
- blocks: []*CharBlock{
- cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
- },
- },
- {
- from: '\u0800',
- to: '\u0fff',
- blocks: []*CharBlock{
- cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
- },
- },
- {
- from: '\u1000',
- to: '\ucfff',
- blocks: []*CharBlock{
- cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
- },
- },
- {
- from: '\ud000',
- to: '\ud7ff',
- blocks: []*CharBlock{
- cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
- },
- },
- {
- from: '\ue000',
- to: '\uffff',
- blocks: []*CharBlock{
- cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
- },
- },
- {
- from: '\U00010000',
- to: '\U0003ffff',
- blocks: []*CharBlock{
- cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
- },
- },
- {
- from: '\U00040000',
- to: '\U000fffff',
- blocks: []*CharBlock{
- cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
- },
- },
- {
- from: '\U00100000',
- to: '\U0010ffff',
- blocks: []*CharBlock{
- cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
- },
- },
- {
- from: '\u0000',
- to: '\U0010ffff',
- blocks: []*CharBlock{
- cBlk(seq(0x00), seq(0x7f)),
- cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
- cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
- cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
- cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
- cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
- cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
- cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
- cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
- },
- },
- }
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
- blks, err := GenCharBlocks(tt.from, tt.to)
- if err != nil {
- t.Fatal(err)
- }
- if len(blks) != len(tt.blocks) {
- t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
- }
- for i, blk := range blks {
- if len(blk.From) != len(tt.blocks[i].From) || len(blk.To) != len(tt.blocks[i].To) {
- t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
- }
- for j := 0; j < len(blk.From); j++ {
- if blk.From[j] != tt.blocks[i].From[j] || blk.To[j] != tt.blocks[i].To[j] {
- t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
- }
- }
- }
- })
- }
-}
-
-func TestGenCharBlocks_IllFormed(t *testing.T) {
- tests := []struct {
- from rune
- to rune
- }{
- {
- // from > to
- from: '\u0001',
- to: '\u0000',
- },
- {
- from: -1, // <U+0000
- to: '\u0000',
- },
- {
- from: '\u0000',
- to: -1, // <U+0000
- },
- {
- from: 0x110000, // >U+10FFFF
- to: '\u0000',
- },
- {
- from: '\u0000',
- to: 0x110000, // >U+10FFFF
- },
- {
- from: 0xd800, // U+D800 (surrogate code point)
- to: '\ue000',
- },
- {
- from: 0xdfff, // U+DFFF (surrogate code point)
- to: '\ue000',
- },
- {
- from: '\ucfff',
- to: 0xd800, // U+D800 (surrogate code point)
- },
- {
- from: '\ucfff',
- to: 0xdfff, // U+DFFF (surrogate code point)
- },
- }
- for _, tt := range tests {
- t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
- blks, err := GenCharBlocks(tt.from, tt.to)
- if err == nil {
- t.Fatal("expected error didn't occur")
- }
- if blks != nil {
- t.Fatal("character blocks must be nil")
- }
- })
- }
-}