diff options
-rw-r--r-- | src/tre.go | 139 | ||||
-rw-r--r-- | tests/tre.go | 251 | ||||
-rw-r--r-- | utf8/utf8.go | 112 | ||||
-rw-r--r-- | utf8/utf8_test.go | 181 |
4 files changed, 390 insertions, 293 deletions
@@ -1,5 +1,144 @@ package tre +import ( + "fmt" + "strings" +) + + + +type CharBlock struct { + From []byte + To []byte +} + +type cpRange struct { + from rune + to rune +} + + + +func (b *CharBlock) String() string { + var s strings.Builder + fmt.Fprint(&s, "<") + fmt.Fprintf(&s, "%X", b.From[0]) + for i := 1; i < len(b.From); i++ { + fmt.Fprintf(&s, " %X", b.From[i]) + } + fmt.Fprint(&s, "..") + fmt.Fprintf(&s, "%X", b.To[0]) + for i := 1; i < len(b.To); i++ { + fmt.Fprintf(&s, " %X", b.To[i]) + } + fmt.Fprint(&s, ">") + return s.String() +} + +func GenCharBlocks(from, to rune) ([]*CharBlock, error) { + rs, err := splitCodePoint(from, to) + if err != nil { + return nil, err + } + + blks := make([]*CharBlock, len(rs)) + for i, r := range rs { + blks[i] = &CharBlock{ + From: []byte(string(r.from)), + To: []byte(string(r.to)), + } + } + + return blks, nil +} + +/// `splitCodePoint` splits a code point range represented by <from..to> into +/// some blocks. The code points that the block contains will be a continuous +/// byte sequence when encoded into UTF-8. For instance, this function splits +/// <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because +/// <U+0000..U+07FF> is continuous on the code point but non-continuous in the +/// UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>, and +/// <U+0080..U+07FF> is encoded <C2 80..DF BF>). +/// +/// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte +/// sequences encoding them are ill-formed in UTF-8. For instance, +/// <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>. +/// However, when `from` or `to` itself is the surrogate code point, this +/// function returns an error. +func splitCodePoint(from, to rune) ([]*cpRange, error) { + if from > to { + return nil, fmt.Errorf( + "code point range must be from <= to: U+%X..U+%X", + from, + to, + ) + } + if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff { + return nil, fmt.Errorf( + "code point must be >=U+0000 and <=U+10FFFF:" + + "U+%X..U+%X", + from, + to, + ) + } + // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf + // > 3.9 Unicode Encoding Forms + // > UTF-8 D92 + // > Because surrogate code points are not Unicode scalar values, + // > any UTF-8 byte sequence that would otherwise + // > map to code points U+D800..U+DFFF is ill-formed. + if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff { + return nil, fmt.Errorf( + "surrogate code points U+D800..U+DFFF " + + "are not allowed in UTF-8: U+%X..U+%X", + from, + to, + ) + } + + in := &cpRange{ + from: from, + to: to, + } + var rs []*cpRange + for in.from <= in.to { + r := &cpRange{ + from: in.from, + to: in.to, + } + // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf + // > 3.9 Unicode Encoding Forms + // > UTF-8 Table 3-7. + // > Well-Formed UTF-8 Byte Sequences + switch { + case in.from <= 0x007f && in.to > 0x007f: + r.to = 0x007f + case in.from <= 0x07ff && in.to > 0x07ff: + r.to = 0x07ff + case in.from <= 0x0fff && in.to > 0x0fff: + r.to = 0x0fff + case in.from <= 0xcfff && in.to > 0xcfff: + r.to = 0xcfff + case in.from <= 0xd7ff && in.to > 0xd7ff: + r.to = 0xd7ff + case in.from <= 0xffff && in.to > 0xffff: + r.to = 0xffff + case in.from <= 0x3ffff && in.to > 0x3ffff: + r.to = 0x3ffff + case in.from <= 0xfffff && in.to > 0xfffff: + r.to = 0xfffff + } + rs = append(rs, r) + in.from = r.to + 1 + + // Skip surrogate code points U+D800..U+DFFF. + if in.from >= 0xd800 && in.from <= 0xdfff { + in.from = 0xe000 + } + } + return rs, nil +} + func Main() { diff --git a/tests/tre.go b/tests/tre.go index 7495c4b..1f3cfed 100644 --- a/tests/tre.go +++ b/tests/tre.go @@ -1,6 +1,257 @@ package tre +import ( + "fmt" + "os" + "testing" + "testing/internal/testdeps" +) + + + +func TestGenCharBlocksWellFormed(t *testing.T) { + cBlk := func(from []byte, to []byte) *CharBlock { + return &CharBlock{ + From: from, + To: to, + } + } + + seq := func(b ...byte) []byte { + return b + } + + tests := []struct { + from rune + to rune + blocks []*CharBlock + }{ + { + from: '\u0000', + to: '\u007f', + blocks: []*CharBlock{ + cBlk(seq(0x00), seq(0x7f)), + }, + }, + { + from: '\u0080', + to: '\u07ff', + blocks: []*CharBlock{ + cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), + }, + }, + { + from: '\u0800', + to: '\u0fff', + blocks: []*CharBlock{ + cBlk( + seq(0xe0, 0xa0, 0x80), + seq(0xe0, 0xbf, 0xbf), + ), + }, + }, + { + from: '\u1000', + to: '\ucfff', + blocks: []*CharBlock{ + cBlk( + seq(0xe1, 0x80, 0x80), + seq(0xec, 0xbf, 0xbf), + ), + }, + }, + { + from: '\ud000', + to: '\ud7ff', + blocks: []*CharBlock{ + cBlk( + seq(0xed, 0x80, 0x80), + seq(0xed, 0x9f, 0xbf), + ), + }, + }, + { + from: '\ue000', + to: '\uffff', + blocks: []*CharBlock{ + cBlk( + seq(0xee, 0x80, 0x80), + seq(0xef, 0xbf, 0xbf), + ), + }, + }, + { + from: '\U00010000', + to: '\U0003ffff', + blocks: []*CharBlock{ + cBlk( + seq(0xf0, 0x90, 0x80, 0x80), + seq(0xf0, 0xbf, 0xbf, 0xbf), + ), + }, + }, + { + from: '\U00040000', + to: '\U000fffff', + blocks: []*CharBlock{ + cBlk( + seq(0xf1, 0x80, 0x80, 0x80), + seq(0xf3, 0xbf, 0xbf, 0xbf), + ), + }, + }, + { + from: '\U00100000', + to: '\U0010ffff', + blocks: []*CharBlock{ + cBlk( + seq(0xf4, 0x80, 0x80, 0x80), + seq(0xf4, 0x8f, 0xbf, 0xbf), + ), + }, + }, + { + from: '\u0000', + to: '\U0010ffff', + blocks: []*CharBlock{ + cBlk(seq(0x00), seq(0x7f)), + cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), + cBlk( + seq(0xe0, 0xa0, 0x80), + seq(0xe0, 0xbf, 0xbf), + ), + cBlk( + seq(0xe1, 0x80, 0x80), + seq(0xec, 0xbf, 0xbf), + ), + cBlk( + seq(0xed, 0x80, 0x80), + seq(0xed, 0x9f, 0xbf), + ), + cBlk( + seq(0xee, 0x80, 0x80), + seq(0xef, 0xbf, 0xbf), + ), + cBlk( + seq(0xf0, 0x90, 0x80, 0x80), + seq(0xf0, 0xbf, 0xbf, 0xbf), + ), + cBlk( + seq(0xf1, 0x80, 0x80, 0x80), + seq(0xf3, 0xbf, 0xbf, 0xbf), + ), + cBlk( + seq(0xf4, 0x80, 0x80, 0x80), + seq(0xf4, 0x8f, 0xbf, 0xbf), + ), + }, + }, + } + for _, tt := range tests { + const errmsg = "unexpected character block: want: %+v, got: %+v" + tts := fmt.Sprintf("%v..%v", tt.from, tt.to) + t.Run(tts, func(t *testing.T) { + blks, err := GenCharBlocks(tt.from, tt.to) + if err != nil { + t.Fatal(err) + } + if len(blks) != len(tt.blocks) { + t.Fatalf(errmsg, tt.blocks, blks) + } + for i, blk := range blks { + expected := tt.blocks[i] + neqFrom := len(blk.From) != len(expected.From) + neqTo := len(blk.To) != len(expected.To) + if neqFrom || neqTo { + t.Fatalf(errmsg, tt.blocks, blks) + } + for j := 0; j < len(blk.From); j++ { + neqFrom := blk.From[j] != + expected.From[j] + neqTo := blk.To[j] != + expected.To[j] + if neqFrom || neqTo { + t.Fatalf( + errmsg, + tt.blocks, + blks, + ) + } + } + } + }) + } +} + +func TestGenCharBlocksIllFormed(t *testing.T) { + tests := []struct { + from rune + to rune + }{ + { + // from > to + from: '\u0001', + to: '\u0000', + }, + { + from: -1, // <U+0000 + to: '\u0000', + }, + { + from: '\u0000', + to: -1, // <U+0000 + }, + { + from: 0x110000, // >U+10FFFF + to: '\u0000', + }, + { + from: '\u0000', + to: 0x110000, // >U+10FFFF + }, + { + from: 0xd800, // U+D800 (surrogate code point) + to: '\ue000', + }, + { + from: 0xdfff, // U+DFFF (surrogate code point) + to: '\ue000', + }, + { + from: '\ucfff', + to: 0xd800, // U+D800 (surrogate code point) + }, + { + from: '\ucfff', + to: 0xdfff, // U+DFFF (surrogate code point) + }, + } + for _, tt := range tests { + tts := fmt.Sprintf("%v..%v", tt.from, tt.to) + t.Run(tts, func(t *testing.T) { + blks, err := GenCharBlocks(tt.from, tt.to) + if err == nil { + t.Fatal("expected error didn't occur") + } + if blks != nil { + t.Fatal("character blocks must be nil") + } + }) + } +} + func MainTest() { + tests := []testing.InternalTest{ + { "TestGenCharBlocksWellFormed", TestGenCharBlocksWellFormed }, + { "TestGenCharBlocksIllFormed", TestGenCharBlocksIllFormed }, + } + + deps := testdeps.TestDeps{} + benchmarks := []testing.InternalBenchmark {} + fuzzTargets := []testing.InternalFuzzTarget{} + examples := []testing.InternalExample {} + m := testing.MainStart(deps, tests, benchmarks, fuzzTargets, examples) + os.Exit(m.Run()) } diff --git a/utf8/utf8.go b/utf8/utf8.go deleted file mode 100644 index 4f52bd4..0000000 --- a/utf8/utf8.go +++ /dev/null @@ -1,112 +0,0 @@ -package utf8 - -import ( - "fmt" - "strings" -) - -type CharBlock struct { - From []byte - To []byte -} - -func (b *CharBlock) String() string { - var s strings.Builder - fmt.Fprint(&s, "<") - fmt.Fprintf(&s, "%X", b.From[0]) - for i := 1; i < len(b.From); i++ { - fmt.Fprintf(&s, " %X", b.From[i]) - } - fmt.Fprint(&s, "..") - fmt.Fprintf(&s, "%X", b.To[0]) - for i := 1; i < len(b.To); i++ { - fmt.Fprintf(&s, " %X", b.To[i]) - } - fmt.Fprint(&s, ">") - return s.String() -} - -func GenCharBlocks(from, to rune) ([]*CharBlock, error) { - rs, err := splitCodePoint(from, to) - if err != nil { - return nil, err - } - - blks := make([]*CharBlock, len(rs)) - for i, r := range rs { - blks[i] = &CharBlock{ - From: []byte(string(r.from)), - To: []byte(string(r.to)), - } - } - - return blks, nil -} - -type cpRange struct { - from rune - to rune -} - -// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that -// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function -// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on -// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>, -// and <U+0080..U+07FF> is encoded <C2 80..DF BF>). -// -// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are -// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>. -// However, when `from` or `to` itself is the surrogate code point, this function returns an error. -func splitCodePoint(from, to rune) ([]*cpRange, error) { - if from > to { - return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to) - } - if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff { - return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to) - } - // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92 - // > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise - // > map to code points U+D800..U+DFFF is ill-formed. - if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff { - return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to) - } - - in := &cpRange{ - from: from, - to: to, - } - var rs []*cpRange - for in.from <= in.to { - r := &cpRange{ - from: in.from, - to: in.to, - } - // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7. Well-Formed UTF-8 Byte Sequences - switch { - case in.from <= 0x007f && in.to > 0x007f: - r.to = 0x007f - case in.from <= 0x07ff && in.to > 0x07ff: - r.to = 0x07ff - case in.from <= 0x0fff && in.to > 0x0fff: - r.to = 0x0fff - case in.from <= 0xcfff && in.to > 0xcfff: - r.to = 0xcfff - case in.from <= 0xd7ff && in.to > 0xd7ff: - r.to = 0xd7ff - case in.from <= 0xffff && in.to > 0xffff: - r.to = 0xffff - case in.from <= 0x3ffff && in.to > 0x3ffff: - r.to = 0x3ffff - case in.from <= 0xfffff && in.to > 0xfffff: - r.to = 0xfffff - } - rs = append(rs, r) - in.from = r.to + 1 - - // Skip surrogate code points U+D800..U+DFFF. - if in.from >= 0xd800 && in.from <= 0xdfff { - in.from = 0xe000 - } - } - return rs, nil -} diff --git a/utf8/utf8_test.go b/utf8/utf8_test.go deleted file mode 100644 index 2dc8093..0000000 --- a/utf8/utf8_test.go +++ /dev/null @@ -1,181 +0,0 @@ -package utf8 - -import ( - "fmt" - "testing" -) - -func TestGenCharBlocks_WellFormed(t *testing.T) { - cBlk := func(from []byte, to []byte) *CharBlock { - return &CharBlock{ - From: from, - To: to, - } - } - - seq := func(b ...byte) []byte { - return b - } - - tests := []struct { - from rune - to rune - blocks []*CharBlock - }{ - { - from: '\u0000', - to: '\u007f', - blocks: []*CharBlock{ - cBlk(seq(0x00), seq(0x7f)), - }, - }, - { - from: '\u0080', - to: '\u07ff', - blocks: []*CharBlock{ - cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), - }, - }, - { - from: '\u0800', - to: '\u0fff', - blocks: []*CharBlock{ - cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)), - }, - }, - { - from: '\u1000', - to: '\ucfff', - blocks: []*CharBlock{ - cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)), - }, - }, - { - from: '\ud000', - to: '\ud7ff', - blocks: []*CharBlock{ - cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)), - }, - }, - { - from: '\ue000', - to: '\uffff', - blocks: []*CharBlock{ - cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)), - }, - }, - { - from: '\U00010000', - to: '\U0003ffff', - blocks: []*CharBlock{ - cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)), - }, - }, - { - from: '\U00040000', - to: '\U000fffff', - blocks: []*CharBlock{ - cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)), - }, - }, - { - from: '\U00100000', - to: '\U0010ffff', - blocks: []*CharBlock{ - cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)), - }, - }, - { - from: '\u0000', - to: '\U0010ffff', - blocks: []*CharBlock{ - cBlk(seq(0x00), seq(0x7f)), - cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), - cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)), - cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)), - cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)), - cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)), - cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)), - cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)), - cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)), - }, - }, - } - for _, tt := range tests { - t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) { - blks, err := GenCharBlocks(tt.from, tt.to) - if err != nil { - t.Fatal(err) - } - if len(blks) != len(tt.blocks) { - t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) - } - for i, blk := range blks { - if len(blk.From) != len(tt.blocks[i].From) || len(blk.To) != len(tt.blocks[i].To) { - t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) - } - for j := 0; j < len(blk.From); j++ { - if blk.From[j] != tt.blocks[i].From[j] || blk.To[j] != tt.blocks[i].To[j] { - t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) - } - } - } - }) - } -} - -func TestGenCharBlocks_IllFormed(t *testing.T) { - tests := []struct { - from rune - to rune - }{ - { - // from > to - from: '\u0001', - to: '\u0000', - }, - { - from: -1, // <U+0000 - to: '\u0000', - }, - { - from: '\u0000', - to: -1, // <U+0000 - }, - { - from: 0x110000, // >U+10FFFF - to: '\u0000', - }, - { - from: '\u0000', - to: 0x110000, // >U+10FFFF - }, - { - from: 0xd800, // U+D800 (surrogate code point) - to: '\ue000', - }, - { - from: 0xdfff, // U+DFFF (surrogate code point) - to: '\ue000', - }, - { - from: '\ucfff', - to: 0xd800, // U+D800 (surrogate code point) - }, - { - from: '\ucfff', - to: 0xdfff, // U+DFFF (surrogate code point) - }, - } - for _, tt := range tests { - t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) { - blks, err := GenCharBlocks(tt.from, tt.to) - if err == nil { - t.Fatal("expected error didn't occur") - } - if blks != nil { - t.Fatal("character blocks must be nil") - } - }) - } -} |