diff options
-rw-r--r-- | compiler/dfa/tree.go | 2 | ||||
-rw-r--r-- | utf8/utf8.go | 454 | ||||
-rw-r--r-- | utf8/utf8_test.go | 181 |
3 files changed, 266 insertions, 371 deletions
diff --git a/compiler/dfa/tree.go b/compiler/dfa/tree.go index ab75eae..a3442f8 100644 --- a/compiler/dfa/tree.go +++ b/compiler/dfa/tree.go @@ -512,7 +512,7 @@ func ConvertCPTreeToByteTree(cpTrees map[spec.LexModeKindID]parser.CPTree) (byte func convCPTreeToByteTree(cpTree parser.CPTree) (byteTree, error) { if from, to, ok := cpTree.Range(); ok { - bs, err := utf8.GenCharBlocks([]byte(string(from)), []byte(string(to))) + bs, err := utf8.GenCharBlocks(from, to) if err != nil { return nil, err } diff --git a/utf8/utf8.go b/utf8/utf8.go index 16b3746..4f52bd4 100644 --- a/utf8/utf8.go +++ b/utf8/utf8.go @@ -1,398 +1,112 @@ package utf8 -import "fmt" +import ( + "fmt" + "strings" +) type CharBlock struct { From []byte To []byte } -// Refelences: -// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404 -// * Table 3-6. UTF-8 Bit Distribution -// * Table 3-7. Well-Formed UTF-8 Byte Sequences -var ( - // 1 byte character: - // * <00..7F> - cBlks1 = []*CharBlock{ - { - From: []byte{0x00}, - To: []byte{0x7f}, - }, +func (b *CharBlock) String() string { + var s strings.Builder + fmt.Fprint(&s, "<") + fmt.Fprintf(&s, "%X", b.From[0]) + for i := 1; i < len(b.From); i++ { + fmt.Fprintf(&s, " %X", b.From[i]) } - - // 2 bytes character: - // * <C2..DF 80..BF> - cBlks2 = []*CharBlock{ - { - From: []byte{0xc2, 0x80}, - To: []byte{0xdf, 0xbf}, - }, - } - - // 3 bytes character: - // * <E0 A0..BF 80..BF> - // * <E1..EC 80..BF 80..BF> - // * <ED 80..9F 80..BF> - // * <EE..EF 80..BF 80..BF> - cBlks3 = []*CharBlock{ - { - From: []byte{0xe0, 0xa0, 0x80}, - To: []byte{0xe0, 0xbf, 0xbf}, - }, - { - From: []byte{0xe1, 0x80, 0x80}, - To: []byte{0xec, 0xbf, 0xbf}, - }, - { - From: []byte{0xed, 0x80, 0x80}, - To: []byte{0xed, 0x9f, 0xbf}, - }, - { - From: []byte{0xee, 0x80, 0x80}, - To: []byte{0xef, 0xbf, 0xbf}, - }, + fmt.Fprint(&s, "..") + fmt.Fprintf(&s, "%X", b.To[0]) + for i := 1; i < len(b.To); i++ { + fmt.Fprintf(&s, " %X", b.To[i]) } + fmt.Fprint(&s, ">") + return s.String() +} - // 4 bytes character: - // * <F0 90..BF 80..BF 80..BF> - // * <F1..F3 80..BF 80..BF 80..BF> - // * <F4 80..8F 80..BF 80..BF> - cBlks4 = []*CharBlock{ - { - From: []byte{0xf0, 0x90, 0x80, 0x80}, - To: []byte{0xf0, 0xbf, 0xbf, 0xbf}, - }, - { - From: []byte{0xf1, 0x80, 0x80, 0x80}, - To: []byte{0xf3, 0xbf, 0xbf, 0xbf}, - }, - { - From: []byte{0xf4, 0x80, 0x80, 0x80}, - To: []byte{0xf4, 0x8f, 0xbf, 0xbf}, - }, +func GenCharBlocks(from, to rune) ([]*CharBlock, error) { + rs, err := splitCodePoint(from, to) + if err != nil { + return nil, err } - cBlk1Head = cBlks1[0] - cBlk1Last = cBlks1[len(cBlks1)-1] - cBlk2Head = cBlks2[0] - cBlk2Last = cBlks2[len(cBlks2)-1] - cBlk3Head = cBlks3[0] - cBlk3Last = cBlks3[len(cBlks3)-1] - cBlk4Head = cBlks4[0] - cBlk4Last = cBlks4[len(cBlks4)-1] -) - -func GenCharBlocks(from, to []byte) ([]*CharBlock, error) { - switch len(from) { - case 1: - switch len(to) { - case 1: - return genCharBlocks1(from, to), nil - case 2: - var alt []*CharBlock - alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...) - alt = append(alt, genCharBlocks2(cBlk2Head.From, to)...) - return alt, nil - case 3: - var alt []*CharBlock - alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...) - alt = append(alt, genCharBlocks2(cBlk2Head.From, cBlk2Last.To)...) - alt = append(alt, genCharBlocks3(cBlk3Head.From, to)...) - return alt, nil - case 4: - var alt []*CharBlock - alt = append(alt, genCharBlocks1(from, cBlk1Last.To)...) - alt = append(alt, genCharBlocks2(cBlk2Head.From, cBlk2Last.To)...) - alt = append(alt, genCharBlocks3(cBlk3Head.From, cBlk3Last.To)...) - alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...) - return alt, nil - } - case 2: - switch len(to) { - case 2: - return genCharBlocks2(from, to), nil - case 3: - var alt []*CharBlock - alt = append(alt, genCharBlocks2(from, cBlk2Last.To)...) - alt = append(alt, genCharBlocks3(cBlk3Head.From, to)...) - return alt, nil - case 4: - var alt []*CharBlock - alt = append(alt, genCharBlocks2(from, cBlk2Last.To)...) - alt = append(alt, genCharBlocks3(cBlk3Head.From, cBlk3Last.To)...) - alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...) - return alt, nil + blks := make([]*CharBlock, len(rs)) + for i, r := range rs { + blks[i] = &CharBlock{ + From: []byte(string(r.from)), + To: []byte(string(r.to)), } - case 3: - switch len(to) { - case 3: - return genCharBlocks3(from, to), nil - case 4: - var alt []*CharBlock - alt = append(alt, genCharBlocks3(from, cBlk3Last.To)...) - alt = append(alt, genCharBlocks4(cBlk4Head.From, to)...) - return alt, nil - } - case 4: - return genCharBlocks4(from, to), nil } - return nil, fmt.Errorf("invalid range; From: %v, To: %v", from, to) -} -func genCharBlocks1(from, to []byte) []*CharBlock { - return []*CharBlock{ - {From: from, To: to}, - } + return blks, nil } -func genCharBlocks2(from, to []byte) []*CharBlock { - switch { - case from[0] == to[0]: - return []*CharBlock{ - {From: from, To: to}, - } - default: - return []*CharBlock{ - {From: from, To: []byte{from[0], cBlks2[0].To[1]}}, - {From: []byte{from[0] + 1, cBlks2[0].From[1]}, To: to}, - } - } +type cpRange struct { + from rune + to rune } -func genCharBlocks3(from, to []byte) []*CharBlock { - switch { - case from[0] == to[0] && from[1] == to[1]: - return []*CharBlock{ - {From: from, To: to}, - } - case from[0] == to[0]: - _, fromBlk := findCharBlock3(from) - var alt []*CharBlock - alt = append(alt, &CharBlock{ - From: from, - To: []byte{from[0], from[1], fromBlk.To[2]}, - }) - if from[1]+1 < to[1] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1] + 1, fromBlk.From[2]}, - To: []byte{from[0], to[1] - 1, fromBlk.To[2]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{from[0], to[1], fromBlk.From[2]}, - To: to, - }) - return alt - default: - fromBlkNum, fromBlk := findCharBlock3(from) - toBlkNum, toBlk := findCharBlock3(to) - var alt []*CharBlock - alt = append(alt, &CharBlock{ - From: from, - To: []byte{from[0], from[1], fromBlk.To[2]}, - }) - if from[1] < fromBlk.To[1] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1] + 1, fromBlk.From[2]}, - To: []byte{from[0], fromBlk.To[1], fromBlk.To[2]}, - }) - } - if fromBlkNum == toBlkNum { - if from[0]+1 < to[0] { - alt = append(alt, &CharBlock{ - From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2]}, - To: []byte{to[0] - 1, fromBlk.To[1], fromBlk.To[2]}, - }) - } - if to[1] > fromBlk.From[1] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], fromBlk.From[1], fromBlk.From[2]}, - To: []byte{to[0], to[1] - 1, fromBlk.To[2]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], fromBlk.From[2]}, - To: to, - }) - return alt - } - for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ { - fromBlk := cBlks3[blkNum] - alt = append(alt, &CharBlock{ - From: fromBlk.From, - To: fromBlk.To, - }) - } - if to[0] > toBlk.From[0] { - alt = append(alt, &CharBlock{ - From: toBlk.From, - To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2]}, - }) - } - if to[1] > toBlk.From[1] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], toBlk.From[1], toBlk.From[2]}, - To: []byte{to[0], to[1] - 1, toBlk.To[2]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], toBlk.From[2]}, - To: to, - }) - return alt +// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that +// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function +// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on +// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>, +// and <U+0080..U+07FF> is encoded <C2 80..DF BF>). +// +// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are +// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>. +// However, when `from` or `to` itself is the surrogate code point, this function returns an error. +func splitCodePoint(from, to rune) ([]*cpRange, error) { + if from > to { + return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to) } -} - -func genCharBlocks4(from, to []byte) []*CharBlock { - switch { - case from[0] == to[0] && from[1] == to[1] && from[2] == to[2]: - return []*CharBlock{ - { - From: from, - To: to, - }, - } - case from[0] == to[0] && from[1] == to[1]: - _, fromBlk := findCharBlock4(from) - var alt []*CharBlock - alt = append(alt, &CharBlock{ - From: from, - To: []byte{to[0], to[1], from[2], fromBlk.To[3]}, - }) - if from[2]+1 < to[2] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1], from[2] + 2, fromBlk.From[3]}, - To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1], to[2], fromBlk.From[3]}, - To: []byte{to[0], to[1], to[2], to[3]}, - }) - return alt - case from[0] == to[0]: - _, fromBlk := findCharBlock4(from) - var alt []*CharBlock - alt = append(alt, &CharBlock{ - From: from, - To: []byte{to[0], from[1], from[2], fromBlk.To[3]}, - }) - if from[2] < fromBlk.To[2] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]}, - To: []byte{to[0], from[1], fromBlk.To[2], fromBlk.To[3]}, - }) - } - if from[1]+1 < to[1] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]}, - To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]}, - }) - } - if to[2] > fromBlk.From[2] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], to[1], fromBlk.From[2], fromBlk.From[3]}, - To: []byte{from[0], to[1], to[2] - 1, fromBlk.To[3]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{from[0], to[1], to[2], fromBlk.From[3]}, - To: to, - }) - return alt - default: - fromBlkNum, fromBlk := findCharBlock4(from) - toBlkNum, toBlk := findCharBlock4(to) - var alt []*CharBlock - alt = append(alt, &CharBlock{ - From: from, - To: []byte{from[0], from[1], from[2], fromBlk.To[3]}, - }) - if from[2] < fromBlk.To[2] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1], from[2] + 1, fromBlk.From[3]}, - To: []byte{from[0], from[1], fromBlk.To[2], fromBlk.To[3]}, - }) - } - if from[1] < fromBlk.To[1] { - alt = append(alt, &CharBlock{ - From: []byte{from[0], from[1] + 1, fromBlk.From[2], fromBlk.From[3]}, - To: []byte{from[0], fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]}, - }) - } - if fromBlkNum == toBlkNum { - if from[0]+1 < to[0] { - alt = append(alt, &CharBlock{ - From: []byte{from[0] + 1, fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]}, - To: []byte{from[0] - 1, fromBlk.To[1], fromBlk.To[2], fromBlk.To[3]}, - }) - } - if to[1] > fromBlk.From[1] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], fromBlk.From[1], fromBlk.From[2], fromBlk.From[3]}, - To: []byte{to[0], to[1] - 1, fromBlk.To[2], fromBlk.To[3]}, - }) - } - if to[2] > fromBlk.From[2] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], fromBlk.From[2], fromBlk.From[3]}, - To: []byte{to[0], to[1], to[2] - 1, fromBlk.To[3]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], to[2], fromBlk.From[3]}, - To: to, - }) - return alt - } - for blkNum := fromBlkNum + 1; blkNum < toBlkNum; blkNum++ { - blk := cBlks4[blkNum] - alt = append(alt, &CharBlock{ - From: blk.From, - To: blk.To, - }) - } - if to[0] > toBlk.From[0] { - alt = append(alt, &CharBlock{ - From: toBlk.From, - To: []byte{to[0] - 1, toBlk.To[1], toBlk.To[2], toBlk.To[3]}, - }) - } - if to[1] > toBlk.From[1] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], toBlk.From[1], toBlk.From[2], toBlk.From[3]}, - To: []byte{to[0], to[1] - 1, toBlk.To[2], toBlk.To[3]}, - }) - } - if to[2] > toBlk.From[2] { - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], toBlk.From[2], toBlk.From[3]}, - To: []byte{to[0], to[1], to[2] - 1, toBlk.To[3]}, - }) - } - alt = append(alt, &CharBlock{ - From: []byte{to[0], to[1], to[2], toBlk.From[3]}, - To: to, - }) - return alt + if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff { + return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to) + } + // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92 + // > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise + // > map to code points U+D800..U+DFFF is ill-formed. + if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff { + return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to) } -} -func findCharBlock3(c []byte) (int, *CharBlock) { - for i, blk := range cBlks3 { - if c[0] >= blk.From[0] && c[0] <= blk.To[0] { - return i, blk - } + in := &cpRange{ + from: from, + to: to, } - return 0, nil -} + var rs []*cpRange + for in.from <= in.to { + r := &cpRange{ + from: in.from, + to: in.to, + } + // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7. Well-Formed UTF-8 Byte Sequences + switch { + case in.from <= 0x007f && in.to > 0x007f: + r.to = 0x007f + case in.from <= 0x07ff && in.to > 0x07ff: + r.to = 0x07ff + case in.from <= 0x0fff && in.to > 0x0fff: + r.to = 0x0fff + case in.from <= 0xcfff && in.to > 0xcfff: + r.to = 0xcfff + case in.from <= 0xd7ff && in.to > 0xd7ff: + r.to = 0xd7ff + case in.from <= 0xffff && in.to > 0xffff: + r.to = 0xffff + case in.from <= 0x3ffff && in.to > 0x3ffff: + r.to = 0x3ffff + case in.from <= 0xfffff && in.to > 0xfffff: + r.to = 0xfffff + } + rs = append(rs, r) + in.from = r.to + 1 -func findCharBlock4(c []byte) (int, *CharBlock) { - for i, blk := range cBlks4 { - if c[0] >= blk.From[0] && c[0] <= blk.To[0] { - return i, blk + // Skip surrogate code points U+D800..U+DFFF. + if in.from >= 0xd800 && in.from <= 0xdfff { + in.from = 0xe000 } } - return 0, nil + return rs, nil } diff --git a/utf8/utf8_test.go b/utf8/utf8_test.go new file mode 100644 index 0000000..2dc8093 --- /dev/null +++ b/utf8/utf8_test.go @@ -0,0 +1,181 @@ +package utf8 + +import ( + "fmt" + "testing" +) + +func TestGenCharBlocks_WellFormed(t *testing.T) { + cBlk := func(from []byte, to []byte) *CharBlock { + return &CharBlock{ + From: from, + To: to, + } + } + + seq := func(b ...byte) []byte { + return b + } + + tests := []struct { + from rune + to rune + blocks []*CharBlock + }{ + { + from: '\u0000', + to: '\u007f', + blocks: []*CharBlock{ + cBlk(seq(0x00), seq(0x7f)), + }, + }, + { + from: '\u0080', + to: '\u07ff', + blocks: []*CharBlock{ + cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), + }, + }, + { + from: '\u0800', + to: '\u0fff', + blocks: []*CharBlock{ + cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)), + }, + }, + { + from: '\u1000', + to: '\ucfff', + blocks: []*CharBlock{ + cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)), + }, + }, + { + from: '\ud000', + to: '\ud7ff', + blocks: []*CharBlock{ + cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)), + }, + }, + { + from: '\ue000', + to: '\uffff', + blocks: []*CharBlock{ + cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)), + }, + }, + { + from: '\U00010000', + to: '\U0003ffff', + blocks: []*CharBlock{ + cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)), + }, + }, + { + from: '\U00040000', + to: '\U000fffff', + blocks: []*CharBlock{ + cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)), + }, + }, + { + from: '\U00100000', + to: '\U0010ffff', + blocks: []*CharBlock{ + cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)), + }, + }, + { + from: '\u0000', + to: '\U0010ffff', + blocks: []*CharBlock{ + cBlk(seq(0x00), seq(0x7f)), + cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)), + cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)), + cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)), + cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)), + cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)), + cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)), + cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)), + cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)), + }, + }, + } + for _, tt := range tests { + t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) { + blks, err := GenCharBlocks(tt.from, tt.to) + if err != nil { + t.Fatal(err) + } + if len(blks) != len(tt.blocks) { + t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) + } + for i, blk := range blks { + if len(blk.From) != len(tt.blocks[i].From) || len(blk.To) != len(tt.blocks[i].To) { + t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) + } + for j := 0; j < len(blk.From); j++ { + if blk.From[j] != tt.blocks[i].From[j] || blk.To[j] != tt.blocks[i].To[j] { + t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks) + } + } + } + }) + } +} + +func TestGenCharBlocks_IllFormed(t *testing.T) { + tests := []struct { + from rune + to rune + }{ + { + // from > to + from: '\u0001', + to: '\u0000', + }, + { + from: -1, // <U+0000 + to: '\u0000', + }, + { + from: '\u0000', + to: -1, // <U+0000 + }, + { + from: 0x110000, // >U+10FFFF + to: '\u0000', + }, + { + from: '\u0000', + to: 0x110000, // >U+10FFFF + }, + { + from: 0xd800, // U+D800 (surrogate code point) + to: '\ue000', + }, + { + from: 0xdfff, // U+DFFF (surrogate code point) + to: '\ue000', + }, + { + from: '\ucfff', + to: 0xd800, // U+D800 (surrogate code point) + }, + { + from: '\ucfff', + to: 0xdfff, // U+DFFF (surrogate code point) + }, + } + for _, tt := range tests { + t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) { + blks, err := GenCharBlocks(tt.from, tt.to) + if err == nil { + t.Fatal("expected error didn't occur") + } + if blks != nil { + t.Fatal("character blocks must be nil") + } + }) + } +} |