4 files changed, 390 insertions, 293 deletions
diff --git a/src/tre.go b/src/tre.go
index a24178a..204019a 100644
--- a/src/tre.go
+++ b/src/tre.go
@@ -1,5 +1,144 @@
 package tre
 
+import (
+	"fmt"
+	"strings"
+)
+
+
+// CharBlock is an inclusive range of byte sequences, printed as <From..To>.
+type CharBlock struct {
+	From []byte // lower bound; GenCharBlocks stores a rune's UTF-8 encoding here
+	To   []byte // upper bound; GenCharBlocks stores a rune's UTF-8 encoding here
+}
+
+type cpRange struct {
+	from rune // first code point of the range (inclusive)
+	to   rune // last code point of the range (inclusive)
+}
+
+
+
+// String formats b as "<FROM..TO>", each side being its bytes in upper-case
+// hex separated by spaces.  An empty From or To renders as an empty side.
+func (b *CharBlock) String() string {
+	// hexSeq renders bs like "E0 A0 80"; %X per byte keeps the unpadded form.
+	hexSeq := func(bs []byte) string {
+		parts := make([]string, len(bs))
+		for i, c := range bs {
+			parts[i] = fmt.Sprintf("%X", c)
+		}
+		return strings.Join(parts, " ")
+	}
+
+	// A nil or zero-length slice yields "", so no element is ever indexed.
+	return "<" + hexSeq(b.From) + ".." + hexSeq(b.To) + ">"
+}
+
+// GenCharBlocks splits <from..to> via splitCodePoint and returns each piece
+// with its bounds UTF-8 encoded; on error the returned slice is nil.
+func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
+	ranges, err := splitCodePoint(from, to)
+	if err != nil {
+		return nil, err
+	}
+
+	blocks := make([]*CharBlock, 0, len(ranges))
+	for _, cp := range ranges {
+		lo, hi := string(cp.from), string(cp.to)
+		blocks = append(blocks, &CharBlock{From: []byte(lo), To: []byte(hi)})
+	}
+
+	return blocks, nil
+}
+
+// splitCodePoint splits a code point range represented by <from..to> into
+// some blocks.  The code points that the block contains will be a continuous
+// byte sequence when encoded into UTF-8.  For instance, this function splits
+// <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because
+// <U+0000..U+07FF> is continuous on the code point but non-continuous in the
+// UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>, and
+// <U+0080..U+07FF> is encoded <C2 80..DF BF>).
+//
+// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte
+// sequences encoding them are ill-formed in UTF-8.  For instance,
+// <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
+// It returns an error when from > to, when a bound is outside U+0000..U+10FFFF,
+// or when `from` or `to` itself is a surrogate code point.
+func splitCodePoint(from, to rune) ([]*cpRange, error) {
+	if from > to {
+		return nil, fmt.Errorf(
+			"code point range must be from <= to: U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+	if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
+		return nil, fmt.Errorf(
+			"code point must be >=U+0000 and <=U+10FFFF: " +
+				"U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+	// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+	//  > 3.9 Unicode Encoding Forms
+	//   > UTF-8 D92
+	//    > Because surrogate code points are not Unicode scalar values,
+	//    > any UTF-8 byte sequence that would otherwise
+	//    > map to code points U+D800..U+DFFF is ill-formed.
+	if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
+		return nil, fmt.Errorf(
+			"surrogate code points U+D800..U+DFFF " +
+				"are not allowed in UTF-8: U+%X..U+%X",
+			from,
+			to,
+		)
+	}
+
+	in := &cpRange{
+		from: from,
+		to:   to,
+	}
+	var rs []*cpRange
+	for in.from <= in.to {
+		r := &cpRange{
+			from: in.from,
+			to:   in.to,
+		}
+		// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
+		//  > 3.9 Unicode Encoding Forms
+		//   > UTF-8 Table 3-7.
+		//    > Well-Formed UTF-8 Byte Sequences
+		switch {
+		case in.from <= 0x007f && in.to > 0x007f:
+			r.to = 0x007f
+		case in.from <= 0x07ff && in.to > 0x07ff:
+			r.to = 0x07ff
+		case in.from <= 0x0fff && in.to > 0x0fff:
+			r.to = 0x0fff
+		case in.from <= 0xcfff && in.to > 0xcfff:
+			r.to = 0xcfff
+		case in.from <= 0xd7ff && in.to > 0xd7ff:
+			r.to = 0xd7ff
+		case in.from <= 0xffff && in.to > 0xffff:
+			r.to = 0xffff
+		case in.from <= 0x3ffff && in.to > 0x3ffff:
+			r.to = 0x3ffff
+		case in.from <= 0xfffff && in.to > 0xfffff:
+			r.to = 0xfffff
+		}
+		rs = append(rs, r)
+		in.from = r.to + 1
+
+		// Skip surrogate code points U+D800..U+DFFF.
+		if in.from >= 0xd800 && in.from <= 0xdfff {
+			in.from = 0xe000
+		}
+	}
+	return rs, nil
+}
+
 
 
 func Main() {
diff --git a/tests/tre.go b/tests/tre.go
index 7495c4b..1f3cfed 100644
--- a/tests/tre.go
+++ b/tests/tre.go
@@ -1,6 +1,257 @@
 package tre
 
+import (
+	"fmt"
+	"os"
+	"testing"
+	"testing/internal/testdeps"
+)
+
+
+
+// TestGenCharBlocksWellFormed checks that GenCharBlocks returns the expected
+// byte-sequence blocks for well-formed input: a set of ranges that each map to
+// a single block, followed by the full range U+0000..U+10FFFF.
+func TestGenCharBlocksWellFormed(t *testing.T) {
+	cBlk := func(from []byte, to []byte) *CharBlock {
+		return &CharBlock{
+			From: from,
+			To:   to,
+		}
+	}
+
+	seq := func(b ...byte) []byte {
+		return b
+	}
+
+	tests := []struct {
+		from   rune
+		to     rune
+		blocks []*CharBlock
+	}{
+		{
+			from: '\u0000',
+			to:   '\u007f',
+			blocks: []*CharBlock{
+				cBlk(seq(0x00), seq(0x7f)),
+			},
+		},
+		{
+			from: '\u0080',
+			to:   '\u07ff',
+			blocks: []*CharBlock{
+				cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+			},
+		},
+		{
+			from: '\u0800',
+			to:   '\u0fff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xe0, 0xa0, 0x80),
+					seq(0xe0, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\u1000',
+			to:   '\ucfff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xe1, 0x80, 0x80),
+					seq(0xec, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\ud000',
+			to:   '\ud7ff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xed, 0x80, 0x80),
+					seq(0xed, 0x9f, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\ue000',
+			to:   '\uffff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xee, 0x80, 0x80),
+					seq(0xef, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\U00010000',
+			to:   '\U0003ffff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xf0, 0x90, 0x80, 0x80),
+					seq(0xf0, 0xbf, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\U00040000',
+			to:   '\U000fffff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xf1, 0x80, 0x80, 0x80),
+					seq(0xf3, 0xbf, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\U00100000',
+			to:   '\U0010ffff',
+			blocks: []*CharBlock{
+				cBlk(
+					seq(0xf4, 0x80, 0x80, 0x80),
+					seq(0xf4, 0x8f, 0xbf, 0xbf),
+				),
+			},
+		},
+		{
+			from: '\u0000',
+			to:   '\U0010ffff',
+			blocks: []*CharBlock{
+				cBlk(seq(0x00), seq(0x7f)),
+				cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
+				cBlk(
+					seq(0xe0, 0xa0, 0x80),
+					seq(0xe0, 0xbf, 0xbf),
+				),
+				cBlk(
+					seq(0xe1, 0x80, 0x80),
+					seq(0xec, 0xbf, 0xbf),
+				),
+				cBlk(
+					seq(0xed, 0x80, 0x80),
+					seq(0xed, 0x9f, 0xbf),
+				),
+				cBlk(
+					seq(0xee, 0x80, 0x80),
+					seq(0xef, 0xbf, 0xbf),
+				),
+				cBlk(
+					seq(0xf0, 0x90, 0x80, 0x80),
+					seq(0xf0, 0xbf, 0xbf, 0xbf),
+				),
+				cBlk(
+					seq(0xf1, 0x80, 0x80, 0x80),
+					seq(0xf3, 0xbf, 0xbf, 0xbf),
+				),
+				cBlk(
+					seq(0xf4, 0x80, 0x80, 0x80),
+					seq(0xf4, 0x8f, 0xbf, 0xbf),
+				),
+			},
+		},
+	}
+	for _, tt := range tests {
+		const errmsg = "unexpected character block: want: %+v, got: %+v"
+		tts := fmt.Sprintf("%v..%v", tt.from, tt.to)
+		t.Run(tts, func(t *testing.T) {
+			blks, err := GenCharBlocks(tt.from, tt.to)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if len(blks) != len(tt.blocks) {
+				t.Fatalf(errmsg, tt.blocks, blks)
+			}
+			for i, blk := range blks {
+				expected := tt.blocks[i]
+				// Compare whole sequences at once: this
+				// covers length and content of From and To
+				// independently, so To is never indexed by
+				// From's length.
+				neqFrom := string(blk.From) !=
+					string(expected.From)
+				neqTo := string(blk.To) !=
+					string(expected.To)
+				if neqFrom || neqTo {
+					t.Fatalf(
+						errmsg,
+						tt.blocks,
+						blks,
+					)
+				}
+			}
+		})
+	}
+}
+
+// TestGenCharBlocksIllFormed checks that GenCharBlocks rejects invalid ranges:
+// each case must yield a non-nil error together with a nil block slice.
+func TestGenCharBlocksIllFormed(t *testing.T) {
+	cases := []struct {
+		lo rune
+		hi rune
+	}{
+		{
+			lo: '\u0001', // lo > hi
+			hi: '\u0000',
+		},
+		{
+			lo: -1, // <U+0000
+			hi: '\u0000',
+		},
+		{
+			lo: '\u0000',
+			hi: -1, // <U+0000
+		},
+		{
+			lo: 0x110000, // >U+10FFFF
+			hi: '\u0000',
+		},
+		{
+			lo: '\u0000',
+			hi: 0x110000, // >U+10FFFF
+		},
+		{
+			lo: 0xd800, // U+D800 (surrogate code point)
+			hi: '\ue000',
+		},
+		{
+			lo: 0xdfff, // U+DFFF (surrogate code point)
+			hi: '\ue000',
+		},
+		{
+			lo: '\ucfff',
+			hi: 0xd800, // U+D800 (surrogate code point)
+		},
+		{
+			lo: '\ucfff',
+			hi: 0xdfff, // U+DFFF (surrogate code point)
+		},
+	}
+	for _, c := range cases {
+		t.Run(fmt.Sprintf("%v..%v", c.lo, c.hi), func(t *testing.T) {
+			got, err := GenCharBlocks(c.lo, c.hi)
+			switch {
+			case err == nil:
+				t.Fatal("expected error didn't occur")
+			case got != nil:
+				t.Fatal("character blocks must be nil")
+			}
+		})
+	}
+}
+
 
 
 func MainTest() {
+	// Run the tests via testing.MainStart, the entry point "go test" generates.
+	tests := []testing.InternalTest{
+		{Name: "TestGenCharBlocksWellFormed", F: TestGenCharBlocksWellFormed},
+		{Name: "TestGenCharBlocksIllFormed", F: TestGenCharBlocksIllFormed},
+	}
+	deps := testdeps.TestDeps{}
+	benchmarks := []testing.InternalBenchmark{}
+	fuzzTargets := []testing.InternalFuzzTarget{}
+	examples := []testing.InternalExample{}
+	m := testing.MainStart(deps, tests, benchmarks, fuzzTargets, examples)
+	os.Exit(m.Run())
 }
diff --git a/utf8/utf8.go b/utf8/utf8.go
deleted file mode 100644
index 4f52bd4..0000000
--- a/utf8/utf8.go
+++ /dev/null
@@ -1,112 +0,0 @@
-package utf8
-
-import (
-	"fmt"
-	"strings"
-)
-
-type CharBlock struct {
-	From []byte
-	To   []byte
-}
-
-func (b *CharBlock) String() string {
-	var s strings.Builder
-	fmt.Fprint(&s, "<")
-	fmt.Fprintf(&s, "%X", b.From[0])
-	for i := 1; i < len(b.From); i++ {
-		fmt.Fprintf(&s, " %X", b.From[i])
-	}
-	fmt.Fprint(&s, "..")
-	fmt.Fprintf(&s, "%X", b.To[0])
-	for i := 1; i < len(b.To); i++ {
-		fmt.Fprintf(&s, " %X", b.To[i])
-	}
-	fmt.Fprint(&s, ">")
-	return s.String()
-}
-
-func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
-	rs, err := splitCodePoint(from, to)
-	if err != nil {
-		return nil, err
-	}
-
-	blks := make([]*CharBlock, len(rs))
-	for i, r := range rs {
-		blks[i] = &CharBlock{
-			From: []byte(string(r.from)),
-			To:   []byte(string(r.to)),
-		}
-	}
-
-	return blks, nil
-}
-
-type cpRange struct {
-	from rune
-	to   rune
-}
-
-// splitCodePoint splits a code point range represented by <from..to> into some blocks. The code points that
-// the block contains will be a continuous byte sequence when encoded into UTF-8. For instance, this function
-// splits <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because <U+0000..U+07FF> is continuous on
-// the code point but non-continuous in the UTF-8 byte sequence (In UTF-8, <U+0000..U+007F> is encoded <00..7F>,
-// and <U+0080..U+07FF> is encoded <C2 80..DF BF>).
-//
-// The blocks don't contain surrogate code points <U+D800..U+DFFF> because byte sequences encoding them are
-// ill-formed in UTF-8. For instance, <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
-// However, when `from` or `to` itself is the surrogate code point, this function returns an error.
-func splitCodePoint(from, to rune) ([]*cpRange, error) {
-	if from > to {
-		return nil, fmt.Errorf("code point range must be from <= to: U+%X..U+%X", from, to)
-	}
-	if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
-		return nil, fmt.Errorf("code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X", from, to)
-	}
-	// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 D92
-	// > Because surrogate code points are not Unicode scalar values, any UTF-8 byte sequence that would otherwise
-	// > map to code points U+D800..U+DFFF is ill-formed.
-	if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
-		return nil, fmt.Errorf("surrogate code points U+D800..U+DFFF are not allowed in UTF-8: U+%X..U+%X", from, to)
-	}
-
-	in := &cpRange{
-		from: from,
-		to:   to,
-	}
-	var rs []*cpRange
-	for in.from <= in.to {
-		r := &cpRange{
-			from: in.from,
-			to:   in.to,
-		}
-		// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > 3.9 Unicode Encoding Forms > UTF-8 Table 3-7.  Well-Formed UTF-8 Byte Sequences
-		switch {
-		case in.from <= 0x007f && in.to > 0x007f:
-			r.to = 0x007f
-		case in.from <= 0x07ff && in.to > 0x07ff:
-			r.to = 0x07ff
-		case in.from <= 0x0fff && in.to > 0x0fff:
-			r.to = 0x0fff
-		case in.from <= 0xcfff && in.to > 0xcfff:
-			r.to = 0xcfff
-		case in.from <= 0xd7ff && in.to > 0xd7ff:
-			r.to = 0xd7ff
-		case in.from <= 0xffff && in.to > 0xffff:
-			r.to = 0xffff
-		case in.from <= 0x3ffff && in.to > 0x3ffff:
-			r.to = 0x3ffff
-		case in.from <= 0xfffff && in.to > 0xfffff:
-			r.to = 0xfffff
-		}
-		rs = append(rs, r)
-		in.from = r.to + 1
-
-		// Skip surrogate code points U+D800..U+DFFF.
-		if in.from >= 0xd800 && in.from <= 0xdfff {
-			in.from = 0xe000
-		}
-	}
-	return rs, nil
-}
diff --git a/utf8/utf8_test.go b/utf8/utf8_test.go
deleted file mode 100644
index 2dc8093..0000000
--- a/utf8/utf8_test.go
+++ /dev/null
@@ -1,181 +0,0 @@
-package utf8
-
-import (
-	"fmt"
-	"testing"
-)
-
-func TestGenCharBlocks_WellFormed(t *testing.T) {
-	cBlk := func(from []byte, to []byte) *CharBlock {
-		return &CharBlock{
-			From: from,
-			To:   to,
-		}
-	}
-
-	seq := func(b ...byte) []byte {
-		return b
-	}
-
-	tests := []struct {
-		from   rune
-		to     rune
-		blocks []*CharBlock
-	}{
-		{
-			from: '\u0000',
-			to:   '\u007f',
-			blocks: []*CharBlock{
-				cBlk(seq(0x00), seq(0x7f)),
-			},
-		},
-		{
-			from: '\u0080',
-			to:   '\u07ff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
-			},
-		},
-		{
-			from: '\u0800',
-			to:   '\u0fff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\u1000',
-			to:   '\ucfff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\ud000',
-			to:   '\ud7ff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
-			},
-		},
-		{
-			from: '\ue000',
-			to:   '\uffff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\U00010000',
-			to:   '\U0003ffff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\U00040000',
-			to:   '\U000fffff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\U00100000',
-			to:   '\U0010ffff',
-			blocks: []*CharBlock{
-				cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
-			},
-		},
-		{
-			from: '\u0000',
-			to:   '\U0010ffff',
-			blocks: []*CharBlock{
-				cBlk(seq(0x00), seq(0x7f)),
-				cBlk(seq(0xc2, 0x80), seq(0xdf, 0xbf)),
-				cBlk(seq(0xe0, 0xa0, 0x80), seq(0xe0, 0xbf, 0xbf)),
-				cBlk(seq(0xe1, 0x80, 0x80), seq(0xec, 0xbf, 0xbf)),
-				cBlk(seq(0xed, 0x80, 0x80), seq(0xed, 0x9f, 0xbf)),
-				cBlk(seq(0xee, 0x80, 0x80), seq(0xef, 0xbf, 0xbf)),
-				cBlk(seq(0xf0, 0x90, 0x80, 0x80), seq(0xf0, 0xbf, 0xbf, 0xbf)),
-				cBlk(seq(0xf1, 0x80, 0x80, 0x80), seq(0xf3, 0xbf, 0xbf, 0xbf)),
-				cBlk(seq(0xf4, 0x80, 0x80, 0x80), seq(0xf4, 0x8f, 0xbf, 0xbf)),
-			},
-		},
-	}
-	for _, tt := range tests {
-		t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
-			blks, err := GenCharBlocks(tt.from, tt.to)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if len(blks) != len(tt.blocks) {
-				t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
-			}
-			for i, blk := range blks {
-				if len(blk.From) != len(tt.blocks[i].From) || len(blk.To) != len(tt.blocks[i].To) {
-					t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
-				}
-				for j := 0; j < len(blk.From); j++ {
-					if blk.From[j] != tt.blocks[i].From[j] || blk.To[j] != tt.blocks[i].To[j] {
-						t.Fatalf("unexpected character block: want: %+v, got: %+v", tt.blocks, blks)
-					}
-				}
-			}
-		})
-	}
-}
-
-func TestGenCharBlocks_IllFormed(t *testing.T) {
-	tests := []struct {
-		from rune
-		to   rune
-	}{
-		{
-			// from > to
-			from: '\u0001',
-			to:   '\u0000',
-		},
-		{
-			from: -1, // <U+0000
-			to:   '\u0000',
-		},
-		{
-			from: '\u0000',
-			to:   -1, // <U+0000
-		},
-		{
-			from: 0x110000, // >U+10FFFF
-			to:   '\u0000',
-		},
-		{
-			from: '\u0000',
-			to:   0x110000, // >U+10FFFF
-		},
-		{
-			from: 0xd800, // U+D800 (surrogate code point)
-			to:   '\ue000',
-		},
-		{
-			from: 0xdfff, // U+DFFF (surrogate code point)
-			to:   '\ue000',
-		},
-		{
-			from: '\ucfff',
-			to:   0xd800, // U+D800 (surrogate code point)
-		},
-		{
-			from: '\ucfff',
-			to:   0xdfff, // U+DFFF (surrogate code point)
-		},
-	}
-	for _, tt := range tests {
-		t.Run(fmt.Sprintf("%v..%v", tt.from, tt.to), func(t *testing.T) {
-			blks, err := GenCharBlocks(tt.from, tt.to)
-			if err == nil {
-				t.Fatal("expected error didn't occur")
-			}
-			if blks != nil {
-				t.Fatal("character blocks must be nil")
-			}
-		})
-	}
-}