aboutsummaryrefslogtreecommitdiff
path: root/ucd/unicode_data.go
diff options
context:
space:
mode:
author: Ryo Nihei <nihei.dev@gmail.com> 2021-11-23 01:24:02 +0900
committer: Ryo Nihei <nihei.dev@gmail.com> 2021-11-23 17:39:57 +0900
commit: 12bfeb83ae4a804d05c7f6eab5c6b2b972b7d8d2 (patch)
tree: 52f39ffbc0eca3f8302cf7908cb1f33a30425a96 /ucd/unicode_data.go
parent: Update CHANGELOG (diff)
downloadtre-12bfeb83ae4a804d05c7f6eab5c6b2b972b7d8d2.tar.gz
tre-12bfeb83ae4a804d05c7f6eab5c6b2b972b7d8d2.tar.xz
Refactor the UCD file parsers
Diffstat (limited to 'ucd/unicode_data.go')
-rw-r--r-- ucd/unicode_data.go 68
1 files changed, 68 insertions, 0 deletions
diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go
new file mode 100644
index 0000000..4513666
--- /dev/null
+++ b/ucd/unicode_data.go
@@ -0,0 +1,68 @@
+package ucd
+
+import (
+	"fmt"
+	"io"
+)
+
+// UnicodeData holds the property data that ParseUnicodeData extracts from UnicodeData.txt.
+type UnicodeData struct {
+ // GeneralCategory maps a General_Category value (as keyed by ParseUnicodeData) to the code point
+ // ranges that have that value.
+ GeneralCategory map[string][]*CodePointRange
+}
+
+// ParseUnicodeData parses UnicodeData.txt read from r and groups its code points by
+// General_Category value.
+//
+// Records are expected in ascending code point order. Code points that fall between two
+// consecutive records, and code points after the last record up to
+// propValAliases.GeneralCategoryDefaultRange.To, are assigned
+// propValAliases.GeneralCategoryDefaultValue. Adjacent ranges that share a General_Category value
+// are merged into a single range.
+//
+// It returns an error when a code point field cannot be parsed, when a record has fewer than the
+// three fields needed to read the General_Category, or when the underlying parser reports an error.
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+	gc2CPRange := map[string][]*CodePointRange{}
+	// lastCPTo is the last code point covered by the previous record; -1 means that no record has
+	// been seen yet, so a first record starting above U+0000 also produces a leading gap.
+	lastCPTo := rune(-1)
+	p := newParser(r)
+	for p.parse() {
+		if len(p.fields) == 0 {
+			continue
+		}
+		// The General_Category is read from p.fields[2], so reject short records here instead of
+		// panicking with an index out of range below.
+		if len(p.fields) < 3 {
+			return nil, fmt.Errorf("UnicodeData.txt: record has %d field(s), want at least 3", len(p.fields))
+		}
+		cpRange, err := p.fields[0].codePointRange()
+		if err != nil {
+			return nil, err
+		}
+		// Code points skipped between the previous record and this one take the default value.
+		//
+		// NOTE(review): UnicodeData.txt encodes some large ranges as a pair of records named
+		// "<..., First>" and "<..., Last>". Unless the parser already merges such pairs, the code
+		// points strictly between them are classified with the default value here rather than with
+		// the pair's General_Category. Confirm against the parser.
+		if cpRange.From-lastCPTo > 1 {
+			defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+			gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+				From: lastCPTo + 1,
+				To:   cpRange.From - 1,
+			})
+		}
+		lastCPTo = cpRange.To
+		gc := p.fields[2].normalizedSymbol()
+		if gc == "" {
+			// https://www.unicode.org/reports/tr44/#Empty_Fields
+			// > The data file UnicodeData.txt defines many property values in each record. When a field in a data
+			// > line for a code point is empty, that indicates that the property takes the default value for that
+			// > code point.
+			if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To {
+				continue
+			}
+			gc = propValAliases.GeneralCategoryDefaultValue
+		}
+		// Extend the most recent range of this category when the new range is directly adjacent to
+		// it; otherwise start a new range.
+		rs := gc2CPRange[gc]
+		if n := len(rs); n > 0 && cpRange.From-rs[n-1].To == 1 {
+			last := rs[n-1]
+			last.To = cpRange.To
+		} else {
+			gc2CPRange[gc] = append(rs, cpRange)
+		}
+	}
+	if p.err != nil {
+		return nil, p.err
+	}
+	// Code points after the last record, up to the end of the default range, take the default value.
+	if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
+		defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+		gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+			From: lastCPTo + 1,
+			To:   propValAliases.GeneralCategoryDefaultRange.To,
+		})
+	}
+	return &UnicodeData{
+		GeneralCategory: gc2CPRange,
+	}, nil
+}