1 files changed, 56 insertions, 0 deletions
diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go
new file mode 100644
index 0000000..e2a8e87
--- /dev/null
+++ b/ucd/unicode_data.go
@@ -0,0 +1,56 @@
+package ucd
+
+import "io"
+
+type UnicodeData struct {
+	GeneralCategory map[string][]*CodePointRange
+
+	propValAliases *PropertyValueAliases
+}
+
+// ParseUnicodeData parses the UnicodeData.txt.
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+	unicodeData := &UnicodeData{
+		GeneralCategory: map[string][]*CodePointRange{},
+		propValAliases:  propValAliases,
+	}
+
+	p := newParser(r)
+	for p.parse() {
+		if len(p.fields) == 0 {
+			continue
+		}
+		cp, err := p.fields[0].codePointRange()
+		if err != nil {
+			return nil, err
+		}
+		gc := p.fields[2].normalizedSymbol()
+		unicodeData.addGC(gc, cp)
+	}
+	if p.err != nil {
+		return nil, p.err
+	}
+
+	return unicodeData, nil
+}
+
+func (u *UnicodeData) addGC(gc string, cp *CodePointRange) {
+	// https://www.unicode.org/reports/tr44/#Empty_Fields
+	// > The data file UnicodeData.txt defines many property values in each record. When a field in a data line
+	// > for a code point is empty, that indicates that the property takes the default value for that code point.
+	if gc == "" {
+		return
+	}
+
+	cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)]
+	if ok {
+		c := cps[len(cps)-1]
+		if cp.From-c.To == 1 {
+			c.To = cp.To
+		} else {
+			u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp)
+		}
+	} else {
+		u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp}
+	}
+}