aboutsummaryrefslogtreecommitdiff
path: root/src/urubu/ucd/unicode_data.go
diff options
context:
space:
mode:
author EuAndreh <eu@euandre.org> 2024-12-10 12:29:03 -0300
committer EuAndreh <eu@euandre.org> 2024-12-10 12:29:03 -0300
commit 8359c047aaebe274a2d811d61922b571ca7d10df (patch)
tree 070e0ed93d27a842776ada805eeb4270e7e3c806 /src/urubu/ucd/unicode_data.go
parent Start building test files (diff)
download cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.gz
cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.xz
Namespace packages with "urubu/"
Diffstat (limited to 'src/urubu/ucd/unicode_data.go')
-rw-r--r-- src/urubu/ucd/unicode_data.go 56
1 files changed, 56 insertions, 0 deletions
diff --git a/src/urubu/ucd/unicode_data.go b/src/urubu/ucd/unicode_data.go
new file mode 100644
index 0000000..e2a8e87
--- /dev/null
+++ b/src/urubu/ucd/unicode_data.go
@@ -0,0 +1,56 @@
+package ucd
+
+import (
+	"fmt"
+	"io"
+)
+
+// UnicodeData holds the information collected from UnicodeData.txt by
+// ParseUnicodeData.
+type UnicodeData struct {
+	// GeneralCategory maps a General_Category key (as returned by the
+	// property value aliases' gcAbb) to the code point ranges recorded
+	// under that key, in the order they were added.
+	GeneralCategory map[string][]*CodePointRange
+
+	// propValAliases is consulted by addGC to turn a record's
+	// General_Category symbol into the key used in GeneralCategory.
+	propValAliases *PropertyValueAliases
+}
+
+// ParseUnicodeData parses the UnicodeData.txt.
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+ unicodeData := &UnicodeData{
+ GeneralCategory: map[string][]*CodePointRange{},
+ propValAliases: propValAliases,
+ }
+
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) == 0 {
+ continue
+ }
+ cp, err := p.fields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ gc := p.fields[2].normalizedSymbol()
+ unicodeData.addGC(gc, cp)
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+
+ return unicodeData, nil
+}
+
+// addGC files the range cp under the General_Category symbol gc.
+//
+// The symbol is first translated into a map key via propValAliases.gcAbb.
+// When cp starts exactly one code point after the end of the last range
+// already stored under that key, that last range is extended in place to
+// cp.To; otherwise cp is stored as a new range.
+func (u *UnicodeData) addGC(gc string, cp *CodePointRange) {
+	// https://www.unicode.org/reports/tr44/#Empty_Fields
+	// > The data file UnicodeData.txt defines many property values in each
+	// > record. When a field in a data line for a code point is empty, that
+	// > indicates that the property takes the default value for that code
+	// > point.
+	//
+	// Records with an empty field are therefore not recorded here.
+	if gc == "" {
+		return
+	}
+
+	key := u.propValAliases.gcAbb(gc)
+	ranges, seen := u.GeneralCategory[key]
+	if !seen {
+		// First range for this category.
+		u.GeneralCategory[key] = []*CodePointRange{cp}
+		return
+	}
+
+	// Coalesce with the most recently stored range when the two are adjacent.
+	last := ranges[len(ranges)-1]
+	if cp.From-last.To == 1 {
+		last.To = cp.To
+		return
+	}
+
+	u.GeneralCategory[key] = append(ranges, cp)
+}