diff options
author | EuAndreh <eu@euandre.org> | 2024-12-10 12:29:03 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2024-12-10 12:29:03 -0300 |
commit | 8359c047aaebe274a2d811d61922b571ca7d10df (patch) | |
tree | 070e0ed93d27a842776ada805eeb4270e7e3c806 /src/urubu/ucd/unicode_data.go | |
parent | Start building test files (diff) | |
download | cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.gz cotia-8359c047aaebe274a2d811d61922b571ca7d10df.tar.xz |
Namespace packages with "urubu/"
Diffstat (limited to 'src/urubu/ucd/unicode_data.go')
-rw-r--r-- | src/urubu/ucd/unicode_data.go | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/src/urubu/ucd/unicode_data.go b/src/urubu/ucd/unicode_data.go new file mode 100644 index 0000000..e2a8e87 --- /dev/null +++ b/src/urubu/ucd/unicode_data.go @@ -0,0 +1,56 @@ +package ucd + +import "io" + +type UnicodeData struct { + GeneralCategory map[string][]*CodePointRange + + propValAliases *PropertyValueAliases +} + +// ParseUnicodeData parses the UnicodeData.txt. +func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { + unicodeData := &UnicodeData{ + GeneralCategory: map[string][]*CodePointRange{}, + propValAliases: propValAliases, + } + + p := newParser(r) + for p.parse() { + if len(p.fields) == 0 { + continue + } + cp, err := p.fields[0].codePointRange() + if err != nil { + return nil, err + } + gc := p.fields[2].normalizedSymbol() + unicodeData.addGC(gc, cp) + } + if p.err != nil { + return nil, p.err + } + + return unicodeData, nil +} + +func (u *UnicodeData) addGC(gc string, cp *CodePointRange) { + // https://www.unicode.org/reports/tr44/#Empty_Fields + // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line + // > for a code point is empty, that indicates that the property takes the default value for that code point. + if gc == "" { + return + } + + cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)] + if ok { + c := cps[len(cps)-1] + if cp.From-c.To == 1 { + c.To = cp.To + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp) + } + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp} + } +} |