about | summary | refs | log | tree | commit | diff
path: root/ucd/unicode_data.go
diff options
context:
space:
mode:
Diffstat (limited to 'ucd/unicode_data.go')
-rw-r--r-- ucd/unicode_data.go 75
1 files changed, 40 insertions, 35 deletions
diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go
index 4513666..d45ee39 100644
--- a/ucd/unicode_data.go
+++ b/ucd/unicode_data.go
@@ -4,65 +4,70 @@ import "io"
type UnicodeData struct {
GeneralCategory map[string][]*CodePointRange
+
+ propValAliases *PropertyValueAliases
}
// ParseUnicodeData parses the UnicodeData.txt.
func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
- gc2CPRange := map[string][]*CodePointRange{}
+ unicodeData := &UnicodeData{
+ GeneralCategory: map[string][]*CodePointRange{},
+ propValAliases: propValAliases,
+ }
+
lastCPTo := rune(-1)
p := newParser(r)
for p.parse() {
if len(p.fields) == 0 {
continue
}
- cpRange, err := p.fields[0].codePointRange()
+ cp, err := p.fields[0].codePointRange()
if err != nil {
return nil, err
}
- if cpRange.From-lastCPTo > 1 {
- defaultGCVal := propValAliases.GeneralCategoryDefaultValue
- gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ if cp.From-lastCPTo > 1 {
+ unicodeData.addGC(propValAliases.GeneralCategoryDefaultValue, &CodePointRange{
From: lastCPTo + 1,
- To: cpRange.From - 1,
+ To: cp.From - 1,
})
}
- lastCPTo = cpRange.To
+ lastCPTo = cp.To
gc := p.fields[2].normalizedSymbol()
- if gc == "" {
- // https://www.unicode.org/reports/tr44/#Empty_Fields
- // > The data file UnicodeData.txt defines many property values in each record. When a field in a data
- // > line for a code point is empty, that indicates that the property takes the default value for that
- // > code point.
- if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To {
- continue
- }
- gc = propValAliases.GeneralCategoryDefaultValue
- }
- rs, ok := gc2CPRange[gc]
- if ok {
- r := rs[len(rs)-1]
- if cpRange.From-r.To == 1 {
- r.To = cpRange.To
- } else {
- gc2CPRange[gc] = append(rs, cpRange)
- }
- } else {
- gc2CPRange[gc] = []*CodePointRange{
- cpRange,
- }
- }
+ unicodeData.addGC(gc, cp)
}
if p.err != nil {
return nil, p.err
}
if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
- defaultGCVal := propValAliases.GeneralCategoryDefaultValue
- gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ unicodeData.addGC(propValAliases.GeneralCategoryDefaultValue, &CodePointRange{
From: lastCPTo + 1,
To: propValAliases.GeneralCategoryDefaultRange.To,
})
}
- return &UnicodeData{
- GeneralCategory: gc2CPRange,
- }, nil
+
+ return unicodeData, nil
+}
+
+func (u *UnicodeData) addGC(gc string, cp *CodePointRange) {
+ // https://www.unicode.org/reports/tr44/#Empty_Fields
+ // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line
+ // > for a code point is empty, that indicates that the property takes the default value for that code point.
+ if gc == "" {
+ if cp.From < u.propValAliases.GeneralCategoryDefaultRange.From || cp.To > u.propValAliases.GeneralCategoryDefaultRange.To {
+ return
+ }
+ gc = u.propValAliases.GeneralCategoryDefaultValue
+ }
+
+ cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)]
+ if ok {
+ c := cps[len(cps)-1]
+ if cp.From-c.To == 1 {
+ c.To = cp.To
+ } else {
+ u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp)
+ }
+ } else {
+ u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp}
+ }
}