diff options
Diffstat (limited to 'ucd/unicode_data.go')
-rw-r--r-- | ucd/unicode_data.go | 75 |
1 files changed, 40 insertions, 35 deletions
diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go index 4513666..d45ee39 100644 --- a/ucd/unicode_data.go +++ b/ucd/unicode_data.go @@ -4,65 +4,70 @@ import "io" type UnicodeData struct { GeneralCategory map[string][]*CodePointRange + + propValAliases *PropertyValueAliases } // ParseUnicodeData parses the UnicodeData.txt. func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { - gc2CPRange := map[string][]*CodePointRange{} + unicodeData := &UnicodeData{ + GeneralCategory: map[string][]*CodePointRange{}, + propValAliases: propValAliases, + } + lastCPTo := rune(-1) p := newParser(r) for p.parse() { if len(p.fields) == 0 { continue } - cpRange, err := p.fields[0].codePointRange() + cp, err := p.fields[0].codePointRange() if err != nil { return nil, err } - if cpRange.From-lastCPTo > 1 { - defaultGCVal := propValAliases.GeneralCategoryDefaultValue - gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + if cp.From-lastCPTo > 1 { + unicodeData.addGC(propValAliases.GeneralCategoryDefaultValue, &CodePointRange{ From: lastCPTo + 1, - To: cpRange.From - 1, + To: cp.From - 1, }) } - lastCPTo = cpRange.To + lastCPTo = cp.To gc := p.fields[2].normalizedSymbol() - if gc == "" { - // https://www.unicode.org/reports/tr44/#Empty_Fields - // > The data file UnicodeData.txt defines many property values in each record. When a field in a data - // > line for a code point is empty, that indicates that the property takes the default value for that - // > code point. - if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To { - continue - } - gc = propValAliases.GeneralCategoryDefaultValue - } - rs, ok := gc2CPRange[gc] - if ok { - r := rs[len(rs)-1] - if cpRange.From-r.To == 1 { - r.To = cpRange.To - } else { - gc2CPRange[gc] = append(rs, cpRange) - } - } else { - gc2CPRange[gc] = []*CodePointRange{ - cpRange, - } - } + unicodeData.addGC(gc, cp) } if p.err != nil { return nil, p.err } if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To { - defaultGCVal := propValAliases.GeneralCategoryDefaultValue - gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + unicodeData.addGC(propValAliases.GeneralCategoryDefaultValue, &CodePointRange{ From: lastCPTo + 1, To: propValAliases.GeneralCategoryDefaultRange.To, }) } - return &UnicodeData{ - GeneralCategory: gc2CPRange, - }, nil + + return unicodeData, nil +} + +func (u *UnicodeData) addGC(gc string, cp *CodePointRange) { + // https://www.unicode.org/reports/tr44/#Empty_Fields + // > The data file UnicodeData.txt defines many property values in each record. When a field in a data line + // > for a code point is empty, that indicates that the property takes the default value for that code point. + if gc == "" { + if cp.From < u.propValAliases.GeneralCategoryDefaultRange.From || cp.To > u.propValAliases.GeneralCategoryDefaultRange.To { + return + } + gc = u.propValAliases.GeneralCategoryDefaultValue + } + + cps, ok := u.GeneralCategory[u.propValAliases.gcAbb(gc)] + if ok { + c := cps[len(cps)-1] + if cp.From-c.To == 1 { + c.To = cp.To + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = append(cps, cp) + } + } else { + u.GeneralCategory[u.propValAliases.gcAbb(gc)] = []*CodePointRange{cp} + } } |