diff options
-rw-r--r-- | ucd/parser.go | 227 | ||||
-rw-r--r-- | ucd/property_value_aliases.go | 62 | ||||
-rw-r--r-- | ucd/unicode_data.go | 68 |
3 files changed, 202 insertions, 155 deletions
diff --git a/ucd/parser.go b/ucd/parser.go index 9476a2f..4959a53 100644 --- a/ucd/parser.go +++ b/ucd/parser.go @@ -14,123 +14,73 @@ type CodePointRange struct { To rune } -type UnicodeData struct { - GeneralCategory map[string][]*CodePointRange +var codePointRangeNil = &CodePointRange{ + From: 0, + To: 0, } -func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { - gc2CPRange := map[string][]*CodePointRange{} - lastCPTo := rune(-1) - p := newParser(r) - for p.parse() { - if len(p.fields) == 0 { - continue - } - cpFrom, cpTo, err := parseCodePointRange(p.fields[0]) +type field string + +func (f field) codePointRange() (*CodePointRange, error) { + var from, to rune + var err error + cp := reCodePointRange.FindStringSubmatch(string(f)) + from, err = decodeHexToRune(cp[1]) + if err != nil { + return codePointRangeNil, err + } + if cp[2] != "" { + to, err = decodeHexToRune(cp[2]) if err != nil { - return nil, err - } - if cpFrom-lastCPTo > 1 { - defaultGCVal := propValAliases.GeneralCategoryDefaultValue - gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ - From: lastCPTo + 1, - To: cpFrom - 1, - }) - } - lastCPTo = cpTo - gc := NormalizeSymbolicValue(p.fields[2]) - rs, ok := gc2CPRange[gc] - if ok { - r := rs[len(rs)-1] - if cpFrom-r.To == 1 { - r.To = cpTo - } else { - gc2CPRange[gc] = append(rs, &CodePointRange{ - From: cpFrom, - To: cpTo, - }) - } - } else { - gc2CPRange[gc] = []*CodePointRange{ - { - From: cpFrom, - To: cpTo, - }, - } + return codePointRangeNil, err } + } else { + to = from + } + return &CodePointRange{ + From: from, + To: to, + }, nil +} + +func decodeHexToRune(hexCodePoint string) (rune, error) { + h := hexCodePoint + if len(h)%2 != 0 { + h = "0" + h } - if p.err != nil { - return nil, p.err + b, err := hex.DecodeString(h) + if err != nil { + return 0, err } - if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To { - defaultGCVal := propValAliases.GeneralCategoryDefaultValue - gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ - From: lastCPTo + 1, - To: propValAliases.GeneralCategoryDefaultRange.To, - }) + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) } - return &UnicodeData{ - GeneralCategory: gc2CPRange, - }, nil + n := binary.BigEndian.Uint32(b) + return rune(n), nil } -type PropertyValueAliases struct { - GeneralCategory map[string]string - GeneralCategoryDefaultRange *CodePointRange - GeneralCategoryDefaultValue string +func (f field) symbol() string { + return string(f) } -func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) { - catName2Abbs := map[string]string{} - var defaultGCCPRange *CodePointRange - var defaultGCVal string - p := newParser(r) - for p.parse() { - if len(p.fields) > 0 && p.fields[0] == "gc" { - catNameShort := NormalizeSymbolicValue(p.fields[1]) - catNameLong := NormalizeSymbolicValue(p.fields[2]) - catName2Abbs[catNameShort] = catNameShort - catName2Abbs[catNameLong] = catNameShort - for _, f := range p.fields[3:] { - catNameOther := NormalizeSymbolicValue(f) - catName2Abbs[catNameOther] = catNameShort - } - } - if len(p.defaultFields) > 0 && p.defaultFields[1] == "General_Category" { - cpFrom, cpTo, err := parseCodePointRange(p.defaultFields[0]) - if err != nil { - return nil, err - } - defaultGCCPRange = &CodePointRange{ - From: cpFrom, - To: cpTo, - } - defaultGCVal = NormalizeSymbolicValue(p.defaultFields[2]) - } - } - if p.err != nil { - return nil, p.err - } - return &PropertyValueAliases{ - GeneralCategory: catName2Abbs, - GeneralCategoryDefaultRange: defaultGCCPRange, - GeneralCategoryDefaultValue: defaultGCVal, - }, nil +func (f field) normalizedSymbol() string { + return NormalizeSymbolicValue(string(f)) } var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") -func NormalizeSymbolicValue(original string) string { - strings.Trim("", "") - v := strings.ToLower(symValReplacer.Replace(original)) +// NormalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3. +// +// https://www.unicode.org/reports/tr44/#UAX44-LM3 +func NormalizeSymbolicValue(s string) string { + v := strings.ToLower(symValReplacer.Replace(s)) if strings.HasPrefix(v, "is") && v != "is" { - return v[3:] + return v[2:] } return v } -type Fields []string - var ( reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) @@ -151,23 +101,25 @@ var ( // https://www.unicode.org/reports/tr44/#Format_Conventions type parser struct { scanner *bufio.Scanner - fields Fields - defaultFields Fields + fields []field + defaultFields []field err error + + fieldBuf []field + defaultFieldBuf []field } func newParser(r io.Reader) *parser { return &parser{ - scanner: bufio.NewScanner(r), + scanner: bufio.NewScanner(r), + fieldBuf: make([]field, 50), + defaultFieldBuf: make([]field, 50), } } func (p *parser) parse() bool { for p.scanner.Scan() { - p.fields, p.defaultFields, p.err = parseRecord(p.scanner.Text()) - if p.err != nil { - return false - } + p.parseRecord(p.scanner.Text()) if p.fields != nil || p.defaultFields != nil { return true } @@ -176,63 +128,28 @@ func (p *parser) parse() bool { return false } -func parseRecord(src string) (Fields, Fields, error) { +func (p *parser) parseRecord(src string) { ms := reLine.FindStringSubmatch(src) - fields := ms[1] - comment := ms[2] - var fs Fields - if fields != "" { - fs = parseFields(fields) + mFields := ms[1] + mComment := ms[2] + if mFields != "" { + p.fields = parseFields(p.fieldBuf, mFields) + } else { + p.fields = nil } - var defaultFs Fields - if strings.HasPrefix(comment, specialCommentPrefix) { - fields := strings.Replace(comment, specialCommentPrefix, "", -1) - fs := parseFields(fields) - defaultFs = fs + if strings.HasPrefix(mComment, specialCommentPrefix) { + p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1)) + } else { + p.defaultFields = nil } - return fs, defaultFs, nil } -func parseFields(src string) Fields { - var fields Fields +func parseFields(buf []field, src string) []field { + n := 0 for _, f := range strings.Split(src, ";") { - fields = append(fields, strings.TrimSpace(f)) + buf[n] = field(strings.TrimSpace(f)) + n++ } - return fields -} -func parseCodePointRange(src string) (rune, rune, error) { - var from, to rune - var err error - cp := reCodePointRange.FindStringSubmatch(src) - from, err = decodeHexToRune(cp[1]) - if err != nil { - return 0, 0, err - } - if cp[2] != "" { - to, err = decodeHexToRune(cp[2]) - if err != nil { - return 0, 0, err - } - } else { - to = from - } - return from, to, nil -} - -func decodeHexToRune(hexCodePoint string) (rune, error) { - h := hexCodePoint - if len(h)%2 != 0 { - h = "0" + h - } - b, err := hex.DecodeString(h) - if err != nil { - return 0, err - } - l := len(b) - for i := 0; i < 4-l; i++ { - b = append([]byte{0}, b...) - } - n := binary.BigEndian.Uint32(b) - return rune(n), nil + return buf[:n] } diff --git a/ucd/property_value_aliases.go b/ucd/property_value_aliases.go new file mode 100644 index 0000000..c438c54 --- /dev/null +++ b/ucd/property_value_aliases.go @@ -0,0 +1,62 @@ +package ucd + +import "io" + +type PropertyValueAliases struct { + GeneralCategory map[string]string + GeneralCategoryDefaultRange *CodePointRange + GeneralCategoryDefaultValue string +} + +// ParsePropertyValueAliases parses the PropertyValueAliases.txt. +func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) { + catName2Abbs := map[string]string{} + var defaultGCCPRange *CodePointRange + var defaultGCVal string + p := newParser(r) + for p.parse() { + // https://www.unicode.org/reports/tr44/#Property_Value_Aliases + // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property, + // > the second field specifies an abbreviated symbolic name for a value of that property, and the third + // > field specifies the long symbolic name for that value of that property. These are the preferred + // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent + // > fields. + if len(p.fields) > 0 && p.fields[0].symbol() == "gc" { + catNameShort := p.fields[1].normalizedSymbol() + catNameLong := p.fields[2].normalizedSymbol() + catName2Abbs[catNameShort] = catNameShort + catName2Abbs[catNameLong] = catNameShort + for _, f := range p.fields[3:] { + catNameOther := f.normalizedSymbol() + catName2Abbs[catNameOther] = catNameShort + } + } + + // https://www.unicode.org/reports/tr44/#Missing_Conventions + // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt. + // > ... + // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below: + // > 1. code_point_range; default_prop_val + // > 2. code_point_range; property_name; default_prop_val + // > ... + // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which + // > contain values associated with many properties. For example: + // > # @missing: 0000..10FFFF; NFD_QC; Yes + if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" { + var err error + defaultGCCPRange, err = p.defaultFields[0].codePointRange() + if err != nil { + return nil, err + } + defaultGCVal = p.defaultFields[2].normalizedSymbol() + } + } + if p.err != nil { + return nil, p.err + } + return &PropertyValueAliases{ + GeneralCategory: catName2Abbs, + GeneralCategoryDefaultRange: defaultGCCPRange, + GeneralCategoryDefaultValue: defaultGCVal, + }, nil +} diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go new file mode 100644 index 0000000..4513666 --- /dev/null +++ b/ucd/unicode_data.go @@ -0,0 +1,68 @@ +package ucd + +import "io" + +type UnicodeData struct { + GeneralCategory map[string][]*CodePointRange +} + +// ParseUnicodeData parses the UnicodeData.txt. +func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { + gc2CPRange := map[string][]*CodePointRange{} + lastCPTo := rune(-1) + p := newParser(r) + for p.parse() { + if len(p.fields) == 0 { + continue + } + cpRange, err := p.fields[0].codePointRange() + if err != nil { + return nil, err + } + if cpRange.From-lastCPTo > 1 { + defaultGCVal := propValAliases.GeneralCategoryDefaultValue + gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + From: lastCPTo + 1, + To: cpRange.From - 1, + }) + } + lastCPTo = cpRange.To + gc := p.fields[2].normalizedSymbol() + if gc == "" { + // https://www.unicode.org/reports/tr44/#Empty_Fields + // > The data file UnicodeData.txt defines many property values in each record. When a field in a data + // > line for a code point is empty, that indicates that the property takes the default value for that + // > code point. + if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To { + continue + } + gc = propValAliases.GeneralCategoryDefaultValue + } + rs, ok := gc2CPRange[gc] + if ok { + r := rs[len(rs)-1] + if cpRange.From-r.To == 1 { + r.To = cpRange.To + } else { + gc2CPRange[gc] = append(rs, cpRange) + } + } else { + gc2CPRange[gc] = []*CodePointRange{ + cpRange, + } + } + } + if p.err != nil { + return nil, p.err + } + if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To { + defaultGCVal := propValAliases.GeneralCategoryDefaultValue + gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + From: lastCPTo + 1, + To: propValAliases.GeneralCategoryDefaultRange.To, + }) + } + return &UnicodeData{ + GeneralCategory: gc2CPRange, + }, nil +} |