aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ucd/parser.go227
-rw-r--r--ucd/property_value_aliases.go62
-rw-r--r--ucd/unicode_data.go68
3 files changed, 202 insertions, 155 deletions
diff --git a/ucd/parser.go b/ucd/parser.go
index 9476a2f..4959a53 100644
--- a/ucd/parser.go
+++ b/ucd/parser.go
@@ -14,123 +14,73 @@ type CodePointRange struct {
To rune
}
-type UnicodeData struct {
- GeneralCategory map[string][]*CodePointRange
+var codePointRangeNil = &CodePointRange{
+ From: 0,
+ To: 0,
}
-func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
- gc2CPRange := map[string][]*CodePointRange{}
- lastCPTo := rune(-1)
- p := newParser(r)
- for p.parse() {
- if len(p.fields) == 0 {
- continue
- }
- cpFrom, cpTo, err := parseCodePointRange(p.fields[0])
+type field string
+
+func (f field) codePointRange() (*CodePointRange, error) {
+ var from, to rune
+ var err error
+ cp := reCodePointRange.FindStringSubmatch(string(f))
+ from, err = decodeHexToRune(cp[1])
+ if err != nil {
+ return codePointRangeNil, err
+ }
+ if cp[2] != "" {
+ to, err = decodeHexToRune(cp[2])
if err != nil {
- return nil, err
- }
- if cpFrom-lastCPTo > 1 {
- defaultGCVal := propValAliases.GeneralCategoryDefaultValue
- gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
- From: lastCPTo + 1,
- To: cpFrom - 1,
- })
- }
- lastCPTo = cpTo
- gc := NormalizeSymbolicValue(p.fields[2])
- rs, ok := gc2CPRange[gc]
- if ok {
- r := rs[len(rs)-1]
- if cpFrom-r.To == 1 {
- r.To = cpTo
- } else {
- gc2CPRange[gc] = append(rs, &CodePointRange{
- From: cpFrom,
- To: cpTo,
- })
- }
- } else {
- gc2CPRange[gc] = []*CodePointRange{
- {
- From: cpFrom,
- To: cpTo,
- },
- }
+ return codePointRangeNil, err
}
+ } else {
+ to = from
+ }
+ return &CodePointRange{
+ From: from,
+ To: to,
+ }, nil
+}
+
// decodeHexToRune converts a hexadecimal string such as "41" or "10FFFF" into
// the rune with that value.
//
// An odd number of digits is accepted (a leading "0" is implied). Any error
// from hex.DecodeString is returned unchanged together with rune 0. Input
// longer than eight digits is not rejected: only the first four decoded bytes
// contribute to the result.
func decodeHexToRune(hexCodePoint string) (rune, error) {
	digits := hexCodePoint
	if len(digits)%2 == 1 {
		// hex.DecodeString requires an even number of digits.
		digits = "0" + digits
	}
	raw, err := hex.DecodeString(digits)
	if err != nil {
		return 0, err
	}
	// Left-pad with zero bytes so that a big-endian uint32 can be read.
	if pad := 4 - len(raw); pad > 0 {
		widened := make([]byte, 4)
		copy(widened[pad:], raw)
		raw = widened
	}
	return rune(binary.BigEndian.Uint32(raw)), nil
}
-type PropertyValueAliases struct {
- GeneralCategory map[string]string
- GeneralCategoryDefaultRange *CodePointRange
- GeneralCategoryDefaultValue string
+func (f field) symbol() string {
+ return string(f)
}
-func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
- catName2Abbs := map[string]string{}
- var defaultGCCPRange *CodePointRange
- var defaultGCVal string
- p := newParser(r)
- for p.parse() {
- if len(p.fields) > 0 && p.fields[0] == "gc" {
- catNameShort := NormalizeSymbolicValue(p.fields[1])
- catNameLong := NormalizeSymbolicValue(p.fields[2])
- catName2Abbs[catNameShort] = catNameShort
- catName2Abbs[catNameLong] = catNameShort
- for _, f := range p.fields[3:] {
- catNameOther := NormalizeSymbolicValue(f)
- catName2Abbs[catNameOther] = catNameShort
- }
- }
- if len(p.defaultFields) > 0 && p.defaultFields[1] == "General_Category" {
- cpFrom, cpTo, err := parseCodePointRange(p.defaultFields[0])
- if err != nil {
- return nil, err
- }
- defaultGCCPRange = &CodePointRange{
- From: cpFrom,
- To: cpTo,
- }
- defaultGCVal = NormalizeSymbolicValue(p.defaultFields[2])
- }
- }
- if p.err != nil {
- return nil, p.err
- }
- return &PropertyValueAliases{
- GeneralCategory: catName2Abbs,
- GeneralCategoryDefaultRange: defaultGCCPRange,
- GeneralCategoryDefaultValue: defaultGCVal,
- }, nil
+func (f field) normalizedSymbol() string {
+ return NormalizeSymbolicValue(string(f))
}
// symValReplacer deletes underscores, hyphens and ASCII spaces.
var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")

// NormalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3.
//
// Underscores, hyphens and spaces are removed, the result is lower-cased, and
// a leading "is" is dropped unless that would leave nothing (so "is" itself is
// kept).
//
// https://www.unicode.org/reports/tr44/#UAX44-LM3
func NormalizeSymbolicValue(s string) string {
	stripped := symValReplacer.Replace(s)
	normalized := strings.ToLower(stripped)
	// TrimPrefix returns its input unchanged when the prefix is absent, so an
	// empty result can only mean the value was "is" or already empty.
	if rest := strings.TrimPrefix(normalized, "is"); rest != "" {
		return rest
	}
	return normalized
}
-type Fields []string
-
var (
reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`)
@@ -151,23 +101,25 @@ var (
// https://www.unicode.org/reports/tr44/#Format_Conventions
type parser struct {
scanner *bufio.Scanner
- fields Fields
- defaultFields Fields
+ fields []field
+ defaultFields []field
err error
+
+ fieldBuf []field
+ defaultFieldBuf []field
}
// newParser returns a parser that reads lines from r. Both field buffers are
// allocated up front with room for 50 fields each.
func newParser(r io.Reader) *parser {
	const bufLen = 50

	p := new(parser)
	p.scanner = bufio.NewScanner(r)
	p.fieldBuf = make([]field, bufLen)
	p.defaultFieldBuf = make([]field, bufLen)
	return p
}
func (p *parser) parse() bool {
for p.scanner.Scan() {
- p.fields, p.defaultFields, p.err = parseRecord(p.scanner.Text())
- if p.err != nil {
- return false
- }
+ p.parseRecord(p.scanner.Text())
if p.fields != nil || p.defaultFields != nil {
return true
}
@@ -176,63 +128,28 @@ func (p *parser) parse() bool {
return false
}
-func parseRecord(src string) (Fields, Fields, error) {
+func (p *parser) parseRecord(src string) {
ms := reLine.FindStringSubmatch(src)
- fields := ms[1]
- comment := ms[2]
- var fs Fields
- if fields != "" {
- fs = parseFields(fields)
+ mFields := ms[1]
+ mComment := ms[2]
+ if mFields != "" {
+ p.fields = parseFields(p.fieldBuf, mFields)
+ } else {
+ p.fields = nil
}
- var defaultFs Fields
- if strings.HasPrefix(comment, specialCommentPrefix) {
- fields := strings.Replace(comment, specialCommentPrefix, "", -1)
- fs := parseFields(fields)
- defaultFs = fs
+ if strings.HasPrefix(mComment, specialCommentPrefix) {
+ p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1))
+ } else {
+ p.defaultFields = nil
}
- return fs, defaultFs, nil
}
-func parseFields(src string) Fields {
- var fields Fields
+func parseFields(buf []field, src string) []field {
+ n := 0
for _, f := range strings.Split(src, ";") {
- fields = append(fields, strings.TrimSpace(f))
+ buf[n] = field(strings.TrimSpace(f))
+ n++
}
- return fields
-}
-func parseCodePointRange(src string) (rune, rune, error) {
- var from, to rune
- var err error
- cp := reCodePointRange.FindStringSubmatch(src)
- from, err = decodeHexToRune(cp[1])
- if err != nil {
- return 0, 0, err
- }
- if cp[2] != "" {
- to, err = decodeHexToRune(cp[2])
- if err != nil {
- return 0, 0, err
- }
- } else {
- to = from
- }
- return from, to, nil
-}
-
-func decodeHexToRune(hexCodePoint string) (rune, error) {
- h := hexCodePoint
- if len(h)%2 != 0 {
- h = "0" + h
- }
- b, err := hex.DecodeString(h)
- if err != nil {
- return 0, err
- }
- l := len(b)
- for i := 0; i < 4-l; i++ {
- b = append([]byte{0}, b...)
- }
- n := binary.BigEndian.Uint32(b)
- return rune(n), nil
+ return buf[:n]
}
diff --git a/ucd/property_value_aliases.go b/ucd/property_value_aliases.go
new file mode 100644
index 0000000..c438c54
--- /dev/null
+++ b/ucd/property_value_aliases.go
@@ -0,0 +1,62 @@
+package ucd
+
+import "io"
+
+type PropertyValueAliases struct {
+ GeneralCategory map[string]string
+ GeneralCategoryDefaultRange *CodePointRange
+ GeneralCategoryDefaultValue string
+}
+
+// ParsePropertyValueAliases parses the PropertyValueAliases.txt.
+func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
+ catName2Abbs := map[string]string{}
+ var defaultGCCPRange *CodePointRange
+ var defaultGCVal string
+ p := newParser(r)
+ for p.parse() {
+ // https://www.unicode.org/reports/tr44/#Property_Value_Aliases
+ // > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property,
+ // > the second field specifies an abbreviated symbolic name for a value of that property, and the third
+ // > field specifies the long symbolic name for that value of that property. These are the preferred
+ // > aliases. Additional aliases for some property values may be specified in the fourth or subsequent
+ // > fields.
+ if len(p.fields) > 0 && p.fields[0].symbol() == "gc" {
+ catNameShort := p.fields[1].normalizedSymbol()
+ catNameLong := p.fields[2].normalizedSymbol()
+ catName2Abbs[catNameShort] = catNameShort
+ catName2Abbs[catNameLong] = catNameShort
+ for _, f := range p.fields[3:] {
+ catNameOther := f.normalizedSymbol()
+ catName2Abbs[catNameOther] = catNameShort
+ }
+ }
+
+ // https://www.unicode.org/reports/tr44/#Missing_Conventions
+ // > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt.
+ // > ...
+ // > there are currently two syntactic patterns used for @missing lines, as summarized schematically below:
+ // > 1. code_point_range; default_prop_val
+ // > 2. code_point_range; property_name; default_prop_val
+ // > ...
+ // > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which
+ // > contain values associated with many properties. For example:
+ // > # @missing: 0000..10FFFF; NFD_QC; Yes
+ if len(p.defaultFields) > 0 && p.defaultFields[1].symbol() == "General_Category" {
+ var err error
+ defaultGCCPRange, err = p.defaultFields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ defaultGCVal = p.defaultFields[2].normalizedSymbol()
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+ return &PropertyValueAliases{
+ GeneralCategory: catName2Abbs,
+ GeneralCategoryDefaultRange: defaultGCCPRange,
+ GeneralCategoryDefaultValue: defaultGCVal,
+ }, nil
+}
diff --git a/ucd/unicode_data.go b/ucd/unicode_data.go
new file mode 100644
index 0000000..4513666
--- /dev/null
+++ b/ucd/unicode_data.go
@@ -0,0 +1,68 @@
+package ucd
+
import (
	"fmt"
	"io"
	"strings"
)
+
+type UnicodeData struct {
+ GeneralCategory map[string][]*CodePointRange
+}
+
+// ParseUnicodeData parses the UnicodeData.txt.
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+ gc2CPRange := map[string][]*CodePointRange{}
+ lastCPTo := rune(-1)
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) == 0 {
+ continue
+ }
+ cpRange, err := p.fields[0].codePointRange()
+ if err != nil {
+ return nil, err
+ }
+ if cpRange.From-lastCPTo > 1 {
+ defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+ gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ From: lastCPTo + 1,
+ To: cpRange.From - 1,
+ })
+ }
+ lastCPTo = cpRange.To
+ gc := p.fields[2].normalizedSymbol()
+ if gc == "" {
+ // https://www.unicode.org/reports/tr44/#Empty_Fields
+ // > The data file UnicodeData.txt defines many property values in each record. When a field in a data
+ // > line for a code point is empty, that indicates that the property takes the default value for that
+ // > code point.
+ if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To {
+ continue
+ }
+ gc = propValAliases.GeneralCategoryDefaultValue
+ }
+ rs, ok := gc2CPRange[gc]
+ if ok {
+ r := rs[len(rs)-1]
+ if cpRange.From-r.To == 1 {
+ r.To = cpRange.To
+ } else {
+ gc2CPRange[gc] = append(rs, cpRange)
+ }
+ } else {
+ gc2CPRange[gc] = []*CodePointRange{
+ cpRange,
+ }
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+ if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
+ defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+ gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ From: lastCPTo + 1,
+ To: propValAliases.GeneralCategoryDefaultRange.To,
+ })
+ }
+ return &UnicodeData{
+ GeneralCategory: gc2CPRange,
+ }, nil
+}