aboutsummaryrefslogtreecommitdiff
path: root/src/urubu/ucd/property_value_aliases.go
blob: 4bc69db3ded264df4409c5fe25bf9970dc97823a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package ucd

import (
	"fmt"
	"io"
)

// PropertyValueAliases holds the data that ParsePropertyValueAliases extracts
// from PropertyValueAliases.txt.
type PropertyValueAliases struct {
	// GeneralCategory maps every alias of a General_Category (gc) value, in
	// the form produced by normalizedSymbol, to the abbreviated alias of that
	// value. The abbreviated alias itself is also a key and maps to itself.
	GeneralCategory map[string]string
	// GeneralCategoryDefaultRange is the code point range taken from the
	// default (@missing) entry for General_Category. It stays nil when the
	// parsed data contains no such entry.
	GeneralCategoryDefaultRange *CodePointRange
	// GeneralCategoryDefaultValue is the value taken from the same default
	// entry, in the form produced by normalizedSymbol. It stays empty when
	// the parsed data contains no such entry.
	GeneralCategoryDefaultValue string

	// Script maps every alias of a Script (sc) value to the abbreviated alias
	// of that value, built the same way as GeneralCategory.
	Script map[string]string
}

// ParsePropertyValueAliases parses the PropertyValueAliases.txt.
//
// For the General_Category (gc) and Script (sc) properties it builds a table
// that maps every alias of a value (the abbreviated alias, the long alias, and
// any additional aliases) to the abbreviated alias of that value. Entries for
// other properties are ignored. It also records the code point range and the
// value of the default (@missing) entry for General_Category, when present.
//
// An error is returned when the underlying parser reports one, when a gc/sc
// entry or the General_Category default entry has fewer than three fields, or
// when the code point range of that default entry cannot be parsed.
func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
	// Both kinds of entries consumed below carry three mandatory fields; see
	// the TR44 excerpts quoted inside the loop.
	const minFields = 3

	gcAbbs := map[string]string{}
	var defaultGCCPRange *CodePointRange
	var defaultGCVal string
	scAbbs := map[string]string{}
	p := newParser(r)
	for p.parse() {
		// https://www.unicode.org/reports/tr44/#Property_Value_Aliases
		// > In PropertyValueAliases.txt, the first field contains the abbreviated alias for a Unicode property,
		// > the second field specifies an abbreviated symbolic name for a value of that property, and the third
		// > field specifies the long symbolic name for that value of that property. These are the preferred
		// > aliases. Additional aliases for some property values may be specified in the fourth or subsequent
		// > fields.
		if len(p.fields) > 0 {
			prop := p.fields[0].symbol()

			// Pick the table to fill; it stays nil for properties we do not track.
			var abbs map[string]string
			switch prop {
			case "gc":
				abbs = gcAbbs
			case "sc":
				abbs = scAbbs
			}
			if abbs != nil {
				// Reject truncated entries instead of panicking on the
				// index expressions below.
				if len(p.fields) < minFields {
					return nil, fmt.Errorf("property value alias entry for %q has %d fields, want at least %d", prop, len(p.fields), minFields)
				}
				short := p.fields[1].normalizedSymbol()
				// The abbreviated alias maps to itself so that a lookup
				// succeeds no matter which alias the caller holds.
				abbs[short] = short
				// The long alias (third field) and any additional aliases
				// all resolve to the abbreviated alias.
				for _, f := range p.fields[2:] {
					abbs[f.normalizedSymbol()] = short
				}
			}
		}

		// https://www.unicode.org/reports/tr44/#Missing_Conventions
		// > @missing lines are also supplied for many properties in the file PropertyValueAliases.txt.
		// > ...
		// > there are currently two syntactic patterns used for @missing lines, as summarized schematically below:
		// >     1. code_point_range; default_prop_val
		// >     2. code_point_range; property_name; default_prop_val
		// > ...
		// > Pattern #2 is used in PropertyValueAliases.txt and in DerivedNormalizationProps.txt, both of which
		// > contain values associated with many properties. For example:
		// >     # @missing: 0000..10FFFF; NFD_QC; Yes
		//
		// A default entry too short to name a property is skipped, just like
		// default entries for properties other than General_Category.
		if len(p.defaultFields) > 1 && p.defaultFields[1].symbol() == "General_Category" {
			if len(p.defaultFields) < minFields {
				return nil, fmt.Errorf("@missing entry for General_Category has %d fields, want at least %d", len(p.defaultFields), minFields)
			}
			var err error
			defaultGCCPRange, err = p.defaultFields[0].codePointRange()
			if err != nil {
				return nil, err
			}
			defaultGCVal = p.defaultFields[2].normalizedSymbol()
		}
	}
	// parse() returning false ends the loop; p.err tells whether that was
	// caused by a failure.
	if p.err != nil {
		return nil, p.err
	}
	return &PropertyValueAliases{
		GeneralCategory:             gcAbbs,
		GeneralCategoryDefaultRange: defaultGCCPRange,
		GeneralCategoryDefaultValue: defaultGCVal,
		Script:                      scAbbs,
	}, nil
}

// gcAbb looks gc up in the General_Category alias table and returns the
// abbreviated alias stored for it. A key that is not in the table yields the
// empty string.
func (a *PropertyValueAliases) gcAbb(gc string) string {
	if abb, found := a.GeneralCategory[gc]; found {
		return abb
	}
	return ""
}