aboutsummaryrefslogtreecommitdiff
path: root/ucd/unicode_data.go
blob: 45136667eda78c6913f3da46c5c320e438a0741e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
package ucd

import (
	"fmt"
	"io"
)

// UnicodeData holds the data that ParseUnicodeData extracts from
// UnicodeData.txt.
type UnicodeData struct {
	// GeneralCategory maps a General_Category value to the code point ranges
	// that ParseUnicodeData assigned to it. Keys are either the normalized
	// symbol read from a record or the default value taken from the
	// PropertyValueAliases passed to the parser.
	GeneralCategory map[string][]*CodePointRange
}

// ParseUnicodeData parses the UnicodeData.txt and groups the code points it
// describes by their General_Category value.
//
// r supplies the contents of UnicodeData.txt. propValAliases supplies the
// default General_Category value (GeneralCategoryDefaultValue) and the range
// that default applies to (GeneralCategoryDefaultRange); it must not be nil.
//
// Code points lying in a gap between two consecutive records, and code points
// after the last record up to GeneralCategoryDefaultRange.To, are assigned the
// default value. The gap detection compares each record with the previous
// one, so it relies on the records appearing in ascending code point order.
//
// An error is returned when a non-empty record has fewer than three fields,
// when the code point field cannot be parsed, or when the underlying parser
// reports an error.
func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
	// minFields is the number of fields a record needs so that the
	// General_Category field (index 2) can be read.
	const minFields = 3

	gc2CPRange := map[string][]*CodePointRange{}
	// lastCPTo is the last code point covered by the previous record; -1 means
	// no record has been seen yet, so a first record starting at 0 leaves no gap.
	lastCPTo := rune(-1)
	p := newParser(r)
	for p.parse() {
		if len(p.fields) == 0 {
			continue
		}
		if len(p.fields) < minFields {
			// Without this check, indexing the General_Category field below
			// would panic on a truncated record.
			return nil, fmt.Errorf("invalid UnicodeData.txt record: got %d fields, want at least %d", len(p.fields), minFields)
		}
		cpRange, err := p.fields[0].codePointRange()
		if err != nil {
			return nil, err
		}

		// Code points skipped between the previous record and this one take
		// the default General_Category value.
		//
		// NOTE(review): UnicodeData.txt encodes some large blocks as a pair of
		// "<..., First>" / "<..., Last>" records. Unless the parser joins such
		// pairs into a single range, the code points between them are treated
		// as a gap here and receive the default value — confirm against the
		// parser implementation.
		if cpRange.From-lastCPTo > 1 {
			defaultGCVal := propValAliases.GeneralCategoryDefaultValue
			gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
				From: lastCPTo + 1,
				To:   cpRange.From - 1,
			})
		}
		lastCPTo = cpRange.To

		gc := p.fields[2].normalizedSymbol()
		if gc == "" {
			// https://www.unicode.org/reports/tr44/#Empty_Fields
			// > The data file UnicodeData.txt defines many property values in each record. When a field in a data
			// > line for a code point is empty, that indicates that the property takes the default value for that
			// > code point.
			//
			// A record that is not fully inside the default range is dropped.
			if cpRange.From < propValAliases.GeneralCategoryDefaultRange.From || cpRange.To > propValAliases.GeneralCategoryDefaultRange.To {
				continue
			}
			gc = propValAliases.GeneralCategoryDefaultValue
		}

		// Extend the most recent range of this category in place when the new
		// range starts right after it; otherwise start a new range.
		ranges := gc2CPRange[gc]
		if n := len(ranges); n > 0 && cpRange.From-ranges[n-1].To == 1 {
			ranges[n-1].To = cpRange.To
			continue
		}
		gc2CPRange[gc] = append(ranges, cpRange)
	}
	if p.err != nil {
		return nil, p.err
	}

	// Everything after the last record, up to the end of the default range,
	// takes the default General_Category value.
	if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
		defaultGCVal := propValAliases.GeneralCategoryDefaultValue
		gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
			From: lastCPTo + 1,
			To:   propValAliases.GeneralCategoryDefaultRange.To,
		})
	}
	return &UnicodeData{
		GeneralCategory: gc2CPRange,
	}, nil
}