diff options
Diffstat (limited to 'ucd')
-rw-r--r-- | ucd/parser.go | 238 |
1 files changed, 238 insertions, 0 deletions
diff --git a/ucd/parser.go b/ucd/parser.go new file mode 100644 index 0000000..9476a2f --- /dev/null +++ b/ucd/parser.go @@ -0,0 +1,238 @@ +package ucd + +import ( + "bufio" + "encoding/binary" + "encoding/hex" + "io" + "regexp" + "strings" +) + +type CodePointRange struct { + From rune + To rune +} + +type UnicodeData struct { + GeneralCategory map[string][]*CodePointRange +} + +func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) { + gc2CPRange := map[string][]*CodePointRange{} + lastCPTo := rune(-1) + p := newParser(r) + for p.parse() { + if len(p.fields) == 0 { + continue + } + cpFrom, cpTo, err := parseCodePointRange(p.fields[0]) + if err != nil { + return nil, err + } + if cpFrom-lastCPTo > 1 { + defaultGCVal := propValAliases.GeneralCategoryDefaultValue + gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + From: lastCPTo + 1, + To: cpFrom - 1, + }) + } + lastCPTo = cpTo + gc := NormalizeSymbolicValue(p.fields[2]) + rs, ok := gc2CPRange[gc] + if ok { + r := rs[len(rs)-1] + if cpFrom-r.To == 1 { + r.To = cpTo + } else { + gc2CPRange[gc] = append(rs, &CodePointRange{ + From: cpFrom, + To: cpTo, + }) + } + } else { + gc2CPRange[gc] = []*CodePointRange{ + { + From: cpFrom, + To: cpTo, + }, + } + } + } + if p.err != nil { + return nil, p.err + } + if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To { + defaultGCVal := propValAliases.GeneralCategoryDefaultValue + gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{ + From: lastCPTo + 1, + To: propValAliases.GeneralCategoryDefaultRange.To, + }) + } + return &UnicodeData{ + GeneralCategory: gc2CPRange, + }, nil +} + +type PropertyValueAliases struct { + GeneralCategory map[string]string + GeneralCategoryDefaultRange *CodePointRange + GeneralCategoryDefaultValue string +} + +func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) { + catName2Abbs := map[string]string{} + var defaultGCCPRange *CodePointRange + var defaultGCVal string + p := newParser(r) + for p.parse() { + if len(p.fields) > 0 && p.fields[0] == "gc" { + catNameShort := NormalizeSymbolicValue(p.fields[1]) + catNameLong := NormalizeSymbolicValue(p.fields[2]) + catName2Abbs[catNameShort] = catNameShort + catName2Abbs[catNameLong] = catNameShort + for _, f := range p.fields[3:] { + catNameOther := NormalizeSymbolicValue(f) + catName2Abbs[catNameOther] = catNameShort + } + } + if len(p.defaultFields) > 0 && p.defaultFields[1] == "General_Category" { + cpFrom, cpTo, err := parseCodePointRange(p.defaultFields[0]) + if err != nil { + return nil, err + } + defaultGCCPRange = &CodePointRange{ + From: cpFrom, + To: cpTo, + } + defaultGCVal = NormalizeSymbolicValue(p.defaultFields[2]) + } + } + if p.err != nil { + return nil, p.err + } + return &PropertyValueAliases{ + GeneralCategory: catName2Abbs, + GeneralCategoryDefaultRange: defaultGCCPRange, + GeneralCategoryDefaultValue: defaultGCVal, + }, nil +} + +var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") + +func NormalizeSymbolicValue(original string) string { + strings.Trim("", "") + v := strings.ToLower(symValReplacer.Replace(original)) + if strings.HasPrefix(v, "is") && v != "is" { + return v[3:] + } + return v +} + +type Fields []string + +var ( + reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) + reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) + + specialCommentPrefix = "# @missing:" +) + +// This parser can parse data files of Unicode Character Database (UCD). +// Specifically, it has the following two functions: +// - Converts each line of the data files into a slice of fields. +// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields. +// +// However, for practical purposes, each field needs to be analyzed more specifically. +// For instance, in UnicodeData.txt, the first field represents a range of code points, +// so it needs to be recognized as a hexadecimal string. +// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser. +// +// https://www.unicode.org/reports/tr44/#Format_Conventions +type parser struct { + scanner *bufio.Scanner + fields Fields + defaultFields Fields + err error +} + +func newParser(r io.Reader) *parser { + return &parser{ + scanner: bufio.NewScanner(r), + } +} + +func (p *parser) parse() bool { + for p.scanner.Scan() { + p.fields, p.defaultFields, p.err = parseRecord(p.scanner.Text()) + if p.err != nil { + return false + } + if p.fields != nil || p.defaultFields != nil { + return true + } + } + p.err = p.scanner.Err() + return false +} + +func parseRecord(src string) (Fields, Fields, error) { + ms := reLine.FindStringSubmatch(src) + fields := ms[1] + comment := ms[2] + var fs Fields + if fields != "" { + fs = parseFields(fields) + } + var defaultFs Fields + if strings.HasPrefix(comment, specialCommentPrefix) { + fields := strings.Replace(comment, specialCommentPrefix, "", -1) + fs := parseFields(fields) + defaultFs = fs + } + return fs, defaultFs, nil +} + +func parseFields(src string) Fields { + var fields Fields + for _, f := range strings.Split(src, ";") { + fields = append(fields, strings.TrimSpace(f)) + } + return fields +} + +func parseCodePointRange(src string) (rune, rune, error) { + var from, to rune + var err error + cp := reCodePointRange.FindStringSubmatch(src) + from, err = decodeHexToRune(cp[1]) + if err != nil { + return 0, 0, err + } + if cp[2] != "" { + to, err = decodeHexToRune(cp[2]) + if err != nil { + return 0, 0, err + } + } else { + to = from + } + return from, to, nil +} + +func decodeHexToRune(hexCodePoint string) (rune, error) { + h := hexCodePoint + if len(h)%2 != 0 { + h = "0" + h + } + b, err := hex.DecodeString(h) + if err != nil { + return 0, err + } + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) + } + n := binary.BigEndian.Uint32(b) + return rune(n), nil +} |