package ucd import ( "bufio" "encoding/binary" "encoding/hex" "io" "regexp" "strings" ) type CodePointRange struct { From rune To rune } var codePointRangeNil = &CodePointRange{ From: 0, To: 0, } type field string func (f field) codePointRange() (*CodePointRange, error) { var from, to rune var err error cp := reCodePointRange.FindStringSubmatch(string(f)) from, err = decodeHexToRune(cp[1]) if err != nil { return codePointRangeNil, err } if cp[2] != "" { to, err = decodeHexToRune(cp[2]) if err != nil { return codePointRangeNil, err } } else { to = from } return &CodePointRange{ From: from, To: to, }, nil } func decodeHexToRune(hexCodePoint string) (rune, error) { h := hexCodePoint if len(h)%2 != 0 { h = "0" + h } b, err := hex.DecodeString(h) if err != nil { return 0, err } l := len(b) for i := 0; i < 4-l; i++ { b = append([]byte{0}, b...) } n := binary.BigEndian.Uint32(b) return rune(n), nil } func (f field) symbol() string { return string(f) } func (f field) normalizedSymbol() string { return NormalizeSymbolicValue(string(f)) } var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") // NormalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3. // // https://www.unicode.org/reports/tr44/#UAX44-LM3 func NormalizeSymbolicValue(s string) string { v := strings.ToLower(symValReplacer.Replace(s)) if strings.HasPrefix(v, "is") && v != "is" { return v[2:] } return v } var ( reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) specialCommentPrefix = "# @missing:" ) // This parser can parse data files of Unicode Character Database (UCD). // Specifically, it has the following two functions: // - Converts each line of the data files into a slice of fields. // - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields. // // However, for practical purposes, each field needs to be analyzed more specifically. // For instance, in UnicodeData.txt, the first field represents a range of code points, // so it needs to be recognized as a hexadecimal string. // You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser. // // https://www.unicode.org/reports/tr44/#Format_Conventions type parser struct { scanner *bufio.Scanner fields []field defaultFields []field err error fieldBuf []field defaultFieldBuf []field } func newParser(r io.Reader) *parser { return &parser{ scanner: bufio.NewScanner(r), fieldBuf: make([]field, 50), defaultFieldBuf: make([]field, 50), } } func (p *parser) parse() bool { for p.scanner.Scan() { p.parseRecord(p.scanner.Text()) if p.fields != nil || p.defaultFields != nil { return true } } p.err = p.scanner.Err() return false } func (p *parser) parseRecord(src string) { ms := reLine.FindStringSubmatch(src) mFields := ms[1] mComment := ms[2] if mFields != "" { p.fields = parseFields(p.fieldBuf, mFields) } else { p.fields = nil } if strings.HasPrefix(mComment, specialCommentPrefix) { p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1)) } else { p.defaultFields = nil } } func parseFields(buf []field, src string) []field { n := 0 for _, f := range strings.Split(src, ";") { buf[n] = field(strings.TrimSpace(f)) n++ } return buf[:n] }