diff options
Diffstat (limited to 'ucd/parser.go')
-rw-r--r-- | ucd/parser.go | 155 |
1 files changed, 155 insertions, 0 deletions
diff --git a/ucd/parser.go b/ucd/parser.go new file mode 100644 index 0000000..88d7134 --- /dev/null +++ b/ucd/parser.go @@ -0,0 +1,155 @@ +package ucd + +import ( + "bufio" + "encoding/binary" + "encoding/hex" + "io" + "regexp" + "strings" +) + +type CodePointRange struct { + From rune + To rune +} + +var codePointRangeNil = &CodePointRange{ + From: 0, + To: 0, +} + +type field string + +func (f field) codePointRange() (*CodePointRange, error) { + var from, to rune + var err error + cp := reCodePointRange.FindStringSubmatch(string(f)) + from, err = decodeHexToRune(cp[1]) + if err != nil { + return codePointRangeNil, err + } + if cp[2] != "" { + to, err = decodeHexToRune(cp[2]) + if err != nil { + return codePointRangeNil, err + } + } else { + to = from + } + return &CodePointRange{ + From: from, + To: to, + }, nil +} + +func decodeHexToRune(hexCodePoint string) (rune, error) { + h := hexCodePoint + if len(h)%2 != 0 { + h = "0" + h + } + b, err := hex.DecodeString(h) + if err != nil { + return 0, err + } + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) + } + n := binary.BigEndian.Uint32(b) + return rune(n), nil +} + +func (f field) symbol() string { + return string(f) +} + +func (f field) normalizedSymbol() string { + return normalizeSymbolicValue(string(f)) +} + +var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "") + +// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3. +// +// https://www.unicode.org/reports/tr44/#UAX44-LM3 +func normalizeSymbolicValue(s string) string { + v := strings.ToLower(symValReplacer.Replace(s)) + if strings.HasPrefix(v, "is") && v != "is" { + return v[2:] + } + return v +} + +var ( + reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`) + reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`) + + specialCommentPrefix = "# @missing:" +) + +// This parser can parse data files of Unicode Character Database (UCD). +// Specifically, it has the following two functions: +// - Converts each line of the data files into a slice of fields. +// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields. +// +// However, for practical purposes, each field needs to be analyzed more specifically. +// For instance, in UnicodeData.txt, the first field represents a range of code points, +// so it needs to be recognized as a hexadecimal string. +// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser. +// +// https://www.unicode.org/reports/tr44/#Format_Conventions +type parser struct { + scanner *bufio.Scanner + fields []field + defaultFields []field + err error + + fieldBuf []field + defaultFieldBuf []field +} + +func newParser(r io.Reader) *parser { + return &parser{ + scanner: bufio.NewScanner(r), + fieldBuf: make([]field, 50), + defaultFieldBuf: make([]field, 50), + } +} + +func (p *parser) parse() bool { + for p.scanner.Scan() { + p.parseRecord(p.scanner.Text()) + if p.fields != nil || p.defaultFields != nil { + return true + } + } + p.err = p.scanner.Err() + return false +} + +func (p *parser) parseRecord(src string) { + ms := reLine.FindStringSubmatch(src) + mFields := ms[1] + mComment := ms[2] + if mFields != "" { + p.fields = parseFields(p.fieldBuf, mFields) + } else { + p.fields = nil + } + if strings.HasPrefix(mComment, specialCommentPrefix) { + p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1)) + } else { + p.defaultFields = nil + } +} + +func parseFields(buf []field, src string) []field { + n := 0 + for _, f := range strings.Split(src, ";") { + buf[n] = field(strings.TrimSpace(f)) + n++ + } + + return buf[:n] +} |