Diffstat (limited to 'src/urubu/ucd/parser.go')
-rw-r--r-- | src/urubu/ucd/parser.go | 155 |
1 file changed, 0 insertions, 155 deletions
diff --git a/src/urubu/ucd/parser.go b/src/urubu/ucd/parser.go
deleted file mode 100644
index 88d7134..0000000
--- a/src/urubu/ucd/parser.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package ucd
-
-import (
-	"bufio"
-	"encoding/binary"
-	"encoding/hex"
-	"io"
-	"regexp"
-	"strings"
-)
-
-type CodePointRange struct {
-	From rune
-	To   rune
-}
-
-var codePointRangeNil = &CodePointRange{
-	From: 0,
-	To:   0,
-}
-
-type field string
-
-func (f field) codePointRange() (*CodePointRange, error) {
-	var from, to rune
-	var err error
-	cp := reCodePointRange.FindStringSubmatch(string(f))
-	from, err = decodeHexToRune(cp[1])
-	if err != nil {
-		return codePointRangeNil, err
-	}
-	if cp[2] != "" {
-		to, err = decodeHexToRune(cp[2])
-		if err != nil {
-			return codePointRangeNil, err
-		}
-	} else {
-		to = from
-	}
-	return &CodePointRange{
-		From: from,
-		To:   to,
-	}, nil
-}
-
-func decodeHexToRune(hexCodePoint string) (rune, error) {
-	h := hexCodePoint
-	if len(h)%2 != 0 {
-		h = "0" + h
-	}
-	b, err := hex.DecodeString(h)
-	if err != nil {
-		return 0, err
-	}
-	l := len(b)
-	for i := 0; i < 4-l; i++ {
-		b = append([]byte{0}, b...)
-	}
-	n := binary.BigEndian.Uint32(b)
-	return rune(n), nil
-}
-
-func (f field) symbol() string {
-	return string(f)
-}
-
-func (f field) normalizedSymbol() string {
-	return normalizeSymbolicValue(string(f))
-}
-
-var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")
-
-// normalizeSymbolicValue normalizes a symbolic value. The normalized value meets UAX44-LM3.
-//
-// https://www.unicode.org/reports/tr44/#UAX44-LM3
-func normalizeSymbolicValue(s string) string {
-	v := strings.ToLower(symValReplacer.Replace(s))
-	if strings.HasPrefix(v, "is") && v != "is" {
-		return v[2:]
-	}
-	return v
-}
-
-var (
-	reLine           = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
-	reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`)
-
-	specialCommentPrefix = "# @missing:"
-)
-
-// This parser can parse data files of Unicode Character Database (UCD).
-// Specifically, it has the following two functions:
-// - Converts each line of the data files into a slice of fields.
-// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields.
-//
-// However, for practical purposes, each field needs to be analyzed more specifically.
-// For instance, in UnicodeData.txt, the first field represents a range of code points,
-// so it needs to be recognized as a hexadecimal string.
-// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
-//
-// https://www.unicode.org/reports/tr44/#Format_Conventions
-type parser struct {
-	scanner       *bufio.Scanner
-	fields        []field
-	defaultFields []field
-	err           error
-
-	fieldBuf        []field
-	defaultFieldBuf []field
-}
-
-func newParser(r io.Reader) *parser {
-	return &parser{
-		scanner:         bufio.NewScanner(r),
-		fieldBuf:        make([]field, 50),
-		defaultFieldBuf: make([]field, 50),
-	}
-}
-
-func (p *parser) parse() bool {
-	for p.scanner.Scan() {
-		p.parseRecord(p.scanner.Text())
-		if p.fields != nil || p.defaultFields != nil {
-			return true
-		}
-	}
-	p.err = p.scanner.Err()
-	return false
-}
-
-func (p *parser) parseRecord(src string) {
-	ms := reLine.FindStringSubmatch(src)
-	mFields := ms[1]
-	mComment := ms[2]
-	if mFields != "" {
-		p.fields = parseFields(p.fieldBuf, mFields)
-	} else {
-		p.fields = nil
-	}
-	if strings.HasPrefix(mComment, specialCommentPrefix) {
-		p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1))
-	} else {
-		p.defaultFields = nil
-	}
-}
-
-func parseFields(buf []field, src string) []field {
-	n := 0
-	for _, f := range strings.Split(src, ";") {
-		buf[n] = field(strings.TrimSpace(f))
-		n++
-	}
-
-	return buf[:n]
-}
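For reference, the doc comment of the deleted parser suggests wrapping it with a file-specific parser. Below is a minimal sketch of such a wrapper, not part of the change above: parsePropList and propListEntry are illustrative names, and the sketch assumes the unexported API from the deleted parser.go (newParser, parse, field.codePointRange, field.normalizedSymbol). It reads PropList.txt-style records such as "0009..000D ; White_Space # Cc ...".

package ucd

import "io"

// propListEntry is a hypothetical record type used only for illustration.
type propListEntry struct {
	CP   *CodePointRange
	Prop string
}

// parsePropList drives the generic parser and interprets the first field of each
// record as a code point range and the second as a symbolic property value.
func parsePropList(r io.Reader) ([]propListEntry, error) {
	var entries []propListEntry
	p := newParser(r)
	for p.parse() {
		// Lines carrying only an @missing directive (or no data fields at all)
		// are skipped here; a real wrapper might record the default value instead.
		if len(p.fields) < 2 {
			continue
		}
		cp, err := p.fields[0].codePointRange()
		if err != nil {
			return nil, err
		}
		entries = append(entries, propListEntry{
			CP:   cp,
			Prop: p.fields[1].normalizedSymbol(),
		})
	}
	if p.err != nil {
		return nil, p.err
	}
	return entries, nil
}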