aboutsummaryrefslogtreecommitdiff
path: root/ucd/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'ucd/parser.go')
-rw-r--r--ucd/parser.go155
1 files changed, 155 insertions, 0 deletions
diff --git a/ucd/parser.go b/ucd/parser.go
new file mode 100644
index 0000000..88d7134
--- /dev/null
+++ b/ucd/parser.go
@@ -0,0 +1,155 @@
+package ucd
+
+import (
+ "bufio"
+ "encoding/binary"
+ "encoding/hex"
+ "io"
+ "regexp"
+ "strings"
+)
+
// CodePointRange is a pair of code points delimiting a range.
//
// NOTE(review): both ends are presumably inclusive, since a range built from
// a single code point has From == To — confirm against callers.
type CodePointRange struct {
	From, To rune
}
+
+var codePointRangeNil = &CodePointRange{
+ From: 0,
+ To: 0,
+}
+
// field is the text of one semicolon-separated value taken from a line of a
// UCD data file, with surrounding white space already trimmed.
type field string
+
+func (f field) codePointRange() (*CodePointRange, error) {
+ var from, to rune
+ var err error
+ cp := reCodePointRange.FindStringSubmatch(string(f))
+ from, err = decodeHexToRune(cp[1])
+ if err != nil {
+ return codePointRangeNil, err
+ }
+ if cp[2] != "" {
+ to, err = decodeHexToRune(cp[2])
+ if err != nil {
+ return codePointRangeNil, err
+ }
+ } else {
+ to = from
+ }
+ return &CodePointRange{
+ From: from,
+ To: to,
+ }, nil
+}
+
// hexCodePointError reports a hexadecimal string that does not denote a
// Unicode code point.
type hexCodePointError struct {
	hex    string // the input exactly as passed by the caller
	reason string // why the input was rejected
}

// Error implements the error interface.
func (e *hexCodePointError) Error() string {
	return "invalid code point \"" + e.hex + "\": " + e.reason
}

// decodeHexToRune converts a hexadecimal string such as "41" or "1F600" into
// the code point it denotes. Strings with an odd number of digits are accepted
// and leading zeros are ignored.
//
// It returns an error when hexCodePoint is empty, contains a character that is
// not a hexadecimal digit (the error from encoding/hex is passed through), or
// denotes a value greater than U+10FFFF.
func decodeHexToRune(hexCodePoint string) (rune, error) {
	// Largest value in the Unicode code space.
	const maxCodePoint = 0x10FFFF

	if hexCodePoint == "" {
		return 0, &hexCodePointError{hex: hexCodePoint, reason: "empty string"}
	}

	// hex.DecodeString needs an even number of digits.
	h := hexCodePoint
	if len(h)%2 != 0 {
		h = "0" + h
	}
	b, err := hex.DecodeString(h)
	if err != nil {
		return 0, err
	}

	var buf [4]byte
	// Leading zero bytes do not change the value; drop them so that only
	// genuinely oversized inputs are rejected below.
	for len(b) > len(buf) && b[0] == 0 {
		b = b[1:]
	}
	if len(b) > len(buf) {
		return 0, &hexCodePointError{hex: hexCodePoint, reason: "value out of range"}
	}

	// Right-align the bytes so they read as one big-endian 32-bit integer.
	copy(buf[len(buf)-len(b):], b)
	n := binary.BigEndian.Uint32(buf[:])
	if n > maxCodePoint {
		return 0, &hexCodePointError{hex: hexCodePoint, reason: "value out of range"}
	}
	return rune(n), nil
}
+
+func (f field) symbol() string {
+ return string(f)
+}
+
+func (f field) normalizedSymbol() string {
+ return normalizeSymbolicValue(string(f))
+}
+
// symValReplacer deletes underscores, hyphens, and space characters.
var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")

// normalizeSymbolicValue returns the loose-matching form of a symbolic value
// as described by UAX44-LM3: underscores, hyphens, and spaces are removed, the
// result is lower-cased, and a leading "is" is dropped unless "is" is all that
// remains.
//
// https://www.unicode.org/reports/tr44/#UAX44-LM3
func normalizeSymbolicValue(s string) string {
	stripped := symValReplacer.Replace(s)
	lowered := strings.ToLower(stripped)
	if lowered == "is" {
		// The bare value "is" is kept; only a prefix is removed.
		return lowered
	}
	return strings.TrimPrefix(lowered, "is")
}
+
var (
	// reLine splits a line into its data part (group 1, with surrounding
	// white space removed) and an optional trailing comment that starts at
	// the first '#' after the data (group 2).
	reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)

	// reCodePointRange matches a single hexadecimal code point ("0041") or
	// two of them joined by a literal ".." ("0041..005A"). Group 1 is the
	// first code point and group 2 the second, or "" when absent. The dots
	// are escaped so that only ".." is accepted as the separator; unescaped
	// they match any two characters, which lets input such as "0041.005A"
	// be read as the range 0041..05A.
	reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?$`)

	// specialCommentPrefix introduces a comment that carries default values.
	specialCommentPrefix = "# @missing:"
)
+
+// This parser can parse data files of Unicode Character Database (UCD).
+// Specifically, it has the following two functions:
+// - Converts each line of the data files into a slice of fields.
+// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields.
+//
+// However, for practical purposes, each field needs to be analyzed more specifically.
+// For instance, in UnicodeData.txt, the first field represents a range of code points,
+// so it needs to be recognized as a hexadecimal string.
+// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
+//
+// https://www.unicode.org/reports/tr44/#Format_Conventions
+type parser struct {
+ scanner *bufio.Scanner
+ fields []field
+ defaultFields []field
+ err error
+
+ fieldBuf []field
+ defaultFieldBuf []field
+}
+
+func newParser(r io.Reader) *parser {
+ return &parser{
+ scanner: bufio.NewScanner(r),
+ fieldBuf: make([]field, 50),
+ defaultFieldBuf: make([]field, 50),
+ }
+}
+
+func (p *parser) parse() bool {
+ for p.scanner.Scan() {
+ p.parseRecord(p.scanner.Text())
+ if p.fields != nil || p.defaultFields != nil {
+ return true
+ }
+ }
+ p.err = p.scanner.Err()
+ return false
+}
+
+func (p *parser) parseRecord(src string) {
+ ms := reLine.FindStringSubmatch(src)
+ mFields := ms[1]
+ mComment := ms[2]
+ if mFields != "" {
+ p.fields = parseFields(p.fieldBuf, mFields)
+ } else {
+ p.fields = nil
+ }
+ if strings.HasPrefix(mComment, specialCommentPrefix) {
+ p.defaultFields = parseFields(p.defaultFieldBuf, strings.Replace(mComment, specialCommentPrefix, "", -1))
+ } else {
+ p.defaultFields = nil
+ }
+}
+
+func parseFields(buf []field, src string) []field {
+ n := 0
+ for _, f := range strings.Split(src, ";") {
+ buf[n] = field(strings.TrimSpace(f))
+ n++
+ }
+
+ return buf[:n]
+}