aboutsummaryrefslogtreecommitdiff
path: root/ucd
diff options
context:
space:
mode:
Diffstat (limited to 'ucd')
-rw-r--r--ucd/parser.go238
1 files changed, 238 insertions, 0 deletions
diff --git a/ucd/parser.go b/ucd/parser.go
new file mode 100644
index 0000000..9476a2f
--- /dev/null
+++ b/ucd/parser.go
@@ -0,0 +1,238 @@
+package ucd
+
+import (
+ "bufio"
+ "encoding/binary"
+ "encoding/hex"
+ "io"
+ "regexp"
+ "strings"
+)
+
+type CodePointRange struct {
+ From rune
+ To rune
+}
+
+type UnicodeData struct {
+ GeneralCategory map[string][]*CodePointRange
+}
+
+func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
+ gc2CPRange := map[string][]*CodePointRange{}
+ lastCPTo := rune(-1)
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) == 0 {
+ continue
+ }
+ cpFrom, cpTo, err := parseCodePointRange(p.fields[0])
+ if err != nil {
+ return nil, err
+ }
+ if cpFrom-lastCPTo > 1 {
+ defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+ gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ From: lastCPTo + 1,
+ To: cpFrom - 1,
+ })
+ }
+ lastCPTo = cpTo
+ gc := NormalizeSymbolicValue(p.fields[2])
+ rs, ok := gc2CPRange[gc]
+ if ok {
+ r := rs[len(rs)-1]
+ if cpFrom-r.To == 1 {
+ r.To = cpTo
+ } else {
+ gc2CPRange[gc] = append(rs, &CodePointRange{
+ From: cpFrom,
+ To: cpTo,
+ })
+ }
+ } else {
+ gc2CPRange[gc] = []*CodePointRange{
+ {
+ From: cpFrom,
+ To: cpTo,
+ },
+ }
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+ if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
+ defaultGCVal := propValAliases.GeneralCategoryDefaultValue
+ gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
+ From: lastCPTo + 1,
+ To: propValAliases.GeneralCategoryDefaultRange.To,
+ })
+ }
+ return &UnicodeData{
+ GeneralCategory: gc2CPRange,
+ }, nil
+}
+
+type PropertyValueAliases struct {
+ GeneralCategory map[string]string
+ GeneralCategoryDefaultRange *CodePointRange
+ GeneralCategoryDefaultValue string
+}
+
+func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
+ catName2Abbs := map[string]string{}
+ var defaultGCCPRange *CodePointRange
+ var defaultGCVal string
+ p := newParser(r)
+ for p.parse() {
+ if len(p.fields) > 0 && p.fields[0] == "gc" {
+ catNameShort := NormalizeSymbolicValue(p.fields[1])
+ catNameLong := NormalizeSymbolicValue(p.fields[2])
+ catName2Abbs[catNameShort] = catNameShort
+ catName2Abbs[catNameLong] = catNameShort
+ for _, f := range p.fields[3:] {
+ catNameOther := NormalizeSymbolicValue(f)
+ catName2Abbs[catNameOther] = catNameShort
+ }
+ }
+ if len(p.defaultFields) > 0 && p.defaultFields[1] == "General_Category" {
+ cpFrom, cpTo, err := parseCodePointRange(p.defaultFields[0])
+ if err != nil {
+ return nil, err
+ }
+ defaultGCCPRange = &CodePointRange{
+ From: cpFrom,
+ To: cpTo,
+ }
+ defaultGCVal = NormalizeSymbolicValue(p.defaultFields[2])
+ }
+ }
+ if p.err != nil {
+ return nil, p.err
+ }
+ return &PropertyValueAliases{
+ GeneralCategory: catName2Abbs,
+ GeneralCategoryDefaultRange: defaultGCCPRange,
+ GeneralCategoryDefaultValue: defaultGCVal,
+ }, nil
+}
+
+var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")
+
+func NormalizeSymbolicValue(original string) string {
+ strings.Trim("", "")
+ v := strings.ToLower(symValReplacer.Replace(original))
+ if strings.HasPrefix(v, "is") && v != "is" {
+ return v[3:]
+ }
+ return v
+}
+
+type Fields []string
+
+var (
+ reLine = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
+ reCodePointRange = regexp.MustCompile(`^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?$`)
+
+ specialCommentPrefix = "# @missing:"
+)
+
+// This parser can parse data files of Unicode Character Database (UCD).
+// Specifically, it has the following two functions:
+// - Converts each line of the data files into a slice of fields.
+// - Recognizes specially-formatted comments starting `@missing` and generates a slice of fields.
+//
+// However, for practical purposes, each field needs to be analyzed more specifically.
+// For instance, in UnicodeData.txt, the first field represents a range of code points,
+// so it needs to be recognized as a hexadecimal string.
+// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
+//
+// https://www.unicode.org/reports/tr44/#Format_Conventions
+type parser struct {
+ scanner *bufio.Scanner
+ fields Fields
+ defaultFields Fields
+ err error
+}
+
+func newParser(r io.Reader) *parser {
+ return &parser{
+ scanner: bufio.NewScanner(r),
+ }
+}
+
+func (p *parser) parse() bool {
+ for p.scanner.Scan() {
+ p.fields, p.defaultFields, p.err = parseRecord(p.scanner.Text())
+ if p.err != nil {
+ return false
+ }
+ if p.fields != nil || p.defaultFields != nil {
+ return true
+ }
+ }
+ p.err = p.scanner.Err()
+ return false
+}
+
+func parseRecord(src string) (Fields, Fields, error) {
+ ms := reLine.FindStringSubmatch(src)
+ fields := ms[1]
+ comment := ms[2]
+ var fs Fields
+ if fields != "" {
+ fs = parseFields(fields)
+ }
+ var defaultFs Fields
+ if strings.HasPrefix(comment, specialCommentPrefix) {
+ fields := strings.Replace(comment, specialCommentPrefix, "", -1)
+ fs := parseFields(fields)
+ defaultFs = fs
+ }
+ return fs, defaultFs, nil
+}
+
+func parseFields(src string) Fields {
+ var fields Fields
+ for _, f := range strings.Split(src, ";") {
+ fields = append(fields, strings.TrimSpace(f))
+ }
+ return fields
+}
+
+func parseCodePointRange(src string) (rune, rune, error) {
+ var from, to rune
+ var err error
+ cp := reCodePointRange.FindStringSubmatch(src)
+ from, err = decodeHexToRune(cp[1])
+ if err != nil {
+ return 0, 0, err
+ }
+ if cp[2] != "" {
+ to, err = decodeHexToRune(cp[2])
+ if err != nil {
+ return 0, 0, err
+ }
+ } else {
+ to = from
+ }
+ return from, to, nil
+}
+
+func decodeHexToRune(hexCodePoint string) (rune, error) {
+ h := hexCodePoint
+ if len(h)%2 != 0 {
+ h = "0" + h
+ }
+ b, err := hex.DecodeString(h)
+ if err != nil {
+ return 0, err
+ }
+ l := len(b)
+ for i := 0; i < 4-l; i++ {
+ b = append([]byte{0}, b...)
+ }
+ n := binary.BigEndian.Uint32(b)
+ return rune(n), nil
+}