aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyo Nihei <nihei.dev@gmail.com>2021-11-25 21:18:34 +0900
committerRyo Nihei <nihei.dev@gmail.com>2021-11-25 21:18:34 +0900
commit6ebbc8f9829bf0f3127367769c662d1a8f881a2d (patch)
treee45af1104e3ce736134353c1805fe0c91d04998a
parentSupport White_Space property (Meet RL1.2 of UTS #18 partially) (diff)
downloadtre-6ebbc8f9829bf0f3127367769c662d1a8f881a2d.tar.gz
tre-6ebbc8f9829bf0f3127367769c662d1a8f881a2d.tar.xz
Support Lowercase and Uppercase property (Meet RL1.2 of UTS #18 partially)
Diffstat (limited to '')
-rw-r--r--README.md4
-rw-r--r--compiler/parser.go55
-rw-r--r--compiler/ucd.go48
-rw-r--r--compiler/ucd_table.go47
-rw-r--r--compiler/ucd_table.go.tmpl24
-rw-r--r--ucd/prop_list.go25
6 files changed, 174 insertions, 29 deletions
diff --git a/README.md b/README.md
index a02b061..729cb82 100644
--- a/README.md
+++ b/README.md
@@ -242,7 +242,7 @@ The code point expressions match a character that has a specified code point. Th
| \u{3042} | U+3042 (hiragana あ) |
| \u{01F63A} | U+1F63A (grinning cat 😺) |
-The character property expressions match a character that has a specified character property of the Unicode. Currently, maleeni supports only `General_Category` and `White_Space`. When you omitted the equal symbol and a right-side value, maleeni interprets a symbol in `\p{...}` as the `General_Category` value.
+The character property expressions match a character that has a specified Unicode character property. Currently, maleeni supports only `General_Category`, `Lowercase`, `Uppercase`, and `White_Space`. When you omit the equal symbol and the right-side value, maleeni interprets the symbol in `\p{...}` as a `General_Category` value.
| Example | Description |
|-----------------------------|----------------------------------------------------|
@@ -250,6 +250,8 @@ The character property expressions match a character that has a specified charac
| \p{gc=Letter} | the same as \p{General_Category=Letter} |
| \p{Letter} | the same as \p{General_Category=Letter} |
| \p{l} | the same as \p{General_Category=Letter} |
+| \p{Lowercase=yes} | any one character whose Lowercase is yes |
+| \p{Uppercase=yes} | any one character whose Uppercase is yes |
| \p{White_Space=yes} | any one character whose White_Space is yes |
| \p{wspace=yes} | the same as \p{White_Space=yes} |
diff --git a/compiler/parser.go b/compiler/parser.go
index dd73c28..d64b79c 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -538,6 +538,7 @@ func (p *parser) parseCharProp() astNode {
sym2 = p.lastTok.propSymbol
}
+ var alt astNode
var propName, propVal string
if sym2 != "" {
propName = sym1
@@ -546,37 +547,49 @@ func (p *parser) parseCharProp() astNode {
propName = "gc"
propVal = sym1
}
- cpRanges, inverse, err := findCodePointRanges(propName, propVal)
+ pat, err := normalizeCharacterProperty(propName, propVal)
if err != nil {
p.errMsgDetails = fmt.Sprintf("%v", err)
raiseSyntaxError(synErrCharPropUnsupported)
}
-
- var alt astNode
- if inverse {
- r := cpRanges[0]
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), genAnyCharAST())
- if alt == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ if pat != "" {
+ p := newParser(bytes.NewReader([]byte(pat)))
+ ast, err := p.parse()
+ if err != nil {
+ panic(err)
+ }
+ alt = ast
+ } else {
+ cpRanges, inverse, err := findCodePointRanges(propName, propVal)
+ if err != nil {
+ p.errMsgDetails = fmt.Sprintf("%v", err)
+ raiseSyntaxError(synErrCharPropUnsupported)
}
- for _, r := range cpRanges[1:] {
+ if inverse {
+ r := cpRanges[0]
from := genNormalCharAST(r.From)
to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), alt)
+ alt = exclude(genRangeAST(from, to), genAnyCharAST())
if alt == nil {
panic(fmt.Errorf("a pattern that isn't matching any symbols"))
}
- }
- } else {
- for _, r := range cpRanges {
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = genAltNode(
- alt,
- genRangeAST(from, to),
- )
+ for _, r := range cpRanges[1:] {
+ from := genNormalCharAST(r.From)
+ to := genNormalCharAST(r.To)
+ alt = exclude(genRangeAST(from, to), alt)
+ if alt == nil {
+ panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ }
+ }
+ } else {
+ for _, r := range cpRanges {
+ from := genNormalCharAST(r.From)
+ to := genNormalCharAST(r.To)
+ alt = genAltNode(
+ alt,
+ genRangeAST(from, to),
+ )
+ }
}
}
diff --git a/compiler/ucd.go b/compiler/ucd.go
index 5ad0986..3c0bee1 100644
--- a/compiler/ucd.go
+++ b/compiler/ucd.go
@@ -5,10 +5,38 @@ package compiler
import (
"fmt"
+ "strings"
"github.com/nihei9/maleeni/ucd"
)
// normalizeCharacterProperty rewrites a property listed in
// derivedCoreProperties into a bracket expression made of that property's
// component patterns.
//
// It returns an error when propName is not a key of propertyNameAbbs, or when
// the property is a derived one and propVal is not a key of binaryValues.
// It returns ("", nil) when the property is not listed in
// derivedCoreProperties; parseCharProp treats the empty pattern as "not
// derived" and falls back to findCodePointRanges.
// Otherwise the result is "[" + patterns + "]" when the value is true and
// "[^" + patterns + "]" when it is false.
+func normalizeCharacterProperty(propName, propVal string) (string, error) {
+ name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)]
+ if !ok {
+ return "", fmt.Errorf("unsupported character property name: %v", propName)
+ }
+ props, ok := derivedCoreProperties[name]
// Not a derived property: an empty pattern with a nil error tells the caller
// to resolve the property some other way.
+ if !ok {
+ return "", nil
+ }
+ var b strings.Builder
// propVal is only validated on this path, i.e. for derived properties.
+ yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+ if !ok {
+ return "", fmt.Errorf("unsupported character property value: %v", propVal)
+ }
// A false value is expressed by negating the whole bracket expression.
+ if yes {
+ fmt.Fprint(&b, "[")
+ } else {
+ fmt.Fprint(&b, "[^")
+ }
+ for _, prop := range props {
+ fmt.Fprint(&b, prop)
+ }
+ fmt.Fprint(&b, "]")
+
+ return b.String(), nil
+}
+
func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool, error) {
name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)]
if !ok {
@@ -33,6 +61,26 @@ func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool,
ranges = append(ranges, rs...)
}
return ranges, false, nil
+ case "olower":
+ yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return otherLowercaseCodePoints, false, nil
+ } else {
+ return otherLowercaseCodePoints, true, nil
+ }
+ case "oupper":
+ yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return otherUppercaseCodePoints, false, nil
+ } else {
+ return otherUppercaseCodePoints, true, nil
+ }
case "wspace":
yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
if !ok {
diff --git a/compiler/ucd_table.go b/compiler/ucd_table.go
index 4c3c723..545a9c8 100644
--- a/compiler/ucd_table.go
+++ b/compiler/ucd_table.go
@@ -24,10 +24,24 @@ var compositGeneralCategories = map[string][]string{
"c": {"cc", "cf", "cs", "co", "cn"},
}
+// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
// derivedCoreProperties maps an abbreviated property name (a value of
// propertyNameAbbs) to the character property patterns that
// normalizeCharacterProperty concatenates inside a bracket expression.
+var derivedCoreProperties = map[string][]string{
+ "lower": {`\p{Ll}`, `\p{Other_Lowercase=yes}`},
+ "upper": {`\p{Lu}`, `\p{Other_Uppercase=yes}`},
+}
+
// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
var propertyNameAbbs = map[string]string{
"generalcategory": "gc",
"gc": "gc",
+ "lowercase": "lower",
+ "lower": "lower",
+ "uppercase": "upper",
+ "upper": "upper",
+ "otherlowercase": "olower",
+ "olower": "olower",
+ "otheruppercase": "oupper",
+ "oupper": "oupper",
"whitespace": "wspace",
"wspace": "wspace",
"space": "wspace",
@@ -4105,6 +4119,39 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{
}
// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
// otherLowercaseCodePoints lists the Other_Lowercase code point ranges as
// decimal rune values; ucd_table.go.tmpl renders it from
// PropList.OtherLowercase. findCodePointRanges returns it for the "olower"
// property.
+var otherLowercaseCodePoints = []*ucd.CodePointRange{
+ &ucd.CodePointRange{From: rune(170), To: rune(170)},
+ &ucd.CodePointRange{From: rune(186), To: rune(186)},
+ &ucd.CodePointRange{From: rune(688), To: rune(696)},
+ &ucd.CodePointRange{From: rune(704), To: rune(705)},
+ &ucd.CodePointRange{From: rune(736), To: rune(740)},
+ &ucd.CodePointRange{From: rune(837), To: rune(837)},
+ &ucd.CodePointRange{From: rune(890), To: rune(890)},
+ &ucd.CodePointRange{From: rune(7468), To: rune(7530)},
+ &ucd.CodePointRange{From: rune(7544), To: rune(7544)},
+ &ucd.CodePointRange{From: rune(7579), To: rune(7615)},
+ &ucd.CodePointRange{From: rune(8305), To: rune(8305)},
+ &ucd.CodePointRange{From: rune(8319), To: rune(8319)},
+ &ucd.CodePointRange{From: rune(8336), To: rune(8348)},
+ &ucd.CodePointRange{From: rune(8560), To: rune(8575)},
+ &ucd.CodePointRange{From: rune(9424), To: rune(9449)},
+ &ucd.CodePointRange{From: rune(11388), To: rune(11389)},
+ &ucd.CodePointRange{From: rune(42652), To: rune(42653)},
+ &ucd.CodePointRange{From: rune(42864), To: rune(42864)},
+ &ucd.CodePointRange{From: rune(43000), To: rune(43001)},
+ &ucd.CodePointRange{From: rune(43868), To: rune(43871)},
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
// otherUppercaseCodePoints lists the Other_Uppercase code point ranges as
// decimal rune values; ucd_table.go.tmpl renders it from
// PropList.OtherUppercase. findCodePointRanges returns it for the "oupper"
// property.
+var otherUppercaseCodePoints = []*ucd.CodePointRange{
+ &ucd.CodePointRange{From: rune(8544), To: rune(8559)},
+ &ucd.CodePointRange{From: rune(9398), To: rune(9423)},
+ &ucd.CodePointRange{From: rune(127280), To: rune(127305)},
+ &ucd.CodePointRange{From: rune(127312), To: rune(127337)},
+ &ucd.CodePointRange{From: rune(127344), To: rune(127369)},
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
var whiteSpaceCodePoints = []*ucd.CodePointRange{
&ucd.CodePointRange{From: rune(9), To: rune(13)},
&ucd.CodePointRange{From: rune(32), To: rune(32)},
diff --git a/compiler/ucd_table.go.tmpl b/compiler/ucd_table.go.tmpl
index 80142c7..ad336d9 100644
--- a/compiler/ucd_table.go.tmpl
+++ b/compiler/ucd_table.go.tmpl
@@ -24,10 +24,24 @@ var compositGeneralCategories = map[string][]string{
"c": {"cc", "cf", "cs", "co", "cn"},
}
+// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
// derivedCoreProperties maps an abbreviated property name (a value of
// propertyNameAbbs) to the character property patterns that
// normalizeCharacterProperty concatenates inside a bracket expression.
+var derivedCoreProperties = map[string][]string{
+ "lower": {`\p{Ll}`, `\p{Other_Lowercase=yes}`},
+ "upper": {`\p{Lu}`, `\p{Other_Uppercase=yes}`},
+}
+
// https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
var propertyNameAbbs = map[string]string{
"generalcategory": "gc",
"gc": "gc",
+ "lowercase": "lower",
+ "lower": "lower",
+ "uppercase": "upper",
+ "upper": "upper",
+ "otherlowercase": "olower",
+ "olower": "olower",
+ "otheruppercase": "oupper",
+ "oupper": "oupper",
"whitespace": "wspace",
"wspace": "wspace",
"space": "wspace",
@@ -59,6 +73,16 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{ {{ range $prop
}
// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
// otherLowercaseCodePoints gets one entry per range in PropList.OtherLowercase.
+var otherLowercaseCodePoints = []*ucd.CodePointRange{ {{ range .PropList.OtherLowercase }}
+ &ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
// otherUppercaseCodePoints gets one entry per range in PropList.OtherUppercase.
+var otherUppercaseCodePoints = []*ucd.CodePointRange{ {{ range .PropList.OtherUppercase }}
+ &ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
var whiteSpaceCodePoints = []*ucd.CodePointRange{ {{ range .PropList.WhiteSpace }}
&ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
}
diff --git a/ucd/prop_list.go b/ucd/prop_list.go
index cd4a7fe..1ceaaea 100644
--- a/ucd/prop_list.go
+++ b/ucd/prop_list.go
@@ -3,32 +3,43 @@ package ucd
import "io"
// PropList holds the code point ranges that ParsePropList collects, one slice
// per property: Other_Lowercase, Other_Uppercase, and White_Space.
type PropList struct {
- WhiteSpace []*CodePointRange
+ OtherLowercase []*CodePointRange
+ OtherUppercase []*CodePointRange
+ WhiteSpace []*CodePointRange
}
// ParsePropList parses the PropList.txt.
//
// It reads records from r and collects the code point range (first field) of
// every record whose property (second field) is Other_Lowercase,
// Other_Uppercase, or White_Space. Records naming any other property
// contribute nothing to the result. It returns the error from decoding a
// range, or the parser's own error once the input is exhausted.
func ParsePropList(r io.Reader) (*PropList, error) {
+ var ol []*CodePointRange
+ var ou []*CodePointRange
var ws []*CodePointRange
p := newParser(r)
for p.parse() {
// Records without any field are skipped.
if len(p.fields) == 0 {
continue
}
- if p.fields[1].symbol() != "White_Space" {
- continue
- }
-
+
// The range is decoded before the property is inspected, so a malformed range
// aborts the parse even for a property that is not collected below.
cp, err := p.fields[0].codePointRange()
if err != nil {
return nil, err
}
- ws = append(ws, cp)
+
// NOTE(review): fields[1] is indexed after only len(p.fields) == 0 was ruled
// out; confirm the parser always yields at least two fields per record.
+ switch p.fields[1].symbol() {
+ case "Other_Lowercase":
+ ol = append(ol, cp)
+ case "Other_Uppercase":
+ ou = append(ou, cp)
+ case "White_Space":
+ ws = append(ws, cp)
+ }
}
if p.err != nil {
return nil, p.err
}
return &PropList{
- WhiteSpace: ws,
+ OtherLowercase: ol,
+ OtherUppercase: ou,
+ WhiteSpace: ws,
}, nil
}