aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--README.md4
-rw-r--r--cmd/generator/main.go14
-rw-r--r--compiler/parser.go30
-rw-r--r--compiler/ucd.go53
-rw-r--r--compiler/ucd_table.go31
-rw-r--r--compiler/ucd_table.go.tmpl21
-rw-r--r--ucd/prop_list.go34
7 files changed, 161 insertions, 26 deletions
diff --git a/README.md b/README.md
index a6899db..a02b061 100644
--- a/README.md
+++ b/README.md
@@ -242,7 +242,7 @@ The code point expressions match a character that has a specified code point. Th
| \u{3042} | U+3042 (hiragana あ) |
| \u{01F63A} | U+1F63A (grinning cat 😺) |
-The character property expressions match a character that has a specified character property of the Unicode. Currently, maleeni supports only General_Category.
+The character property expressions match a character that has a specified Unicode character property. Currently, maleeni supports only `General_Category` and `White_Space`. When you omit the equal sign and the right-hand value, maleeni interprets the symbol in `\p{...}` as a `General_Category` value.
| Example | Description |
|-----------------------------|----------------------------------------------------|
@@ -250,6 +250,8 @@ The character property expressions match a character that has a specified charac
| \p{gc=Letter} | the same as \p{General_Category=Letter} |
| \p{Letter} | the same as \p{General_Category=Letter} |
| \p{l} | the same as \p{General_Category=Letter} |
+| \p{White_Space=yes} | any one character whose White_Space is yes |
+| \p{wspace=yes} | the same as \p{White_Space=yes} |
As you escape the special character with `\`, you can write a rule that matches the special character itself.
The following escape sequences are available outside of bracket expressions.
diff --git a/cmd/generator/main.go b/cmd/generator/main.go
index 3edcef5..ae57709 100644
--- a/cmd/generator/main.go
+++ b/cmd/generator/main.go
@@ -43,6 +43,18 @@ func gen() error {
return err
}
}
+ var propList *ucd.PropList
+ {
+ resp, err := http.Get("https://www.unicode.org/Public/13.0.0/ucd/PropList.txt")
+ if err != nil {
+ return err
+ }
+ defer resp.Body.Close()
+ propList, err = ucd.ParsePropList(resp.Body)
+ if err != nil {
+ return err
+ }
+ }
tmpl, err := template.ParseFiles("../compiler/ucd_table.go.tmpl")
if err != nil {
return err
@@ -51,10 +63,12 @@ func gen() error {
err = tmpl.Execute(&b, struct {
GeneratorName string
UnicodeData *ucd.UnicodeData
+ PropList *ucd.PropList
PropertyValueAliases *ucd.PropertyValueAliases
}{
GeneratorName: "generator/main.go",
UnicodeData: unicodeData,
+ PropList: propList,
PropertyValueAliases: propValAliases,
})
if err != nil {
diff --git a/compiler/parser.go b/compiler/parser.go
index 55b8238..dd73c28 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -546,20 +546,38 @@ func (p *parser) parseCharProp() astNode {
propName = "gc"
propVal = sym1
}
- cpRanges, err := findCodePointRanges(propName, propVal)
+ cpRanges, inverse, err := findCodePointRanges(propName, propVal)
if err != nil {
p.errMsgDetails = fmt.Sprintf("%v", err)
raiseSyntaxError(synErrCharPropUnsupported)
}
var alt astNode
- for _, r := range cpRanges {
+ if inverse {
+ r := cpRanges[0]
from := genNormalCharAST(r.From)
to := genNormalCharAST(r.To)
- alt = genAltNode(
- alt,
- genRangeAST(from, to),
- )
+ alt = exclude(genRangeAST(from, to), genAnyCharAST())
+ if alt == nil {
+ panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ }
+ for _, r := range cpRanges[1:] {
+ from := genNormalCharAST(r.From)
+ to := genNormalCharAST(r.To)
+ alt = exclude(genRangeAST(from, to), alt)
+ if alt == nil {
+ panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+ }
+ }
+ } else {
+ for _, r := range cpRanges {
+ from := genNormalCharAST(r.From)
+ to := genNormalCharAST(r.To)
+ alt = genAltNode(
+ alt,
+ genRangeAST(from, to),
+ )
+ }
}
if !p.consume(tokenKindRBrace) {
diff --git a/compiler/ucd.go b/compiler/ucd.go
index 506f03a..5ad0986 100644
--- a/compiler/ucd.go
+++ b/compiler/ucd.go
@@ -9,28 +9,43 @@ import (
"github.com/nihei9/maleeni/ucd"
)
-func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, error) {
- name := ucd.NormalizeSymbolicValue(propName)
- val := ucd.NormalizeSymbolicValue(propVal)
- name, ok := propertyNameAbbs[name]
+func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool, error) {
+ name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)]
if !ok {
- return nil, fmt.Errorf("unsupported character property: %v", propName)
+ return nil, false, fmt.Errorf("unsupported character property name: %v", propName)
}
- val, ok = generalCategoryValueAbbs[val]
- if !ok {
- return nil, fmt.Errorf("unsupported character property value: %v", val)
- }
- vals, ok := compositGeneralCategories[val]
- if !ok {
- vals = []string{val}
- }
- var ranges []*ucd.CodePointRange
- for _, v := range vals {
- rs, ok := generalCategoryCodePoints[v]
+ switch name {
+ case "gc":
+ val, ok := generalCategoryValueAbbs[ucd.NormalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ vals, ok := compositGeneralCategories[val]
if !ok {
- return nil, fmt.Errorf("invalie value of the General_Category property: %v", v)
+ vals = []string{val}
+ }
+ var ranges []*ucd.CodePointRange
+ for _, v := range vals {
+ rs, ok := generalCategoryCodePoints[v]
+ if !ok {
+ return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v)
+ }
+ ranges = append(ranges, rs...)
+ }
+ return ranges, false, nil
+ case "wspace":
+ yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+ if !ok {
+ return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+ }
+ if yes {
+ return whiteSpaceCodePoints, false, nil
+ } else {
+ return whiteSpaceCodePoints, true, nil
}
- ranges = append(ranges, rs...)
}
- return ranges, nil
+
+ // If the process reaches this code, it's a bug. We must handle all of the properties registered with
+ // the `propertyNameAbbs`.
+ return nil, false, fmt.Errorf("character property '%v' is unavailable", propName)
}
diff --git a/compiler/ucd_table.go b/compiler/ucd_table.go
index c941f4c..4c3c723 100644
--- a/compiler/ucd_table.go
+++ b/compiler/ucd_table.go
@@ -28,6 +28,22 @@ var compositGeneralCategories = map[string][]string{
var propertyNameAbbs = map[string]string{
"generalcategory": "gc",
"gc": "gc",
+ "whitespace": "wspace",
+ "wspace": "wspace",
+ "space": "wspace",
+}
+
+// https://www.unicode.org/reports/tr44/#Type_Key_Table
+// https://www.unicode.org/reports/tr44/#Binary_Values_Table
+var binaryValues = map[string]bool{
+ "yes": true,
+ "y": true,
+ "true": true,
+ "t": true,
+ "no": false,
+ "n": false,
+ "false": false,
+ "f": false,
}
// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
@@ -4087,3 +4103,18 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{
&ucd.CodePointRange{From: rune(12288), To: rune(12288)},
},
}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var whiteSpaceCodePoints = []*ucd.CodePointRange{
+ &ucd.CodePointRange{From: rune(9), To: rune(13)},
+ &ucd.CodePointRange{From: rune(32), To: rune(32)},
+ &ucd.CodePointRange{From: rune(133), To: rune(133)},
+ &ucd.CodePointRange{From: rune(160), To: rune(160)},
+ &ucd.CodePointRange{From: rune(5760), To: rune(5760)},
+ &ucd.CodePointRange{From: rune(8192), To: rune(8202)},
+ &ucd.CodePointRange{From: rune(8232), To: rune(8232)},
+ &ucd.CodePointRange{From: rune(8233), To: rune(8233)},
+ &ucd.CodePointRange{From: rune(8239), To: rune(8239)},
+ &ucd.CodePointRange{From: rune(8287), To: rune(8287)},
+ &ucd.CodePointRange{From: rune(12288), To: rune(12288)},
+}
diff --git a/compiler/ucd_table.go.tmpl b/compiler/ucd_table.go.tmpl
index a364191..80142c7 100644
--- a/compiler/ucd_table.go.tmpl
+++ b/compiler/ucd_table.go.tmpl
@@ -28,6 +28,22 @@ var compositGeneralCategories = map[string][]string{
var propertyNameAbbs = map[string]string{
"generalcategory": "gc",
"gc": "gc",
+ "whitespace": "wspace",
+ "wspace": "wspace",
+ "space": "wspace",
+}
+
+// https://www.unicode.org/reports/tr44/#Type_Key_Table
+// https://www.unicode.org/reports/tr44/#Binary_Values_Table
+var binaryValues = map[string]bool{
+ "yes": true,
+ "y": true,
+ "true": true,
+ "t": true,
+ "no": false,
+ "n": false,
+ "false": false,
+ "f": false,
}
// https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt
@@ -41,3 +57,8 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{ {{ range $prop
&ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
},{{ end }}
}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var whiteSpaceCodePoints = []*ucd.CodePointRange{ {{ range .PropList.WhiteSpace }}
+ &ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
+}
diff --git a/ucd/prop_list.go b/ucd/prop_list.go
new file mode 100644
index 0000000..cd4a7fe
--- /dev/null
+++ b/ucd/prop_list.go
@@ -0,0 +1,34 @@
+package ucd
+
+import "io"
+
+type PropList struct {
+ WhiteSpace []*CodePointRange
+}
+
+// ParsePropList parses PropList.txt read from r and collects the code point ranges whose property is White_Space.
+func ParsePropList(r io.Reader) (*PropList, error) {
+	var ws []*CodePointRange
+	p := newParser(r)
+	for p.parse() {
+		// Skip lines lacking both a code point field and a property field; fields[1] would otherwise panic.
+		if len(p.fields) < 2 {
+			continue
+		}
+		if p.fields[1].symbol() != "White_Space" {
+			continue
+		}
+		cp, err := p.fields[0].codePointRange()
+		if err != nil {
+			return nil, err
+		}
+		ws = append(ws, cp)
+	}
+	if p.err != nil {
+		return nil, p.err
+	}
+
+	return &PropList{
+		WhiteSpace: ws,
+	}, nil
+}