diff options
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | compiler/parser.go | 55 | ||||
-rw-r--r-- | compiler/ucd.go | 48 | ||||
-rw-r--r-- | compiler/ucd_table.go | 47 | ||||
-rw-r--r-- | compiler/ucd_table.go.tmpl | 24 | ||||
-rw-r--r-- | ucd/prop_list.go | 25 |
6 files changed, 174 insertions, 29 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -242,7 +242,7 @@ The code point expressions match a character that has a specified code point. Th
 | \u{3042} | U+3042 (hiragana あ) |
 | \u{01F63A} | U+1F63A (grinning cat 😺) |
 
-The character property expressions match a character that has a specified character property of the Unicode. Currently, maleeni supports only `General_Category` and `White_Space`. When you omitted the equal symbol and a right-side value, maleeni interprets a symbol in `\p{...}` as the `General_Category` value.
+The character property expressions match a character that has a specified character property of the Unicode. Currently, maleeni supports only `General_Category`, `Lowercase`, `Uppercase`, and `White_Space`. When you omitted the equal symbol and a right-side value, maleeni interprets a symbol in `\p{...}` as the `General_Category` value.
 
 | Example | Description |
 |-----------------------------|----------------------------------------------------|
@@ -250,6 +250,8 @@ The character property expressions match a character that has a specified charac
 | \p{gc=Letter} | the same as \p{General_Category=Letter} |
 | \p{Letter} | the same as \p{General_Category=Letter} |
 | \p{l} | the same as \p{General_Category=Letter} |
+| \p{Lowercase=yes} | any one character whose Lowercase is yes |
+| \p{Uppercase=yes} | any one character whose Uppercase is yes |
 | \p{White_Space=yes} | any one character whose White_Space is yes |
 | \p{wspace=yes} | the same as \p{White_Space=yes} |
 
diff --git a/compiler/parser.go b/compiler/parser.go
index dd73c28..d64b79c 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -538,6 +538,7 @@ func (p *parser) parseCharProp() astNode {
 		sym2 = p.lastTok.propSymbol
 	}
 
+	var alt astNode
 	var propName, propVal string
 	if sym2 != "" {
 		propName = sym1
@@ -546,37 +547,49 @@ func (p *parser) parseCharProp() astNode {
 		propName = "gc"
 		propVal = sym1
 	}
-	cpRanges, inverse, err := findCodePointRanges(propName, propVal)
+	pat, err := normalizeCharacterProperty(propName, propVal)
 	if err != nil {
 		p.errMsgDetails = fmt.Sprintf("%v", err)
 		raiseSyntaxError(synErrCharPropUnsupported)
 	}
-
-	var alt astNode
-	if inverse {
-		r := cpRanges[0]
-		from := genNormalCharAST(r.From)
-		to := genNormalCharAST(r.To)
-		alt = exclude(genRangeAST(from, to), genAnyCharAST())
-		if alt == nil {
-			panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+	if pat != "" {
+		p := newParser(bytes.NewReader([]byte(pat)))
+		ast, err := p.parse()
+		if err != nil {
+			panic(err)
+		}
+		alt = ast
+	} else {
+		cpRanges, inverse, err := findCodePointRanges(propName, propVal)
+		if err != nil {
+			p.errMsgDetails = fmt.Sprintf("%v", err)
+			raiseSyntaxError(synErrCharPropUnsupported)
 		}
-		for _, r := range cpRanges[1:] {
+		if inverse {
+			r := cpRanges[0]
 			from := genNormalCharAST(r.From)
 			to := genNormalCharAST(r.To)
-			alt = exclude(genRangeAST(from, to), alt)
+			alt = exclude(genRangeAST(from, to), genAnyCharAST())
 			if alt == nil {
 				panic(fmt.Errorf("a pattern that isn't matching any symbols"))
 			}
-		}
-	} else {
-		for _, r := range cpRanges {
-			from := genNormalCharAST(r.From)
-			to := genNormalCharAST(r.To)
-			alt = genAltNode(
-				alt,
-				genRangeAST(from, to),
-			)
+			for _, r := range cpRanges[1:] {
+				from := genNormalCharAST(r.From)
+				to := genNormalCharAST(r.To)
+				alt = exclude(genRangeAST(from, to), alt)
+				if alt == nil {
+					panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+				}
+			}
+		} else {
+			for _, r := range cpRanges {
+				from := genNormalCharAST(r.From)
+				to := genNormalCharAST(r.To)
+				alt = genAltNode(
+					alt,
+					genRangeAST(from, to),
+				)
+			}
 		}
 	}
 
diff --git a/compiler/ucd.go b/compiler/ucd.go
index 5ad0986..3c0bee1 100644
--- a/compiler/ucd.go
+++ b/compiler/ucd.go
@@ -5,10 +5,38 @@ package compiler
 
 import (
 	"fmt"
+	"strings"
 
 	"github.com/nihei9/maleeni/ucd"
 )
 
+func normalizeCharacterProperty(propName, propVal string) (string, error) {
+	name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)]
+	if !ok {
+		return "", fmt.Errorf("unsupported character property name: %v", propName)
+	}
+	props, ok := derivedCoreProperties[name]
+	if !ok {
+		return "", nil
+	}
+	var b strings.Builder
+	yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+	if !ok {
+		return "", fmt.Errorf("unsupported character property value: %v", propVal)
+	}
+	if yes {
+		fmt.Fprint(&b, "[")
+	} else {
+		fmt.Fprint(&b, "[^")
+	}
+	for _, prop := range props {
+		fmt.Fprint(&b, prop)
+	}
+	fmt.Fprint(&b, "]")
+
+	return b.String(), nil
+}
+
 func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool, error) {
 	name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)]
 	if !ok {
@@ -33,6 +61,26 @@ func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool,
 			ranges = append(ranges, rs...)
 		}
 		return ranges, false, nil
+	case "olower":
+		yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+		if !ok {
+			return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+		}
+		if yes {
+			return otherLowercaseCodePoints, false, nil
+		} else {
+			return otherLowercaseCodePoints, true, nil
+		}
+	case "oupper":
+		yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
+		if !ok {
+			return nil, false, fmt.Errorf("unsupported character property value: %v", propVal)
+		}
+		if yes {
+			return otherUppercaseCodePoints, false, nil
+		} else {
+			return otherUppercaseCodePoints, true, nil
+		}
 	case "wspace":
 		yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)]
 		if !ok {
diff --git a/compiler/ucd_table.go b/compiler/ucd_table.go
index 4c3c723..545a9c8 100644
--- a/compiler/ucd_table.go
+++ b/compiler/ucd_table.go
@@ -24,10 +24,24 @@ var compositGeneralCategories = map[string][]string{
 	"c": {"cc", "cf", "cs", "co", "cn"},
 }
 
+// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
+var derivedCoreProperties = map[string][]string{
+	"lower": {`\p{Ll}`, `\p{Other_Lowercase=yes}`},
+	"upper": {`\p{Lu}`, `\p{Other_Uppercase=yes}`},
+}
+
 // https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
 var propertyNameAbbs = map[string]string{
 	"generalcategory": "gc",
 	"gc":              "gc",
+	"lowercase":       "lower",
+	"lower":           "lower",
+	"uppercase":       "upper",
+	"upper":           "upper",
+	"otherlowercase":  "olower",
+	"olower":          "olower",
+	"otheruppercase":  "oupper",
+	"oupper":          "oupper",
 	"whitespace":      "wspace",
 	"wspace":          "wspace",
 	"space":           "wspace",
@@ -4105,6 +4119,39 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{
 }
 
 // https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var otherLowercaseCodePoints = []*ucd.CodePointRange{
+	&ucd.CodePointRange{From: rune(170), To: rune(170)},
+	&ucd.CodePointRange{From: rune(186), To: rune(186)},
+	&ucd.CodePointRange{From: rune(688), To: rune(696)},
+	&ucd.CodePointRange{From: rune(704), To: rune(705)},
+	&ucd.CodePointRange{From: rune(736), To: rune(740)},
+	&ucd.CodePointRange{From: rune(837), To: rune(837)},
+	&ucd.CodePointRange{From: rune(890), To: rune(890)},
+	&ucd.CodePointRange{From: rune(7468), To: rune(7530)},
+	&ucd.CodePointRange{From: rune(7544), To: rune(7544)},
+	&ucd.CodePointRange{From: rune(7579), To: rune(7615)},
+	&ucd.CodePointRange{From: rune(8305), To: rune(8305)},
+	&ucd.CodePointRange{From: rune(8319), To: rune(8319)},
+	&ucd.CodePointRange{From: rune(8336), To: rune(8348)},
+	&ucd.CodePointRange{From: rune(8560), To: rune(8575)},
+	&ucd.CodePointRange{From: rune(9424), To: rune(9449)},
+	&ucd.CodePointRange{From: rune(11388), To: rune(11389)},
+	&ucd.CodePointRange{From: rune(42652), To: rune(42653)},
+	&ucd.CodePointRange{From: rune(42864), To: rune(42864)},
+	&ucd.CodePointRange{From: rune(43000), To: rune(43001)},
+	&ucd.CodePointRange{From: rune(43868), To: rune(43871)},
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var otherUppercaseCodePoints = []*ucd.CodePointRange{
+	&ucd.CodePointRange{From: rune(8544), To: rune(8559)},
+	&ucd.CodePointRange{From: rune(9398), To: rune(9423)},
+	&ucd.CodePointRange{From: rune(127280), To: rune(127305)},
+	&ucd.CodePointRange{From: rune(127312), To: rune(127337)},
+	&ucd.CodePointRange{From: rune(127344), To: rune(127369)},
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
 var whiteSpaceCodePoints = []*ucd.CodePointRange{
 	&ucd.CodePointRange{From: rune(9), To: rune(13)},
 	&ucd.CodePointRange{From: rune(32), To: rune(32)},
diff --git a/compiler/ucd_table.go.tmpl b/compiler/ucd_table.go.tmpl
index 80142c7..ad336d9 100644
--- a/compiler/ucd_table.go.tmpl
+++ b/compiler/ucd_table.go.tmpl
@@ -24,10 +24,24 @@ var compositGeneralCategories = map[string][]string{
 	"c": {"cc", "cf", "cs", "co", "cn"},
 }
 
+// https://www.unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt
+var derivedCoreProperties = map[string][]string{
+	"lower": {`\p{Ll}`, `\p{Other_Lowercase=yes}`},
+	"upper": {`\p{Lu}`, `\p{Other_Uppercase=yes}`},
+}
+
 // https://www.unicode.org/Public/13.0.0/ucd/PropertyAliases.txt
 var propertyNameAbbs = map[string]string{
 	"generalcategory": "gc",
 	"gc":              "gc",
+	"lowercase":       "lower",
+	"lower":           "lower",
+	"uppercase":       "upper",
+	"upper":           "upper",
+	"otherlowercase":  "olower",
+	"olower":          "olower",
+	"otheruppercase":  "oupper",
+	"oupper":          "oupper",
 	"whitespace":      "wspace",
 	"wspace":          "wspace",
 	"space":           "wspace",
@@ -59,6 +73,16 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{ {{ range $prop
 }
 
 // https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var otherLowercaseCodePoints = []*ucd.CodePointRange{ {{ range .PropList.OtherLowercase }}
+	&ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
+var otherUppercaseCodePoints = []*ucd.CodePointRange{ {{ range .PropList.OtherUppercase }}
+	&ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
+}
+
+// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
 var whiteSpaceCodePoints = []*ucd.CodePointRange{ {{ range .PropList.WhiteSpace }}
 	&ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }}
 }
diff --git a/ucd/prop_list.go b/ucd/prop_list.go
index cd4a7fe..1ceaaea 100644
--- a/ucd/prop_list.go
+++ b/ucd/prop_list.go
@@ -3,32 +3,43 @@ package ucd
 import "io"
 
 type PropList struct {
-	WhiteSpace []*CodePointRange
+	OtherLowercase []*CodePointRange
+	OtherUppercase []*CodePointRange
+	WhiteSpace     []*CodePointRange
 }
 
 // ParsePropList parses the PropList.txt.
 func ParsePropList(r io.Reader) (*PropList, error) {
+	var ol []*CodePointRange
+	var ou []*CodePointRange
 	var ws []*CodePointRange
 	p := newParser(r)
 	for p.parse() {
 		if len(p.fields) == 0 {
 			continue
 		}
-		if p.fields[1].symbol() != "White_Space" {
-			continue
-		}
-
+		
 		cp, err := p.fields[0].codePointRange()
 		if err != nil {
 			return nil, err
 		}
-		ws = append(ws, cp)
+
+		switch p.fields[1].symbol() {
+		case "Other_Lowercase":
+			ol = append(ol, cp)
+		case "Other_Uppercase":
+			ou = append(ou, cp)
+		case "White_Space":
+			ws = append(ws, cp)
+		}
 	}
 	if p.err != nil {
 		return nil, p.err
 	}
 
 	return &PropList{
-		WhiteSpace: ws,
+		OtherLowercase: ol,
+		OtherUppercase: ou,
+		WhiteSpace:     ws,
 	}, nil
 }