diff options
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/parser.go | 30 | ||||
-rw-r--r-- | compiler/ucd.go | 53 | ||||
-rw-r--r-- | compiler/ucd_table.go | 31 | ||||
-rw-r--r-- | compiler/ucd_table.go.tmpl | 21 |
4 files changed, 110 insertions, 25 deletions
diff --git a/compiler/parser.go b/compiler/parser.go index 55b8238..dd73c28 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -546,20 +546,38 @@ func (p *parser) parseCharProp() astNode { propName = "gc" propVal = sym1 } - cpRanges, err := findCodePointRanges(propName, propVal) + cpRanges, inverse, err := findCodePointRanges(propName, propVal) if err != nil { p.errMsgDetails = fmt.Sprintf("%v", err) raiseSyntaxError(synErrCharPropUnsupported) } var alt astNode - for _, r := range cpRanges { + if inverse { + r := cpRanges[0] from := genNormalCharAST(r.From) to := genNormalCharAST(r.To) - alt = genAltNode( - alt, - genRangeAST(from, to), - ) + alt = exclude(genRangeAST(from, to), genAnyCharAST()) + if alt == nil { + panic(fmt.Errorf("a pattern that isn't matching any symbols")) + } + for _, r := range cpRanges[1:] { + from := genNormalCharAST(r.From) + to := genNormalCharAST(r.To) + alt = exclude(genRangeAST(from, to), alt) + if alt == nil { + panic(fmt.Errorf("a pattern that isn't matching any symbols")) + } + } + } else { + for _, r := range cpRanges { + from := genNormalCharAST(r.From) + to := genNormalCharAST(r.To) + alt = genAltNode( + alt, + genRangeAST(from, to), + ) + } } if !p.consume(tokenKindRBrace) { diff --git a/compiler/ucd.go b/compiler/ucd.go index 506f03a..5ad0986 100644 --- a/compiler/ucd.go +++ b/compiler/ucd.go @@ -9,28 +9,43 @@ import ( "github.com/nihei9/maleeni/ucd" ) -func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, error) { - name := ucd.NormalizeSymbolicValue(propName) - val := ucd.NormalizeSymbolicValue(propVal) - name, ok := propertyNameAbbs[name] +func findCodePointRanges(propName, propVal string) ([]*ucd.CodePointRange, bool, error) { + name, ok := propertyNameAbbs[ucd.NormalizeSymbolicValue(propName)] if !ok { - return nil, fmt.Errorf("unsupported character property: %v", propName) + return nil, false, fmt.Errorf("unsupported character property name: %v", propName) } - val, ok = generalCategoryValueAbbs[val] - if !ok { - return nil, fmt.Errorf("unsupported character property value: %v", val) - } - vals, ok := compositGeneralCategories[val] - if !ok { - vals = []string{val} - } - var ranges []*ucd.CodePointRange - for _, v := range vals { - rs, ok := generalCategoryCodePoints[v] + switch name { + case "gc": + val, ok := generalCategoryValueAbbs[ucd.NormalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + vals, ok := compositGeneralCategories[val] if !ok { - return nil, fmt.Errorf("invalie value of the General_Category property: %v", v) + vals = []string{val} + } + var ranges []*ucd.CodePointRange + for _, v := range vals { + rs, ok := generalCategoryCodePoints[v] + if !ok { + return nil, false, fmt.Errorf("invalid value of the General_Category property: %v", v) + } + ranges = append(ranges, rs...) + } + return ranges, false, nil + case "wspace": + yes, ok := binaryValues[ucd.NormalizeSymbolicValue(propVal)] + if !ok { + return nil, false, fmt.Errorf("unsupported character property value: %v", propVal) + } + if yes { + return whiteSpaceCodePoints, false, nil + } else { + return whiteSpaceCodePoints, true, nil } - ranges = append(ranges, rs...) } - return ranges, nil + + // If the process reaches this code, it's a bug. We must handle all of the properties registered with + // the `propertyNameAbbs`. + return nil, false, fmt.Errorf("character property '%v' is unavailable", propName) } diff --git a/compiler/ucd_table.go b/compiler/ucd_table.go index c941f4c..4c3c723 100644 --- a/compiler/ucd_table.go +++ b/compiler/ucd_table.go @@ -28,6 +28,22 @@ var compositGeneralCategories = map[string][]string{ var propertyNameAbbs = map[string]string{ "generalcategory": "gc", "gc": "gc", + "whitespace": "wspace", + "wspace": "wspace", + "space": "wspace", +} + +// https://www.unicode.org/reports/tr44/#Type_Key_Table +// https://www.unicode.org/reports/tr44/#Binary_Values_Table +var binaryValues = map[string]bool{ + "yes": true, + "y": true, + "true": true, + "t": true, + "no": false, + "n": false, + "false": false, + "f": false, } // https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt @@ -4087,3 +4103,18 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{ &ucd.CodePointRange{From: rune(12288), To: rune(12288)}, }, } + +// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt +var whiteSpaceCodePoints = []*ucd.CodePointRange{ + &ucd.CodePointRange{From: rune(9), To: rune(13)}, + &ucd.CodePointRange{From: rune(32), To: rune(32)}, + &ucd.CodePointRange{From: rune(133), To: rune(133)}, + &ucd.CodePointRange{From: rune(160), To: rune(160)}, + &ucd.CodePointRange{From: rune(5760), To: rune(5760)}, + &ucd.CodePointRange{From: rune(8192), To: rune(8202)}, + &ucd.CodePointRange{From: rune(8232), To: rune(8232)}, + &ucd.CodePointRange{From: rune(8233), To: rune(8233)}, + &ucd.CodePointRange{From: rune(8239), To: rune(8239)}, + &ucd.CodePointRange{From: rune(8287), To: rune(8287)}, + &ucd.CodePointRange{From: rune(12288), To: rune(12288)}, +} diff --git a/compiler/ucd_table.go.tmpl b/compiler/ucd_table.go.tmpl index a364191..80142c7 100644 --- a/compiler/ucd_table.go.tmpl +++ b/compiler/ucd_table.go.tmpl @@ -28,6 +28,22 @@ var compositGeneralCategories = map[string][]string{ var propertyNameAbbs = map[string]string{ "generalcategory": "gc", "gc": "gc", + "whitespace": "wspace", + "wspace": "wspace", + "space": "wspace", +} + +// https://www.unicode.org/reports/tr44/#Type_Key_Table +// https://www.unicode.org/reports/tr44/#Binary_Values_Table +var binaryValues = map[string]bool{ + "yes": true, + "y": true, + "true": true, + "t": true, + "no": false, + "n": false, + "false": false, + "f": false, } // https://www.unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt @@ -41,3 +57,8 @@ var generalCategoryCodePoints = map[string][]*ucd.CodePointRange{ {{ range $prop &ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} },{{ end }} } + +// https://www.unicode.org/Public/13.0.0/ucd/PropList.txt +var whiteSpaceCodePoints = []*ucd.CodePointRange{ {{ range .PropList.WhiteSpace }} + &ucd.CodePointRange{From: rune({{ .From }}), To: rune({{ .To }})},{{ end }} +} |