diff options
-rw-r--r-- | compiler/parser.go | 31 | ||||
-rw-r--r-- | compiler/parser_test.go | 32 | ||||
-rw-r--r-- | ucd/api.go | 13 | ||||
-rw-r--r-- | ucd/property.go | 14 |
4 files changed, 89 insertions, 1 deletion
diff --git a/compiler/parser.go b/compiler/parser.go index 980e65c..89c8301 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -251,14 +251,38 @@ type parser struct { lastTok *token incomplete bool errMsgDetails string + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. + // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. + // > * Contributory properties are not recommended for public APIs. + // > ... 
+ // https://unicode.org/reports/tr44/#Property_APIs + isContributoryPropertyExposed bool } func newParser(src io.Reader) *parser { return &parser{ - lex: newLexer(src), + lex: newLexer(src), + isContributoryPropertyExposed: false, } } +func (p *parser) exposeContributoryProperty() { + p.isContributoryPropertyExposed = true +} + func (p *parser) parse() (ast astNode, retErr error) { defer func() { err := recover() @@ -548,6 +572,10 @@ func (p *parser) parseCharProp() astNode { propName = "" propVal = sym1 } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.errMsgDetails = propName + raiseSyntaxError(synErrCharPropUnsupported) + } pat, err := ucd.NormalizeCharacterProperty(propName, propVal) if err != nil { p.errMsgDetails = fmt.Sprintf("%v", err) @@ -555,6 +583,7 @@ func (p *parser) parseCharProp() astNode { } if pat != "" { p := newParser(bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() ast, err := p.parse() if err != nil { panic(err) diff --git a/compiler/parser_test.go b/compiler/parser_test.go index 7c33fb4..e4a6fe2 100644 --- a/compiler/parser_test.go +++ b/compiler/parser_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/nihei9/maleeni/spec" + "github.com/nihei9/maleeni/ucd" ) func symPos(n uint16) symbolPosition { @@ -1241,6 +1242,37 @@ func TestParse(t *testing.T) { } } +func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) { + for _, cProp := range ucd.ContributoryProperties() { + t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) { + ast, _, err := parse([]*patternEntry{ + { + id: spec.LexModeKindIDMin, + pattern: []byte(fmt.Sprintf(`\p{%v=yes}`, cProp)), + }, + }, nil) + if err == nil { + t.Fatalf("expected syntax error; got: nil") + } + parseErrs, ok := err.(*ParseErrors) + if !ok { + t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err) + } + parseErr := parseErrs.Errors[0].Cause + synErr, ok := parseErr.(*SyntaxError) + if !ok { + t.Fatalf("expected SyntaxError; got: %v (type: %T)", 
parseErr, parseErr) + } + if synErr != synErrCharPropUnsupported { + t.Fatalf("unexpected syntax error; want: %v, got: %v", synErrCharPropUnsupported, synErr) + } + if ast != nil { + t.Fatalf("ast is not nil") + } + }) + } +} + func TestParse_FollowAndSymbolTable(t *testing.T) { root, symTab, err := parse([]*patternEntry{ { diff --git a/ucd/api.go b/ucd/api.go --- a/ucd/api.go +++ b/ucd/api.go @@ -39,6 +39,19 @@ func NormalizeCharacterProperty(propName, propVal string) (string, error) { return b.String(), nil } +func IsContributoryProperty(propName string) bool { + if propName == "" { + return false + } + + for _, p := range contributoryProperties { + if propName == p { + return true + } + } + return false +} + func FindCodePointRanges(propName, propVal string) ([]*CodePointRange, bool, error) { if propName == "" { propName = "gc" diff --git a/ucd/property.go b/ucd/property.go index 71bbcad..0b2ac62 100644 --- a/ucd/property.go +++ b/ucd/property.go @@ -1,5 +1,19 @@ package ucd +// contributoryProperties is a set of contributory properties maleeni uses internally. Property statuses are +// defined in the following table. +// +// https://unicode.org/reports/tr44/#Property_List_Table +var contributoryProperties = []string{ + "oalpha", + "olower", + "oupper", +} + +func ContributoryProperties() []string { + return contributoryProperties +} + // https://www.unicode.org/reports/tr44/#GC_Values_Table var compositGeneralCategories = map[string][]string{ // Cased_Letter |