package parser

// NOTE(review): the "testing/internal/testdeps" import was removed; internal
// packages of the standard library cannot be imported by user code, so it
// could never compile (and the name was unused here). "os" and "reflect" are
// kept because code later in this file may still use them.
import (
	"fmt"
	"os"
	"reflect"
	"strings"
	"testing"

	spec "urubu/spec/grammar"
	"urubu/ucd"
)

// TestLexer checks that the lexer splits pattern source text into the
// expected token sequences, and that it reports the expected syntax errors
// for malformed input.
func TestLexer(t *testing.T) {
	tests := []struct {
		caption string
		src     string
		tokens  []*token
		err     error
	}{
		{
			caption: "lexer can recognize ordinaly characters",
			src:     "123abcいろは",
			tokens: []*token{
				newToken(tokenKindChar, '1'),
				newToken(tokenKindChar, '2'),
				newToken(tokenKindChar, '3'),
				newToken(tokenKindChar, 'a'),
				newToken(tokenKindChar, 'b'),
				newToken(tokenKindChar, 'c'),
				newToken(tokenKindChar, 'い'),
				newToken(tokenKindChar, 'ろ'),
				newToken(tokenKindChar, 'は'),
				newToken(tokenKindEOF, nullChar),
			},
		},
		{
			caption: "lexer can recognize the special characters in default mode",
			src:     ".*+?|()[\\u",
			tokens: []*token{
				newToken(tokenKindAnyChar, nullChar),
				newToken(tokenKindRepeat, nullChar),
				newToken(tokenKindRepeatOneOrMore, nullChar),
				newToken(tokenKindOption, nullChar),
				newToken(tokenKindAlt, nullChar),
				newToken(tokenKindGroupOpen, nullChar),
				newToken(tokenKindGroupClose, nullChar),
				newToken(tokenKindBExpOpen, nullChar),
				newToken(tokenKindCodePointLeader, nullChar),
				newToken(tokenKindEOF, nullChar),
			},
		},
		{
			caption: "lexer can recognize the escape sequences in default mode",
			src:     "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
			tokens: []*token{
				newToken(tokenKindChar, '\\'),
				newToken(tokenKindChar, '.'),
				newToken(tokenKindChar, '*'),
				newToken(tokenKindChar, '+'),
				newToken(tokenKindChar, '?'),
				newToken(tokenKindChar, '|'),
				newToken(tokenKindChar, '('),
				newToken(tokenKindChar, ')'),
				newToken(tokenKindChar, '['),
				newToken(tokenKindEOF, nullChar),
			},
		},
		{
			caption: "], {, and } are treated as an ordinary character in default mode",
			src:     "]{}",
			tokens: []*token{
				newToken(tokenKindChar, ']'),
				newToken(tokenKindChar, '{'),
				newToken(tokenKindChar, '}'),
				newToken(tokenKindEOF, nullChar),
			},
		},
		{
			caption: "lexer can recognize the special characters in bracket expression mode",
			src:     "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
			tokens: []*token{
				newToken(tokenKindBExpOpen,
nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("09AF"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("09abcf"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the escape sequences in bracket expression mode", src: "[\\^a\\-z]", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '^'), newToken(tokenKindChar, 'a'), newToken(tokenKindChar, '-'), newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "in a bracket expression, the special characters are also handled as normal characters", src: "[\\\\.*+?|()[", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '\\'), newToken(tokenKindChar, '.'), newToken(tokenKindChar, '*'), newToken(tokenKindChar, '+'), newToken(tokenKindChar, '?'), newToken(tokenKindChar, '|'), newToken(tokenKindChar, '('), newToken(tokenKindChar, ')'), newToken(tokenKindChar, '['), newToken(tokenKindEOF, nullChar), }, }, { caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters", // [...-...][...-][-...][-] // ~~~~~~~ ~ ~ ~ // ^ ^ ^ ^ // | | | `-- Ordinary Character (b) // | | `-- Ordinary Character (b) // | `-- Ordinary Character (b) // `-- Character Range (a) // // a. *-* is handled as a character-range expression. // b. *-, -*, or - are handled as ordinary characters. 
src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, '-'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "caret symbols that appear in bracket expressions are 
handled as the logical inverse symbol or ordinary characters", // [^...^...][^] // ~~ ~ ~~ // ^ ^ ^^ // | | |`-- Ordinary Character (c) // | | `-- Bracket Expression // | `-- Ordinary Character (b) // `-- Inverse Bracket Expression (a) // // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions. // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols. // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character. src: "[^^][^]", tokens: []*token{ newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, '^'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '^'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer raises an error when an invalid escape sequence appears", src: "\\@", err: synErrInvalidEscSeq, }, { caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", src: "\\", err: synErrIncompletedEscSeq, }, { caption: "lexer raises an error when an invalid escape sequence appears", src: "[\\@", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), }, err: synErrInvalidEscSeq, }, { caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", src: "[\\", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), }, err: synErrIncompletedEscSeq, }, { caption: "lexer can recognize the special characters and code points in code point expression mode", src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("0123"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), 
newToken(tokenKindLBrace, nullChar), newCodePointToken("4567"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("89abcd"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("efAB"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("CDEF01"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("0123"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("4567"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("89abcd"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("efAB"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("CDEF01"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("0123"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("4567"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("89abcd"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("efAB"), newToken(tokenKindRBrace, nullChar), 
newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("CDEF01"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "a one digit hex string isn't a valid code point", src: "\\u{0", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "a two digits hex string isn't a valid code point", src: "\\u{01", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "a three digits hex string isn't a valid code point", src: "\\u{012", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "a four digits hex string is a valid code point", src: "\\u{0123}", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("0123"), newToken(tokenKindRBrace, nullChar), }, }, { caption: "a five digits hex string isn't a valid code point", src: "\\u{01234", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "a six digits hex string is a valid code point", src: "\\u{012345}", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCodePointToken("012345"), newToken(tokenKindRBrace, nullChar), }, }, { caption: "a seven digits hex string isn't a valid code point", src: "\\u{0123456", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "a code point must be hex digits", src: "\\u{g", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: 
synErrInvalidCodePoint, }, { caption: "a code point must be hex digits", src: "\\u{G", tokens: []*token{ newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindLBrace, nullChar), }, err: synErrInvalidCodePoint, }, { caption: "lexer can recognize the special characters and symbols in character property expression mode", src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]", tokens: []*token{ newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("General_Category"), newToken(tokenKindEqual, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("General_Category"), newToken(tokenKindEqual, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindCharPropLeader, nullChar), newToken(tokenKindLBrace, nullChar), newCharPropSymbolToken("General_Category"), newToken(tokenKindEqual, nullChar), newCharPropSymbolToken("Letter"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the special characters and symbols in fragment expression mode", src: "\\f{integer}", tokens: []*token{ 
newToken(tokenKindFragmentLeader, nullChar), newToken(tokenKindLBrace, nullChar), newFragmentSymbolToken("integer"), newToken(tokenKindRBrace, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "a fragment expression is not supported in a bracket expression", src: "[\\f", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), }, err: synErrInvalidEscSeq, }, { caption: "a fragment expression is not supported in an inverse bracket expression", src: "[^\\f", tokens: []*token{ newToken(tokenKindInverseBExpOpen, nullChar), }, err: synErrInvalidEscSeq, }, } for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { lex := newLexer(strings.NewReader(tt.src)) var err error var tok *token i := 0 for { tok, err = lex.next() if err != nil { break } if i >= len(tt.tokens) { break } eTok := tt.tokens[i] i++ testToken(t, tok, eTok) if tok.kind == tokenKindEOF { break } } if tt.err != nil { if err != ParseErr { t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) } detail, cause := lex.error() if cause != tt.err { t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail) } } else { if err != nil { t.Fatalf("unexpected error: %v", err) } } if i < len(tt.tokens) { t.Fatalf("expecte more tokens") } }) } } func testToken(t *testing.T, a, e *token) { t.Helper() if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { t.Fatalf("unexpected token: want: %+v, got: %+v", e, a) } } func TestParse(t *testing.T) { tests := []struct { pattern string fragments map[spec.LexKindName]string ast CPTree syntaxError error // When an AST is large, as patterns containing a character property expression, this test only checks // that the pattern is parsable. The check of the validity of such AST is performed by checking that it // can be matched correctly using the driver. 
skipTestAST bool }{ { pattern: "a", ast: newSymbolNode('a'), }, { pattern: "abc", ast: genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), }, { pattern: "a?", ast: newOptionNode( newSymbolNode('a'), ), }, { pattern: "[abc]?", ast: newOptionNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), }, { pattern: "\\u{3042}?", ast: newOptionNode( newSymbolNode('\u3042'), ), }, { pattern: "\\p{Letter}?", skipTestAST: true, }, { pattern: "\\f{a2c}?", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, ast: newOptionNode( newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), ), }, { pattern: "(a)?", ast: newOptionNode( newSymbolNode('a'), ), }, { pattern: "((a?)?)?", ast: newOptionNode( newOptionNode( newOptionNode( newSymbolNode('a'), ), ), ), }, { pattern: "(abc)?", ast: newOptionNode( genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), }, { pattern: "(a|b)?", ast: newOptionNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), ), ), }, { pattern: "?", syntaxError: synErrRepNoTarget, }, { pattern: "(?)", syntaxError: synErrRepNoTarget, }, { pattern: "a|?", syntaxError: synErrRepNoTarget, }, { pattern: "?|b", syntaxError: synErrRepNoTarget, }, { pattern: "a??", syntaxError: synErrRepNoTarget, }, { pattern: "a*", ast: newRepeatNode( newSymbolNode('a'), ), }, { pattern: "[abc]*", ast: newRepeatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), }, { pattern: "\\u{3042}*", ast: newRepeatNode( newSymbolNode('\u3042'), ), }, { pattern: "\\p{Letter}*", skipTestAST: true, }, { pattern: "\\f{a2c}*", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, ast: newRepeatNode( newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), ), }, { pattern: "((a*)*)*", ast: newRepeatNode( newRepeatNode( newRepeatNode( newSymbolNode('a'), ), ), ), }, { pattern: "(abc)*", 
ast: newRepeatNode( genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), }, { pattern: "(a|b)*", ast: newRepeatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), ), ), }, { pattern: "*", syntaxError: synErrRepNoTarget, }, { pattern: "(*)", syntaxError: synErrRepNoTarget, }, { pattern: "a|*", syntaxError: synErrRepNoTarget, }, { pattern: "*|b", syntaxError: synErrRepNoTarget, }, { pattern: "a**", syntaxError: synErrRepNoTarget, }, { pattern: "a+", ast: genConcatNode( newSymbolNode('a'), newRepeatNode( newSymbolNode('a'), ), ), }, { pattern: "[abc]+", ast: genConcatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), newRepeatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), ), }, { pattern: "\\u{3042}+", ast: genConcatNode( newSymbolNode('\u3042'), newRepeatNode( newSymbolNode('\u3042'), ), ), }, { pattern: "\\p{Letter}+", skipTestAST: true, }, { pattern: "\\f{a2c}+", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, ast: genConcatNode( newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), newRepeatNode( newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), ), ), }, { pattern: "((a+)+)+", ast: genConcatNode( genConcatNode( genConcatNode( genConcatNode( newSymbolNode('a'), newRepeatNode( newSymbolNode('a'), ), ), newRepeatNode( genConcatNode( newSymbolNode('a'), newRepeatNode( newSymbolNode('a'), ), ), ), ), newRepeatNode( genConcatNode( genConcatNode( newSymbolNode('a'), newRepeatNode( newSymbolNode('a'), ), ), newRepeatNode( genConcatNode( newSymbolNode('a'), newRepeatNode( newSymbolNode('a'), ), ), ), ), ), ), ), }, { pattern: "(abc)+", ast: genConcatNode( genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), newRepeatNode( genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), ), }, { pattern: "(a|b)+", ast: 
genConcatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), ), newRepeatNode( genAltNode( newSymbolNode('a'), newSymbolNode('b'), ), ), ), }, { pattern: "+", syntaxError: synErrRepNoTarget, }, { pattern: "(+)", syntaxError: synErrRepNoTarget, }, { pattern: "a|+", syntaxError: synErrRepNoTarget, }, { pattern: "+|b", syntaxError: synErrRepNoTarget, }, { pattern: "a++", syntaxError: synErrRepNoTarget, }, { pattern: ".", ast: newRangeSymbolNode(0x00, 0x10FFFF), }, { pattern: "[a]", ast: newSymbolNode('a'), }, { pattern: "[abc]", ast: genAltNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), }, { pattern: "[a-z]", ast: newRangeSymbolNode('a', 'z'), }, { pattern: "[A-Za-z]", ast: genAltNode( newRangeSymbolNode('A', 'Z'), newRangeSymbolNode('a', 'z'), ), }, { pattern: "[\\u{004E}]", ast: newSymbolNode('N'), }, { pattern: "[\\u{0061}-\\u{007A}]", ast: newRangeSymbolNode('a', 'z'), }, { pattern: "[\\p{Lu}]", skipTestAST: true, }, { pattern: "[a-\\p{Lu}]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[\\p{Lu}-z]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[\\p{Lu}-\\p{Ll}]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[z-a]", syntaxError: synErrRangeInvalidOrder, }, { pattern: "a[]", syntaxError: synErrBExpNoElem, }, { pattern: "[]a", syntaxError: synErrBExpNoElem, }, { pattern: "[]", syntaxError: synErrBExpNoElem, }, { pattern: "[^\\u{004E}]", ast: genAltNode( newRangeSymbolNode(0x00, '\u004E'-1), newRangeSymbolNode('\u004E'+1, 0x10FFFF), ), }, { pattern: "[^\\u{0061}-\\u{007A}]", ast: genAltNode( newRangeSymbolNode(0x00, '\u0061'-1), newRangeSymbolNode('\u007A'+1, 0x10FFFF), ), }, { pattern: "[^\\p{Lu}]", skipTestAST: true, }, { pattern: "[^a-\\p{Lu}]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[^\\p{Lu}-z]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[^\\p{Lu}-\\p{Ll}]", syntaxError: synErrRangePropIsUnavailable, }, { pattern: "[^\\u{0000}-\\u{10FFFF}]", syntaxError: 
synErrUnmatchablePattern, }, { pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]", syntaxError: synErrUnmatchablePattern, }, { pattern: "[^]", ast: newSymbolNode('^'), }, { pattern: "[", syntaxError: synErrBExpUnclosed, }, { pattern: "([", syntaxError: synErrBExpUnclosed, }, { pattern: "[a", syntaxError: synErrBExpUnclosed, }, { pattern: "([a", syntaxError: synErrBExpUnclosed, }, { pattern: "[a-", syntaxError: synErrBExpUnclosed, }, { pattern: "([a-", syntaxError: synErrBExpUnclosed, }, { pattern: "[^", syntaxError: synErrBExpUnclosed, }, { pattern: "([^", syntaxError: synErrBExpUnclosed, }, { pattern: "[^a", syntaxError: synErrBExpUnclosed, }, { pattern: "([^a", syntaxError: synErrBExpUnclosed, }, { pattern: "[^a-", syntaxError: synErrBExpUnclosed, }, { pattern: "([^a-", syntaxError: synErrBExpUnclosed, }, { pattern: "]", ast: newSymbolNode(']'), }, { pattern: "(]", syntaxError: synErrGroupUnclosed, }, { pattern: "a]", ast: genConcatNode( newSymbolNode('a'), newSymbolNode(']'), ), }, { pattern: "(a]", syntaxError: synErrGroupUnclosed, }, { pattern: "([)", syntaxError: synErrBExpUnclosed, }, { pattern: "([a)", syntaxError: synErrBExpUnclosed, }, { pattern: "[a-]", ast: genAltNode( newSymbolNode('a'), newSymbolNode('-'), ), }, { pattern: "[^a-]", ast: genAltNode( newRangeSymbolNode(0x00, 0x2C), newRangeSymbolNode(0x2E, 0x60), newRangeSymbolNode(0x62, 0x10FFFF), ), }, { pattern: "[-z]", ast: genAltNode( newSymbolNode('-'), newSymbolNode('z'), ), }, { pattern: "[^-z]", ast: newAltNode( newRangeSymbolNode(0x00, 0x2C), newAltNode( newRangeSymbolNode(0x2E, 0x79), newRangeSymbolNode(0x7B, 0x10FFFF), ), ), }, { pattern: "[-]", ast: newSymbolNode('-'), }, { pattern: "[^-]", ast: genAltNode( newRangeSymbolNode(0x00, 0x2C), newRangeSymbolNode(0x2E, 0x10FFFF), ), }, { pattern: "[^01]", ast: genAltNode( newRangeSymbolNode(0x00, '0'-1), newRangeSymbolNode('1'+1, 0x10FFFF), ), }, { pattern: "[^10]", ast: genAltNode( newRangeSymbolNode(0x00, '0'-1), newRangeSymbolNode('1'+1, 
0x10FFFF), ), }, { pattern: "[^a-z]", ast: genAltNode( newRangeSymbolNode(0x00, 'a'-1), newRangeSymbolNode('z'+1, 0x10FFFF), ), }, { pattern: "[^az]", ast: genAltNode( newRangeSymbolNode(0x00, 'a'-1), genAltNode( newRangeSymbolNode('a'+1, 'z'-1), newRangeSymbolNode('z'+1, 0x10FFFF), ), ), }, { pattern: "\\u{006E}", ast: newSymbolNode('\u006E'), }, { pattern: "\\u{03BD}", ast: newSymbolNode('\u03BD'), }, { pattern: "\\u{306B}", ast: newSymbolNode('\u306B'), }, { pattern: "\\u{01F638}", ast: newSymbolNode('\U0001F638'), }, { pattern: "\\u{0000}", ast: newSymbolNode('\u0000'), }, { pattern: "\\u{10FFFF}", ast: newSymbolNode('\U0010FFFF'), }, { pattern: "\\u{110000}", syntaxError: synErrCPExpOutOfRange, }, { pattern: "\\u", syntaxError: synErrCPExpInvalidForm, }, { pattern: "\\u{", syntaxError: synErrCPExpInvalidForm, }, { pattern: "\\u{03BD", syntaxError: synErrCPExpInvalidForm, }, { pattern: "\\u{}", syntaxError: synErrCPExpInvalidForm, }, { pattern: "\\p{Letter}", skipTestAST: true, }, { pattern: "\\p{General_Category=Letter}", skipTestAST: true, }, { pattern: "\\p{ Letter }", skipTestAST: true, }, { pattern: "\\p{ General_Category = Letter }", skipTestAST: true, }, { pattern: "\\p", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{Letter", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{General_Category=}", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{General_Category= }", syntaxError: synErrCharPropInvalidSymbol, }, { pattern: "\\p{=Letter}", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{ =Letter}", syntaxError: synErrCharPropInvalidSymbol, }, { pattern: "\\p{=}", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\p{}", syntaxError: synErrCharPropExpInvalidForm, }, { pattern: "\\f{a2c}", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, ast: newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), 
newSymbolNode('c'), ), ), }, { pattern: "\\f{ a2c }", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, ast: newFragmentNode("a2c", genConcatNode( newSymbolNode('a'), newSymbolNode('b'), newSymbolNode('c'), ), ), }, { pattern: "\\f", syntaxError: synErrFragmentExpInvalidForm, }, { pattern: "\\f{", syntaxError: synErrFragmentExpInvalidForm, }, { pattern: "\\f{a2c", fragments: map[spec.LexKindName]string{ "a2c": "abc", }, syntaxError: synErrFragmentExpInvalidForm, }, { pattern: "(a)", ast: newSymbolNode('a'), }, { pattern: "(((a)))", ast: newSymbolNode('a'), }, { pattern: "a()", syntaxError: synErrGroupNoElem, }, { pattern: "()a", syntaxError: synErrGroupNoElem, }, { pattern: "()", syntaxError: synErrGroupNoElem, }, { pattern: "(", syntaxError: synErrGroupUnclosed, }, { pattern: "a(", syntaxError: synErrGroupUnclosed, }, { pattern: "(a", syntaxError: synErrGroupUnclosed, }, { pattern: "((", syntaxError: synErrGroupUnclosed, }, { pattern: "((a)", syntaxError: synErrGroupUnclosed, }, { pattern: ")", syntaxError: synErrGroupNoInitiator, }, { pattern: "a)", syntaxError: synErrGroupNoInitiator, }, { pattern: ")a", syntaxError: synErrGroupNoInitiator, }, { pattern: "))", syntaxError: synErrGroupNoInitiator, }, { pattern: "(a))", syntaxError: synErrGroupNoInitiator, }, { pattern: "Mulder|Scully", ast: genAltNode( genConcatNode( newSymbolNode('M'), newSymbolNode('u'), newSymbolNode('l'), newSymbolNode('d'), newSymbolNode('e'), newSymbolNode('r'), ), genConcatNode( newSymbolNode('S'), newSymbolNode('c'), newSymbolNode('u'), newSymbolNode('l'), newSymbolNode('l'), newSymbolNode('y'), ), ), }, { pattern: "Langly|Frohike|Byers", ast: genAltNode( genConcatNode( newSymbolNode('L'), newSymbolNode('a'), newSymbolNode('n'), newSymbolNode('g'), newSymbolNode('l'), newSymbolNode('y'), ), genConcatNode( newSymbolNode('F'), newSymbolNode('r'), newSymbolNode('o'), newSymbolNode('h'), newSymbolNode('i'), newSymbolNode('k'), newSymbolNode('e'), ), genConcatNode( newSymbolNode('B'), 
newSymbolNode('y'), newSymbolNode('e'), newSymbolNode('r'), newSymbolNode('s'), ), ), }, { pattern: "|", syntaxError: synErrAltLackOfOperand, }, { pattern: "||", syntaxError: synErrAltLackOfOperand, }, { pattern: "Mulder|", syntaxError: synErrAltLackOfOperand, }, { pattern: "|Scully", syntaxError: synErrAltLackOfOperand, }, { pattern: "Langly|Frohike|", syntaxError: synErrAltLackOfOperand, }, { pattern: "Langly||Byers", syntaxError: synErrAltLackOfOperand, }, { pattern: "|Frohike|Byers", syntaxError: synErrAltLackOfOperand, }, { pattern: "|Frohike|", syntaxError: synErrAltLackOfOperand, }, { pattern: "Fox(|)Mulder", syntaxError: synErrAltLackOfOperand, }, { pattern: "(Fox|)Mulder", syntaxError: synErrAltLackOfOperand, }, { pattern: "Fox(|Mulder)", syntaxError: synErrAltLackOfOperand, }, } for i, tt := range tests { t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { fragmentTrees := map[spec.LexKindName]CPTree{} for kind, pattern := range tt.fragments { p := NewParser(kind, strings.NewReader(pattern)) root, err := p.Parse() if err != nil { t.Fatal(err) } fragmentTrees[kind] = root } err := CompleteFragments(fragmentTrees) if err != nil { t.Fatal(err) } p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern)) root, err := p.Parse() if tt.syntaxError != nil { // printCPTree(os.Stdout, root, "", "") if err != ParseErr { t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) } _, synErr := p.Error() if synErr != tt.syntaxError { t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr) } if root != nil { t.Fatalf("tree must be nil") } } else { if err != nil { detail, cause := p.Error() t.Fatalf("%v: %v: %v", err, cause, detail) } if root == nil { t.Fatal("tree must be non-nil") } complete, err := ApplyFragments(root, fragmentTrees) if err != nil { t.Fatal(err) } if !complete { t.Fatalf("incomplete fragments") } // printCPTree(os.Stdout, root, "", "") if !tt.skipTestAST { r := root.(*rootNode) testAST(t, tt.ast, 
					r.tree)
				}
			}
		})
	}
}

// TestParse_ContributoryPropertyIsNotExposed checks that contributory
// properties are rejected in character property expressions (\p{...}) with
// synErrCharPropUnsupported and that no tree is produced.
func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
	for _, cProp := range ucd.ContributoryProperties() {
		t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
			p := NewParser(spec.LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp)))
			root, err := p.Parse()
			if err == nil {
				t.Fatalf("expected syntax error: got: nil")
			}
			_, synErr := p.Error()
			if synErr != synErrCharPropUnsupported {
				t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr)
			}
			if root != nil {
				t.Fatalf("tree is not nil")
			}
		})
	}
}

// TestExclude checks range subtraction (base - target) for every relative
// position of the target range against the base range.
func TestExclude(t *testing.T) {
	for _, test := range []struct {
		caption string
		target  CPTree
		base    CPTree
		result  CPTree
	}{
		// t.From > b.From && t.To < b.To

		// |t.From - b.From| = 1
		// |b.To - t.To| = 1
		//
		// Target (t):        +--+
		// Base (b):       +--+--+--+
		// Result (b - t): +--+  +--+
		{
			caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1",
			target:  newSymbolNode('1'),
			base:    newRangeSymbolNode('0', '2'),
			result: newAltNode(
				newSymbolNode('0'),
				newSymbolNode('2'),
			),
		},
		// |t.From - b.From| > 1
		// |b.To - t.To| > 1
		//
		// Target (t):           +--+
		// Base (b):       +--+--+--+--+--+
		// Result (b - t): +--+--+  +--+--+
		{
			caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1",
			target:  newSymbolNode('2'),
			base:    newRangeSymbolNode('0', '4'),
			result: newAltNode(
				newRangeSymbolNode('0', '1'),
				newRangeSymbolNode('3', '4'),
			),
		},
		// t.From <= b.From && t.To >= b.From && t.To < b.To

		// |b.From - t.From| = 0
		// |t.To - b.From| = 0
		// |b.To - t.To| = 1
		//
		// Target (t):     +--+
		// Base (b):       +--+--+
		// Result (b - t):    +--+
		{
			caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
			target:  newSymbolNode('0'),
			base:    newRangeSymbolNode('0', '1'),
			result:  newSymbolNode('1'),
		},
		// |b.From - t.From| = 0
		// |t.To - b.From| = 0
		// |b.To - t.To| > 1
		//
		// Target (t):     +--+
		// Base (b):       +--+--+--+
		// Result (b - t):    +--+--+
		{
			caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
			target:
newSymbolNode('0'), base: newRangeSymbolNode('0', '2'), result: newRangeSymbolNode('1', '2'), }, // |b.From - t.From| = 0 // |t.To - b.From| > 0 // |b.To - t.To| = 1 // // Target (t): +--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+ { caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", target: newRangeSymbolNode('0', '1'), base: newRangeSymbolNode('0', '2'), result: newSymbolNode('2'), }, // |b.From - t.From| = 0 // |t.To - b.From| > 0 // |b.To - t.To| > 1 // // Target (t): +--+--+ // Base (b): +--+--+--+--+ // Result (b - t): +--+--+ { caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", target: newRangeSymbolNode('0', '1'), base: newRangeSymbolNode('0', '3'), result: newRangeSymbolNode('2', '3'), }, // |b.From - t.From| > 0 // |t.To - b.From| = 0 // |b.To - t.To| = 1 // // Target (t): +--+--+ // Base (b): +--+--+ // Result (b - t): +--+ { caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", target: newRangeSymbolNode('0', '1'), base: newRangeSymbolNode('1', '2'), result: newSymbolNode('2'), }, // |b.From - t.From| > 0 // |t.To - b.From| = 0 // |b.To - t.To| > 1 // // Target (t): +--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+--+ { caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", target: newRangeSymbolNode('0', '1'), base: newRangeSymbolNode('1', '3'), result: newRangeSymbolNode('2', '3'), }, // |b.From - t.From| > 0 // |t.To - b.From| > 0 // |b.To - t.To| = 1 // // Target (t): +--+--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+ { caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", target: newRangeSymbolNode('0', '2'), base: newRangeSymbolNode('1', '3'), result: newSymbolNode('3'), }, // |b.From - t.From| > 0 // |t.To - b.From| > 0 // |b.To - t.To| > 1 // // Target (t): +--+--+--+ // Base (b): +--+--+--+--+ // Result (b - t): +--+--+ { caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", 
target: newRangeSymbolNode('0', '2'), base: newRangeSymbolNode('1', '4'), result: newRangeSymbolNode('3', '4'), }, // t.From > b.From && t.From <= b.To && t.To >= b.To // |t.From - b.From| = 1 // |b.To - t.From| = 0 // |t.To - b.To| = 0 // // Target (t): +--+ // Base (b): +--+--+ // Result (b - t): +--+ { caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", target: newSymbolNode('1'), base: newRangeSymbolNode('0', '1'), result: newSymbolNode('0'), }, // |t.From - b.From| = 1 // |b.To - t.From| = 0 // |t.To - b.To| > 0 // // Target (t): +--+--+ // Base (b): +--+--+ // Result (b - t): +--+ { caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", target: newRangeSymbolNode('1', '2'), base: newRangeSymbolNode('0', '1'), result: newSymbolNode('0'), }, // |t.From - b.From| = 1 // |b.To - t.From| > 0 // |t.To - b.To| = 0 // // Target (t): +--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+ { caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", target: newRangeSymbolNode('1', '2'), base: newRangeSymbolNode('0', '2'), result: newSymbolNode('0'), }, // |t.From - b.From| = 1 // |b.To - t.From| > 0 // |t.To - b.To| > 0 // // Target (t): +--+--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+ { caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", target: newRangeSymbolNode('1', '3'), base: newRangeSymbolNode('0', '2'), result: newSymbolNode('0'), }, // |t.From - b.From| > 1 // |b.To - t.From| = 0 // |t.To - b.To| = 0 // // Target (t): +--+ // Base (b): +--+--+--+ // Result (b - t): +--+--+ { caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", target: newSymbolNode('2'), base: newRangeSymbolNode('0', '2'), result: newRangeSymbolNode('0', '1'), }, // |t.From - b.From| > 1 // |b.To - t.From| = 0 // |t.To - b.To| > 0 // // Target (t): +--+--+ // Base (b): +--+--+--+ // Result (b - t): +--+--+ { caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && 
|t.To - b.To| > 0", target: newRangeSymbolNode('2', '3'), base: newRangeSymbolNode('0', '2'), result: newRangeSymbolNode('0', '1'), }, // |t.From - b.From| > 1 // |b.To - t.From| > 0 // |t.To - b.To| = 0 // // Target (t): +--+--+ // Base (b): +--+--+--+--+ // Result (b - t): +--+--+ { caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", target: newRangeSymbolNode('2', '3'), base: newRangeSymbolNode('0', '3'), result: newRangeSymbolNode('0', '1'), }, // |t.From - b.From| > 1 // |b.To - t.From| > 0 // |t.To - b.To| > 0 // // Target (t): +--+--+--+ // Base (b): +--+--+--+--+ // Result (b - t): +--+--+ { caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", target: newRangeSymbolNode('2', '4'), base: newRangeSymbolNode('0', '3'), result: newRangeSymbolNode('0', '1'), }, // t.From <= b.From && t.To >= b.To // |b.From - t.From| = 0 // |t.To - b.To| = 0 // // Target (t): +--+ // Base (b): +--+ // Result (b - t): N/A { caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0", target: newSymbolNode('0'), base: newSymbolNode('0'), result: nil, }, // |b.From - t.From| = 0 // |t.To - b.To| > 0 // // Target (t): +--+--+ // Base (b): +--+ // Result (b - t): N/A { caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0", target: newRangeSymbolNode('0', '1'), base: newSymbolNode('0'), result: nil, }, // |b.From - t.From| > 0 // |t.To - b.To| = 0 // // Target (t): +--+--+ // Base (b): +--+ // Result (b - t): N/A { caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0", target: newRangeSymbolNode('0', '1'), base: newSymbolNode('1'), result: nil, }, // |b.From - t.From| > 0 // |t.To - b.To| > 0 // // Target (t): +--+--+--+ // Base (b): +--+ // Result (b - t): N/A { caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0", target: newRangeSymbolNode('0', '2'), base: newSymbolNode('1'), result: nil, }, // Others // |b.From - t.From| = 1 // // Target (t): +--+ // Base (b): +--+ // Result (b - t): +--+ { caption: "|b.From - t.From| = 1", target: 
newSymbolNode('0'), base: newSymbolNode('1'), result: newSymbolNode('1'), }, // |b.From - t.From| > 1 // // Target (t): +--+ // Base (b): +--+ // Result (b - t): +--+ { caption: "|b.From - t.From| > 1", target: newSymbolNode('0'), base: newSymbolNode('2'), result: newSymbolNode('2'), }, // |t.To - b.To| = 1 // // Target (t): +--+ // Base (b): +--+ // Result (b - t): +--+ { caption: "|t.To - b.To| = 1", target: newSymbolNode('1'), base: newSymbolNode('0'), result: newSymbolNode('0'), }, // |t.To - b.To| > 1 // // Target (t): +--+ // Base (b): +--+ // Result (b - t): +--+ { caption: "|t.To - b.To| > 1", target: newSymbolNode('2'), base: newSymbolNode('0'), result: newSymbolNode('0'), }, } { t.Run(test.caption, func(t *testing.T) { r := exclude(test.target, test.base) testAST(t, test.result, r) }) } } func testAST(t *testing.T, expected, actual CPTree) { t.Helper() aTy := reflect.TypeOf(actual) eTy := reflect.TypeOf(expected) if eTy != aTy { t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy) } if actual == nil { return } switch e := expected.(type) { case *symbolNode: a := actual.(*symbolNode) if a.From != e.From || a.To != e.To { t.Fatalf("unexpected node: want: %+v, got: %+v", e, a) } } eLeft, eRight := expected.children() aLeft, aRight := actual.children() testAST(t, eLeft, aLeft) testAST(t, eRight, aRight) } func MainTest() { tests := []testing.InternalTest{ { "TestLexer", TestLexer }, { "TestParse", TestParse }, { "TestParse_ContributoryPropertyIsNotExposed", TestParse_ContributoryPropertyIsNotExposed }, { "TestExclude", TestExclude }, } deps := testdeps.TestDeps{} benchmarks := []testing.InternalBenchmark {} fuzzTargets := []testing.InternalFuzzTarget{} examples := []testing.InternalExample {} m := testing.MainStart(deps, tests, benchmarks, fuzzTargets, examples) os.Exit(m.Run()) }