diff options
author | EuAndreh <eu@euandre.org> | 2025-07-15 21:37:16 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2025-07-15 21:44:55 -0300 |
commit | 1ce80e005a374488c186d0f545af33096f6523d5 (patch) | |
tree | 997f02f05a40d1fab2c9854dfdf5c687b8bb47c7 | |
parent | Support tokenizing `.` wildcard operator. (diff) | |
download | paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.gz paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.xz |
Only tolerate escaping of special chars
* src/paca.mjs
(escapingStateStep): Return an error when escaping non-metacharacters.
This way cases like \d, which is syntax for [0-9] that will
eventually be recognized, will not change their behaviour from a noop
escape of "d" to matching digits.
(operatorChars, isOperator): Hoist both of these up before their usage
in `escapingStateStep()`.
* tests/paca.mjs
(test_isOperator): Hoist its definition and position inside the
`runTests([...])` array to match src/paca.mjs.
(test_escapingStateStep): Adjust existing cases and add test case for
good/bad escapes.
(test_tokenizeRegexStep): Fix bad starting escape, which broke because
it was escaping a non-metacharacter.
-rw-r--r-- | src/paca.mjs | 34 | ||||
-rw-r--r-- | tests/paca.mjs | 63 |
2 files changed, 59 insertions, 38 deletions
diff --git a/src/paca.mjs b/src/paca.mjs index 5d11b05..7a01407 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -24,19 +24,33 @@ const shouldConcat = (char, next) => char !== "{" && !nonConcatOperators.has(next); +const operatorChars = new Set([...nonConcatOperators, "(", "."]); +const isOperator = char => + operatorChars.has(char); + const numFromDigits = digits => digits.length === 0 ? -1 : Number(digits.join("")); -const escapingStateStep = ({ out, _state, context }, char, _index, next) => ({ - out: out.concat( - char, - shouldConcat(null, next) ? [{ operator: "concat" }] : [], - ), - state: ConcatStep.ACCEPTING, - context, -}); +const escapingStateStep = ({ out, state, context }, char, _index, next) => + !(isOperator(char) || char === "\\") + ? reduced({ + out, + state, + context, + error: new SyntaxError( + "unknown escape sequence: \\" + char, + ), + }) + : { + out: out.concat( + char, + shouldConcat(null, next) ? [{ operator: "concat" }] : [], + ), + state: ConcatStep.ACCEPTING, + context, + }; const rangeStateStep = ({ out, state, context }, char, _index, _next) => { if (char === "}") { @@ -313,10 +327,6 @@ const transitionChars = new Set(Object.keys(TRANSITION_FNS)); const isTransition = char => transitionChars.has(char); -const operatorChars = new Set([...nonConcatOperators, "(", "."]); -const isOperator = char => - operatorChars.has(char); - const tokenizeRegexStep = chars => ({ out, state, context }, char, index) => { const next = chars[index + 1]; diff --git a/tests/paca.mjs b/tests/paca.mjs index bc6e1d0..789f959 100644 --- a/tests/paca.mjs +++ b/tests/paca.mjs @@ -5,6 +5,7 @@ import { ValueError, ConcatStep, shouldConcat, + isOperator, numFromDigits, escapingStateStep, rangeStateStep, @@ -13,7 +14,6 @@ import { ANCHOR_FNS, isAnchor, isTransition, - isOperator, tokenizeRegexStep, tokenizeRegexFn, tokenizeRegex, @@ -84,6 +84,25 @@ const test_shouldConcat = t => { }); }; +const test_isOperator = t => { + t.start("isOperator()"); + + t.testing("operators 
and open parens are true", () => { + t.assertEq(isOperator("*"), true); + t.assertEq(isOperator("|"), true); + t.assertEq(isOperator("+"), true); + t.assertEq(isOperator("?"), true); + t.assertEq(isOperator("("), true); + t.assertEq(isOperator(")"), true); + t.assertEq(isOperator("."), true); + }); + + t.testing("false for everyday non-meta chars", () => { + t.assertEq(isOperator("a"), false); + t.assertEq(isOperator("_"), false); + }); +}; + const test_numFromDigits = t => { t.start("numFromDigits()"); @@ -105,12 +124,12 @@ const test_escapingStateStep = t => { t.testing("add a concat when applicable", () => { const given = escapingStateStep( { out: [ 1, 2, 3 ] }, - "a", + "*", null, "b", ); const expected = { - out: [ 1, 2, 3, "a", { operator: "concat" } ], + out: [ 1, 2, 3, "*", { operator: "concat" } ], state: "accepting", context: undefined, }; @@ -120,17 +139,28 @@ const test_escapingStateStep = t => { t.testing("without a concat when not applicable", () => { const given = escapingStateStep( { out: [ 1, 2, 3 ] }, - "a", + "$", null, ")", ); const expected = { - out: [ 1, 2, 3, "a" ], + out: [ 1, 2, 3, "$" ], state: "accepting", context: undefined, }; t.assertEq(given, expected); }); + + t.testing("error when escaping a non-escapeable char", () => { + const { value: { error }} = escapingStateStep( + {}, + "a", + null, + null, + ); + t.assertEq(error.message, "unknown escape sequence: \\a"), + t.assertEq(error instanceof SyntaxError, true); + }); }; const test_rangeStateStep = t => { @@ -635,25 +665,6 @@ const test_isTransition = t => { }); }; -const test_isOperator = t => { - t.start("isOperator()"); - - t.testing("operators and open parens are true", () => { - t.assertEq(isOperator("*"), true); - t.assertEq(isOperator("|"), true); - t.assertEq(isOperator("+"), true); - t.assertEq(isOperator("?"), true); - t.assertEq(isOperator("("), true); - t.assertEq(isOperator(")"), true); - t.assertEq(isOperator("."), true); - }); - - t.testing("false for everyday 
non-meta chars", () => { - t.assertEq(isOperator("a"), false); - t.assertEq(isOperator("_"), false); - }); -}; - const test_tokenizeRegexStep = t => { t.start("tokenizeRegexStep()"); @@ -671,7 +682,7 @@ const test_tokenizeRegexStep = t => { const stepFn = tokenizeRegexStep(regex); const steps = [{ out: [], - state: ConcatStep.ESCAPING, + state: ConcatStep.ACCEPTING, context: null, }, { out: ["a", cat], @@ -3041,6 +3052,7 @@ const test_compile = t => { runTests([ test_shouldConcat, + test_isOperator, test_numFromDigits, test_escapingStateStep, test_rangeStateStep, @@ -3049,7 +3061,6 @@ runTests([ test_ANCHOR_FNS, test_isAnchor, test_isTransition, - test_isOperator, test_tokenizeRegexStep, test_tokenizeRegexFn, test_tokenizeRegex, |