diff options
author | EuAndreh <eu@euandre.org> | 2025-07-16 07:18:42 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2025-07-16 07:18:42 -0300 |
commit | 5d135ee551fa29574cbd558b4feaa46328d54bd4 (patch) | |
tree | d2a3d79b563227b92d40b9184c1c70f9f2185be3 | |
parent | Only tolerate escaping of special chars (diff) | |
download | paca-5d135ee551fa29574cbd558b4feaa46328d54bd4.tar.gz paca-5d135ee551fa29574cbd558b4feaa46328d54bd4.tar.xz |
Differentiate an "operator" from a "meta" character
The character class `[a-z]`, and especially the wildcard `.`, aren't
operators: they really do represent themselves with their own special
semantics, and they take no operands. So instead of having the "operator"
type behave in two ways, with and without arguments, we instead have
this new type, the "meta" character. Just like the literal
character, the metacharacter represents itself, and also takes no
argument. We can also leave the precedence parsing of operators untouched,
rather than tainting it with special conditions for "." and "class", since they
should behave just like literal characters: be pushed directly onto the
stack.
As of now, there are only 2 meta characters: "class" and ".".
* src/paca.mjs
(operatorChars): Remove "." from the set of operator characters.
(classStateStep): Return `{ meta: "class" }` instead of
`{ operator: "class" }`.
(isMeta): Add equivalent to `isTransition()` and `isOperator()`.
(opFor, tokenizeRegexStep): Add new `opFor()` function for classifying
a given character, choosing between an operator, a metacharacter and
a literal character, and use this function in the body of
`tokenizeRegexStep()`.
(PRECEDENCE): Remove early entry of precedence values for "class" and
".".
(toPostfixStep): Instead of just checking if a character is a literal
one before pushing it onto the stack, check that it isn't an
operator just by checking if it is an object that has the `operator`
attribute.
* tests/paca.mjs
(test_isOperator): Remove test case for ".", as it is no longer
considered an operator.
(test_classStateStep, test_tokenizeRegexStep): Update to rename from `{ operator: "class" }` to
`{ meta: "class" }`.
(test_toPostfixStep, test_toPostfix): Add test cases for meta
characters.
(test_OPERATORS_FNS): BONUS - Use direct assignment to reset the array
to an empty value instead of `arr.splice(0)`.
-rw-r--r-- | src/paca.mjs | 22 | ||||
-rw-r--r-- | tests/paca.mjs | 74 |
2 files changed, 72 insertions, 24 deletions
diff --git a/src/paca.mjs b/src/paca.mjs index 7a01407..ccff1ca 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -24,7 +24,7 @@ const shouldConcat = (char, next) => char !== "{" && !nonConcatOperators.has(next); -const operatorChars = new Set([...nonConcatOperators, "(", "."]); +const operatorChars = new Set([...nonConcatOperators, "("]); const isOperator = char => operatorChars.has(char); @@ -167,8 +167,8 @@ const classStateStep = ({ out, state, context }, char, _index, _next) => { return { out: out.concat({ - operator: "class", - set: context.set, + meta: "class", + set: context.set, }), state: ConcatStep.ACCEPTING, context: null, @@ -327,6 +327,15 @@ const transitionChars = new Set(Object.keys(TRANSITION_FNS)); const isTransition = char => transitionChars.has(char); +const metaChars = new Set(["."]); +const isMeta = char => + metaChars.has(char); + +const opFor = char => + isOperator(char) ? { operator: char } + : isMeta(char) ? { meta: char } + : char; + const tokenizeRegexStep = chars => ({ out, state, context }, char, index) => { const next = chars[index + 1]; @@ -357,10 +366,9 @@ const tokenizeRegexStep = chars => ({ out, state, context }, char, index) => { ); } - const op = isOperator(char) ? { operator: char } : char; return { out: out.concat( - op, + opFor(char), shouldConcat(char, next) ? 
[{ operator: "concat" }] : [], @@ -398,8 +406,6 @@ const PRECEDENCE = { "range": 3, "concat": 2, "|": 1, - "class": 1, - ".": 1, }; const shouldPush = (stack, token) => @@ -414,7 +420,7 @@ const findLowerPrecedenceItem = (stack, token) => ); const toPostfixStep = ({ out, stack }, token, _index, tokens) => { - if (typeof token === "string") { + if (!token.operator) { return { out: out.concat(token), stack, diff --git a/tests/paca.mjs b/tests/paca.mjs index 789f959..55185f7 100644 --- a/tests/paca.mjs +++ b/tests/paca.mjs @@ -94,7 +94,6 @@ const test_isOperator = t => { t.assertEq(isOperator("?"), true); t.assertEq(isOperator("("), true); t.assertEq(isOperator(")"), true); - t.assertEq(isOperator("."), true); }); t.testing("false for everyday non-meta chars", () => { @@ -356,8 +355,8 @@ const test_classStateStep = t => { ); const expected = { out: [ 1, 2, 3, { - operator: "class", - set: [ 4, 5, 6 ], + meta: "class", + set: [ 4, 5, 6 ], }], state: "accepting", context: null, @@ -912,22 +911,22 @@ const test_tokenizeRegexStep = t => { }, }, { out: [caret, { - operator: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], }], state: "accepting", context: null, }, { out: [caret, { - operator: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], }, star], state: "accepting", context: null, }, { out: [caret, { - operator: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], }, star, dollar], state: "accepting", context: null, @@ -1444,6 +1443,29 @@ const test_toPostfixStep = t => { ); }); + t.testing("non-operators go directly to out", () => { + t.assertEq( + toPostfixStep( + { out: ["a"], stack: [{ operator: "(" }] }, + { meta: "." }, + ), + { + out: ["a", { meta: "." 
}], + stack: [{ operator: "(" }], + }, + ); + t.assertEq( + toPostfixStep( + { out: ["a"], stack: [{ operator: "*" }] }, + { meta: "class" }, + ), + { + out: ["a", { meta: "class" }], + stack: [{ operator: "*" }], + }, + ); + }); + t.testing("parens put things on the stack", () => { t.assertEq( toPostfixStep({ @@ -1628,7 +1650,8 @@ const test_toPostfix = t => { t.start("toPostfix()"); t.testing("regex table", () => { - const concat = { operator: "concat" }; + const concat = { operator: "concat" }; + const wildcard = { meta: "." }; const table = [{ in: [], expected: [] @@ -1693,6 +1716,25 @@ const test_toPostfix = t => { ], expected: [ ], + }, { + in: [ + "a", + concat, + "b", + concat, + wildcard, + concat, + "c", + ], + expected: [ + "a", + "b", + concat, + wildcard, + concat, + "c", + concat, + ], }]; for (const test of table) { t.assertEq( @@ -2006,7 +2048,7 @@ const test_zeroOrOne = t => { const test_OPERATORS_FNS = t => { t.start("OPERATORS_FNS"); - const arr = []; + let arr = []; const fn = ret => (...args) => { arr.push({ args, @@ -2018,7 +2060,7 @@ const test_OPERATORS_FNS = t => { t.testing("star", () => { - arr.splice(0); + arr = []; t.assertEq( OPERATORS_FNS({ zeroOrMoreFn: fn(111) })["*"](input), [1, 2, 111], @@ -2027,7 +2069,7 @@ const test_OPERATORS_FNS = t => { }); t.testing("plus", () => { - arr.splice(0); + arr = []; t.assertEq( OPERATORS_FNS({ oneOrMoreFn: fn(222) })["+"](input), [1, 2, 222], @@ -2036,7 +2078,7 @@ const test_OPERATORS_FNS = t => { }); t.testing("question mark", () => { - arr.splice(0); + arr = []; t.assertEq( OPERATORS_FNS({ zeroOrOneFn: fn(333) })["?"](input), [1, 2, 333], @@ -2045,7 +2087,7 @@ const test_OPERATORS_FNS = t => { }); t.testing("pipe", () => { - arr.splice(0); + arr = []; t.assertEq( OPERATORS_FNS({ unionFn: fn(444) })["|"](input), [1, 444], @@ -2054,7 +2096,7 @@ const test_OPERATORS_FNS = t => { }); t.testing("concat", () => { - arr.splice(0); + arr = []; t.assertEq( OPERATORS_FNS({ concatFn: fn(555) 
})["concat"](input), [1, 555], |