diff options
author | EuAndreh <eu@euandre.org> | 2025-07-16 10:32:04 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2025-07-16 10:32:04 -0300 |
commit | 69e36152600599126d95ae1085b0777a830bc97b (patch) | |
tree | 43527bb37c38cde9921f579153e85edf915ce97f | |
parent | Differentiate an "operator" from a "meta" character (diff) | |
download | paca-69e36152600599126d95ae1085b0777a830bc97b.tar.gz paca-69e36152600599126d95ae1085b0777a830bc97b.tar.xz |
Build NFA nodes for "." and "class" metacharacters
* src/paca.mjs
(characterClass): Add function that builds the NFA node for
`{ meta: "class" }`. This node leaves the "direct" and
"transitions" keys empty, and add its data under the "meta" key.
One option was to use an inline function that could simply be
called directly during the search to check for a match, but instead
I chose a data representation instead, in order to keep the NFA
literal as obvious and self-representing as possible. Later, the
searching part will have to properly interpret the data of "meta"
properly, instead of blindly executing an opaque function. This
does separate the compilation from execution logic, but keep the NFA
clean of opaque closures.
(wildcard): Add function that buildl the NFA node for `{ meta: "." }`.
Similar to `characterClass()`, the new "meta" key contains pure data
that represents the execution of the metacharacter during search.
(baseNFA, literal): Rename the existing `baseNFA()` to `literal()`.
Then add a new `baseNFA()` function that decides between a character
literal and a metacharacter.
(buildNFAStep): Instead of checking the type of `token`, we check if
`token` has the "operator" attribute, since we now have
metacharacters that also aren't strings.
(classStateStep): Add missing "caret" key to the final metacharacter
output. It was already being detected, just not included in the
result.
(escapingStateStep): Stick to 80 columns.
* tests/paca.mjs
(test_characterClass, test_wildcard, test_baseNFA): Add obligatory
test cases.
(test_buildNFAStep): Include test case for metacharacter.
-rw-r--r-- | src/paca.mjs | 82 | ||||
-rw-r--r-- | tests/paca.mjs | 261 |
2 files changed, 305 insertions, 38 deletions
diff --git a/src/paca.mjs b/src/paca.mjs index ccff1ca..0625d5c 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -46,7 +46,9 @@ const escapingStateStep = ({ out, state, context }, char, _index, next) => : { out: out.concat( char, - shouldConcat(null, next) ? [{ operator: "concat" }] : [], + shouldConcat(null, next) + ? [{ operator: "concat" }] + : [], ), state: ConcatStep.ACCEPTING, context, @@ -167,8 +169,9 @@ const classStateStep = ({ out, state, context }, char, _index, _next) => { return { out: out.concat({ - meta: "class", - set: context.set, + meta: "class", + set: context.set, + caret: !!context.caret, }), state: ConcatStep.ACCEPTING, context: null, @@ -495,7 +498,7 @@ const emptyNFA = () => { }; }; -const baseNFA = (edge, id) => { +const literal = (edge, id) => { const startID = id + 0; const endID = id + 1; const nextID = id + 2; @@ -620,6 +623,63 @@ const zeroOrOne = nfa => { }; }; +const characterClass = ({ set, caret }, id) => { + const start = id + 0; + const end = id + 1; + const nextID = id + 2; + const { string, object } = Object.groupBy(set, x => typeof x); + const matches = new Set(string); + const ranges = Object.fromEntries(object.map( + ({ from, to }) => [ from.charCodeAt(0), to.charCodeAt(0) ], + )); + return { + start, + end, + nextID, + nodes: { + [start]: { + direct: [], + transitions: {}, + meta: { + op: caret ? "excludes" : "includes", + to: end, + matches, + ranges, + }, + }, + [end]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const wildcard = (_edge, id) => { + const start = id + 0; + const end = id + 1; + const nextID = id + 2; + return { + start, + end, + nextID, + nodes: { + [start]: { + direct: [], + transitions: {}, + meta: { + op: true, + to: end, + }, + }, + [end]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + const OPERATORS_FNS = ({ zeroOrMoreFn = zeroOrMore, oneOrMoreFn = oneOrMore, @@ -639,11 +699,21 @@ const OPERATORS_FNS = ({ last(stack), )), }); - const OPERATORS = OPERATORS_FNS(); +const METACHARACTERS_FNS = { + "class": characterClass, + ".": wildcard, +}; + +const baseNFA = (token, id) => ( + !token.meta + ? literal + : METACHARACTERS_FNS[token.meta] +)(token, id); + const buildNFAStep = (stack, token) => - typeof token === "string" + !token.operator ? stack.concat(baseNFA(token, last(stack)?.nextID || 1)) : OPERATORS[token.operator](stack); diff --git a/tests/paca.mjs b/tests/paca.mjs index 55185f7..06b3c8b 100644 --- a/tests/paca.mjs +++ b/tests/paca.mjs @@ -22,13 +22,16 @@ import { toPostfixStep, toPostfix, emptyNFA, - baseNFA, + literal, concat, union, zeroOrMore, oneOrMore, zeroOrOne, + characterClass, + wildcard, OPERATORS_FNS, + baseNFA, buildNFAStep, buildNFA, allDirects, @@ -355,8 +358,9 @@ const test_classStateStep = t => { ); const expected = { out: [ 1, 2, 3, { - meta: "class", - set: [ 4, 5, 6 ], + meta: "class", + set: [ 4, 5, 6 ], + caret: false, }], state: "accepting", context: null, @@ -911,22 +915,25 @@ const test_tokenizeRegexStep = t => { }, }, { out: [caret, { - meta: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], + caret: false, }], state: "accepting", context: null, }, { out: [caret, { - meta: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], + caret: false, }, star], state: "accepting", context: null, }, { out: [caret, { - meta: "class", - set: [ "b", "e", "h", "i", "l", "o", "s" ], + meta: "class", + set: [ "b", "e", "h", "i", "l", "o", "s" ], + caret: false, }, star, dollar], state: "accepting", context: null, @@ -1768,8 +1775,8 @@ const test_emptyNFA = t => { }); }; -const test_baseNFA = t => { - t.start("baseNFA()"); +const test_literal = t => { + t.start("literal()"); t.testing("the given edge connects the nodes", () => { const expected = { @@ -1787,17 +1794,17 @@ const test_baseNFA = t => { }, }, }; - t.assertEq(baseNFA("X", 123), expected); + t.assertEq(literal("X", 123), expected); }); }; const test_concat = t => { t.start("concat()"); - t.testing("on baseNFA() and on more complex NFAs", () => { - const a = baseNFA("a", 1); - const b = baseNFA("b", a.nextID); - const a_ = baseNFA("a", b.nextID); + t.testing("on literal() and on more complex NFAs", () => { + const a = literal("a", 1); + const b = literal("b", a.nextID); + const a_ = literal("a", b.nextID); const expected1 = { start: 1, end: 4, @@ -1861,9 +1868,9 @@ const test_concat = t => { const test_union = t => { t.start("union()"); - t.testing("on baseNFA() and on more complex NFAs", () => { - const a = baseNFA("a", 1); - const b = baseNFA("b", a.nextID); + t.testing("on literal() and on more complex NFAs", () => { + const a = literal("a", 1); + const b = literal("b", a.nextID); const expected1 = { start: 5, end: 6, @@ -1897,7 +1904,7 @@ const test_union = t => { }; t.assertEq(union(a, b), expected1); - const c = baseNFA("c", expected1.nextID); + const c = literal("c", expected1.nextID); const expected2 = { start: 9, end: 10, @@ -1952,8 +1959,8 @@ const test_union = t => { const test_zeroOrMore = t => { t.start("zeroOrMore()"); - t.testing("on baseNFA()", () => { - const a = baseNFA("a", 1); + t.testing("on literal()", () => { + const a = literal("a", 1); const expected = { start: 3, end: 4, @@ -1984,8 +1991,8 @@ const test_zeroOrMore = t => { const test_oneOrMore = t => { t.start("oneOrMore()"); - t.testing("on baseNFA()", () => { - const a = baseNFA("a", 1); + t.testing("on literal()", () => { + const a = literal("a", 1); const expected = { start: 1, end: 4, @@ -2016,8 +2023,8 @@ const test_oneOrMore = t => { const test_zeroOrOne = t => { t.start("zeroOrOne()"); - t.testing("on baseNFA()", () => { - const a = baseNFA("a", 1); + t.testing("on literal()", () => { + const a = literal("a", 1); const expected = { start: 3, end: 4, @@ -2045,6 +2052,109 @@ const test_zeroOrOne = t => { }); }; +const test_characterClass = t => { + t.start("characterClass()"); + + t.testing("when caret is true, op is excludes", () => { + const expected = { + start: 3, + end: 4, + nextID: 5, + nodes: { + 3: { + direct: [], + transitions: {}, + meta: { + op: "excludes", + to: 4, + matches: new Set([ + "-", "a", "b", "c", + ".", "_", "$", "^", + ]), + ranges: { + 102: 112, + 116: 122, + }, + }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(characterClass({ + set: [ + "-", "a", "b", "c", { from: "f", to: "p" }, ".", + { from: "t", to: "z" }, "_", "$", "^", "a", "_", + ], + caret: true, + }, 3), expected); + }); + + t.testing("when caret is false, op is includes", () => { + const expected = { + start: 7, + end: 8, + nextID: 9, + nodes: { + 7: { + direct: [], + transitions: {}, + meta: { + op: "includes", + to: 8, + matches: new Set([]), + ranges: { + 48: 57, + 65: 90, + 97: 122, + }, + }, + }, + 8: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(characterClass({ + set: [ + { from: "a", to: "z" }, + { from: "A", to: "Z" }, + { from: "0", to: "9" }, + ], + }, 7), expected); + }); +}; + +const test_wildcard = t => { + t.start("wildcard()"); + + t.testing("the returned nfa always matches", () => { + const expected = { + start: 5, + end: 6, + nextID: 7, + nodes: { + 5: { + direct: [], + transitions: {}, + meta: { + op: true, + to: 6, + } + }, + 6: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(wildcard("IGNORED", 5), expected); + }); +}; + const test_OPERATORS_FNS = t => { t.start("OPERATORS_FNS"); @@ -2105,6 +2215,49 @@ const test_OPERATORS_FNS = t => { }); }; +const test_baseNFA = t => { + t.start("baseNFA()"); + + t.testing("decide between metacharacter and literal", () => { + const expected1 = { + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }; + const expected2 = { + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [], + transitions: {}, + meta: { + op: true, + to: 2, + }, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(baseNFA("a", 1), expected1); + t.assertEq(baseNFA({ meta: "." }, 1), expected2); + }); +}; + const test_buildNFAStep = t => { t.start("buildNFAStep()"); @@ -2129,7 +2282,7 @@ const test_buildNFAStep = t => { }); t.testing("literal char with existing nextID", () => { - const input = baseNFA("a", 1); + const input = [ literal("a", 1) ]; const expected = [{ start: 1, end: 2, @@ -2159,15 +2312,56 @@ const test_buildNFAStep = t => { }, }, }]; - t.assertEq(buildNFAStep([input], "b"), expected); + t.assertEq(buildNFAStep(input, "b"), expected); t.assertEq( - buildNFAStep([input], "b"), - [ baseNFA("a", 1), baseNFA("b", 3) ] + buildNFAStep(input, "b"), + [ literal("a", 1), literal("b", 3) ] + ); + }); + + t.testing("metacharacter", () => { + const input = [ literal("a", 1) ]; + const expected = [{ + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }, { + start: 3, + end: 4, + nextID: 5, + nodes: { + 3: { + direct: [], + transitions: {}, + meta: { + op: true, + to: 4, + }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }]; + t.assertEq( + buildNFAStep(input, { meta: "." }), + expected, ); }); t.testing("operator", () => { - const input = [ baseNFA("a", 1), baseNFA("b", 3) ]; + const input = [ literal("a", 1), literal("b", 3) ]; const expected = [{ start: 1, end: 4, @@ -3111,13 +3305,16 @@ runTests([ test_toPostfixStep, test_toPostfix, test_emptyNFA, - test_baseNFA, + test_literal, test_concat, test_union, test_zeroOrMore, test_oneOrMore, test_zeroOrOne, + test_characterClass, + test_wildcard, test_OPERATORS_FNS, + test_baseNFA, test_buildNFAStep, test_buildNFA, test_allDirects, |