From 846adfdbb0931b9bda93cf38383103b44d7bd62b Mon Sep 17 00:00:00 2001 From: EuAndreh Date: Sun, 20 Jul 2025 09:31:16 -0300 Subject: Add initial support for caret and dollar metacharacters --- src/paca.mjs | 59 +++++++++++++++++++-- tests/paca.mjs | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 200 insertions(+), 22 deletions(-) diff --git a/src/paca.mjs b/src/paca.mjs index 4a85067..21a28e8 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -15,7 +15,7 @@ const ConcatStep = { CLASS: "class", }; -const nonConcatOperators = new Set(["*", "+", "?", "|", ")", "$"]); +const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]); const shouldConcat = (char, next) => next !== undefined && @@ -24,7 +24,7 @@ const shouldConcat = (char, next) => char !== "{" && !nonConcatOperators.has(next); -const operatorChars = new Set([...nonConcatOperators, "("]); +const operatorChars = new Set([...nonConcatOperators, "(", "$"]); const isOperator = char => operatorChars.has(char); @@ -301,7 +301,10 @@ const ANCHOR_FNS = { ), }) : { - out: out.concat({ operator: "caret" }), + out: out.concat([ + { meta: "^" }, + { operator: "concat" }, + ]), state, context, }, @@ -316,7 +319,7 @@ const ANCHOR_FNS = { ), }) : { - out: out.concat({ operator: "dollar" }), + out: out.concat([{ meta: "$" }]), state, context, }, @@ -693,6 +696,52 @@ const wildcard = (_edge, id) => { }; }; +const caret = (_edge, id) => { + const start = id + 0; + const end = id + 1; + return { + start, + end, + nodes: { + [start]: { + direct: [], + transitions: {}, + meta: { + op: "caret", + to: end, + }, + }, + [end]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const dollar = (_edge, id) => { + const start = id + 0; + const end = id + 1; + return { + start, + end, + nodes: { + [start]: { + direct: [], + transitions: {}, + meta: { + op: "dollar", + to: end, + }, + }, + [end]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + const OPERATORS_FNS = ({ zeroOrMoreFn = zeroOrMore, oneOrMoreFn = oneOrMore, @@ -717,6 +766,8 @@ const OPERATORS = OPERATORS_FNS(); const METACHARACTERS_FNS = { "class": characterClass, ".": wildcard, + "^": caret, + "$": dollar, }; const baseNFA = (token, id) => ( diff --git a/tests/paca.mjs b/tests/paca.mjs index 36ab2a7..6913b9e 100644 --- a/tests/paca.mjs +++ b/tests/paca.mjs @@ -34,6 +34,8 @@ import { inRange, characterClass, wildcard, + caret, + dollar, OPERATORS_FNS, baseNFA, buildNFAStep, @@ -619,7 +621,7 @@ const test_ANCHOR_FNS = t => { t.testing("caret operator gets added to output", () => { const given = ANCHOR_FNS["^"]({ out: [ 1 ] }, null, 0, null); const expected = { - out: [ 1, { operator: "caret" } ], + out: [ 1, { meta: "^" }, { operator: "concat" } ], state: undefined, context: undefined, }; @@ -634,7 +636,7 @@ const test_ANCHOR_FNS = t => { undefined, ); const expected = { - out: [ 2, { operator: "dollar" } ], + out: [ 2, { meta: "$" } ], state: undefined, context: undefined, }; @@ -682,8 +684,8 @@ const test_tokenizeRegexStep = t => { const oparen = { operator: "(" }; const cparen = { operator: ")" }; const star = { operator: "*" }; - const caret = { operator: "caret" }; - const dollar = { operator: "dollar" }; + const caret = { meta: "^" }; + const dollar = { meta: "$" }; t.testing("when escaping we get whatever the char is", () => { @@ -836,11 +838,11 @@ const test_tokenizeRegexStep = t => { state: ConcatStep.ACCEPTING, context: null, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.ACCEPTING, context: null, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -850,7 +852,7 @@ const test_tokenizeRegexStep = t => { set: [], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -860,7 +862,7 @@ const test_tokenizeRegexStep = t => { set: ["b"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -870,7 +872,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -880,7 +882,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e", "h"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -890,7 +892,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e", "h", "i"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -900,7 +902,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e", "h", "i", "l"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -910,7 +912,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e", "h", "i", "l", "o"], }, }, { - out: [caret], + out: [caret, cat], state: ConcatStep.CLASS, context: { range: { @@ -920,7 +922,7 @@ const test_tokenizeRegexStep = t => { set: ["b", "e", "h", "i", "l", "o", "s"], }, }, { - out: [caret, { + out: [caret, cat, { meta: "class", set: [ "b", "e", "h", "i", "l", "o", "s" ], caret: false, @@ -928,19 +930,19 @@ const test_tokenizeRegexStep = t => { state: "accepting", context: null, }, { - out: [caret, { + out: [caret, cat, { meta: "class", set: [ "b", "e", "h", "i", "l", "o", "s" ], caret: false, - }, star], + }, star, cat], state: "accepting", context: null, }, { - out: [caret, { + out: [caret, cat, { meta: "class", set: [ "b", "e", "h", "i", "l", "o", "s" ], caret: false, - }, star, dollar], + }, star, cat, dollar], state: "accepting", context: null, }]; @@ -2281,6 +2283,58 @@ const test_wildcard = t => { }); }; +const test_caret = t => { + t.start("caret()"); + + t.testing("we get the NFA with the caret meta attribute", () => { + const expected = { + start: 3, + end: 4, + nodes: { + 3: { + direct: [], + transitions: {}, + meta: { + op: "caret", + to: 4, + }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(caret("IGNORED", 3), expected); + }); +}; + +const test_dollar = t => { + t.start("dollar()"); + + t.testing("we get the NFA that matches the end of string", () => { + const expected = { + start: 2, + end: 3, + nodes: { + 2: { + direct: [], + transitions: {}, + meta: { + op: "dollar", + to: 3, + }, + }, + 3: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(dollar("IGNORED", 2), expected); + }); +}; + const test_OPERATORS_FNS = t => { t.start("OPERATORS_FNS"); @@ -2600,6 +2654,69 @@ const test_buildNFA = t => { expected, ); }); + + t.testing("example with metacharacters", () => { + const regex = "^[behilos]*$"; + const expected = { + start: 1, + end: 8, + nodes: { + 1: { + direct: [], + transitions: {}, + meta: { + op: "caret", + to: 2, + }, + }, + 2: { + direct: [5], + transitions: {}, + }, + 3: { + direct: [], + transitions: {}, + meta: { + op: "includes", + to: 4, + matches: new Set([ + "b", "e", "h", "i", "l", + "o", "s", + ]), + ranges: {}, + }, + }, + 4: { + direct: [5], + transitions: {}, + }, + 5: { + direct: [3, 6], + transitions: {}, + }, + 6: { + direct: [7], + transitions: {}, + }, + 7: { + direct: [], + transitions: {}, + meta: { + op: "dollar", + to: 8, + }, + }, + 8: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq( + buildNFA(toPostfix(tokenizeRegex(explode(regex)))), + expected, + ); + }); }; const test_allDirects = t => { @@ -2844,6 +2961,14 @@ const test_searchNFA = t => { t.assertEq(searchNFA(nfa, "babac"), true); t.assertEq(searchNFA(nfa, "babaca"), false); }); + + t.testing("regex with metacharacters", () => { + const regex = "^[behilos]*"; + const nfa = buildNFA(toPostfix(tokenizeRegex(explode(regex)))); + t.assertEq(searchNFA(nfa, "helios"), true); + t.assertEq(searchNFA(nfa, "helios "), false); + t.assertEq(searchNFA(nfa, "abc"), false); + }); }; const test_nodeID = t => { @@ -3522,6 +3647,8 @@ runTests([ test_inRange, test_characterClass, test_wildcard, + test_caret, + test_dollar, test_OPERATORS_FNS, test_baseNFA, test_buildNFAStep, -- cgit v1.2.3