author     EuAndreh <eu@euandre.org>  2025-07-20 09:31:16 -0300
committer  EuAndreh <eu@euandre.org>  2025-07-20 09:31:16 -0300
commit     846adfdbb0931b9bda93cf38383103b44d7bd62b (patch)
tree       5b321562a45af19f713bac31fcbea3cac35ffa26
parent     .gitignore: Remove trailing slash from node_modules rule (diff)
Add initial support for caret and dollar metacharacters
-rw-r--r--  src/paca.mjs     59
-rw-r--r--  tests/paca.mjs  163
2 files changed, 200 insertions, 22 deletions
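
In short: "^" and "$" stop being tokenized as operators and become meta tokens, mirroring "." and character classes, each with its own NFA constructor (caret() and dollar()) registered in METACHARACTERS_FNS. The caret handler appends the concat glue itself, while "$" is dropped from nonConcatOperators so the ordinary concatenation rule inserts the glue before it. A rough sketch of the token stream this should yield for "^[behilos]*$", lifted from the updated test expectations below; the relative import path is an assumption, and tokenizeRegex is assumed to return the final accumulated out of the step function:

    import { explode, tokenizeRegex } from "./src/paca.mjs";

    // Tokenize an anchored pattern; the expected result is taken from the
    // step-by-step expectations in test_tokenizeRegexStep.
    const tokens = tokenizeRegex(explode("^[behilos]*$"));
    // tokens should be:
    // [
    //   { meta: "^" },
    //   { operator: "concat" },
    //   { meta: "class", set: ["b", "e", "h", "i", "l", "o", "s"], caret: false },
    //   { operator: "*" },
    //   { operator: "concat" },
    //   { meta: "$" },
    // ]
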
diff --git a/src/paca.mjs b/src/paca.mjs
index 4a85067..21a28e8 100644
--- a/src/paca.mjs
+++ b/src/paca.mjs
@@ -15,7 +15,7 @@ const ConcatStep = {
CLASS: "class",
};
-const nonConcatOperators = new Set(["*", "+", "?", "|", ")", "$"]);
+const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);
const shouldConcat = (char, next) =>
next !== undefined &&
@@ -24,7 +24,7 @@ const shouldConcat = (char, next) =>
char !== "{" &&
!nonConcatOperators.has(next);
-const operatorChars = new Set([...nonConcatOperators, "("]);
+const operatorChars = new Set([...nonConcatOperators, "(", "$"]);
const isOperator = char =>
operatorChars.has(char);
@@ -301,7 +301,10 @@ const ANCHOR_FNS = {
),
})
: {
- out: out.concat({ operator: "caret" }),
+ out: out.concat([
+ { meta: "^" },
+ { operator: "concat" },
+ ]),
state,
context,
},
@@ -316,7 +319,7 @@ const ANCHOR_FNS = {
),
})
: {
- out: out.concat({ operator: "dollar" }),
+ out: out.concat([{ meta: "$" }]),
state,
context,
},
@@ -693,6 +696,52 @@ const wildcard = (_edge, id) => {
};
};
+const caret = (_edge, id) => {
+ const start = id + 0;
+ const end = id + 1;
+ return {
+ start,
+ end,
+ nodes: {
+ [start]: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "caret",
+ to: end,
+ },
+ },
+ [end]: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+};
+
+const dollar = (_edge, id) => {
+ const start = id + 0;
+ const end = id + 1;
+ return {
+ start,
+ end,
+ nodes: {
+ [start]: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "dollar",
+ to: end,
+ },
+ },
+ [end]: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+};
+
const OPERATORS_FNS = ({
zeroOrMoreFn = zeroOrMore,
oneOrMoreFn = oneOrMore,
@@ -717,6 +766,8 @@ const OPERATORS = OPERATORS_FNS();
const METACHARACTERS_FNS = {
"class": characterClass,
".": wildcard,
+ "^": caret,
+ "$": dollar,
};
const baseNFA = (token, id) => (
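
Both new constructors follow the same fragment convention as wildcard(): a two-node fragment whose start node carries a meta marker ({ op, to }) for the matcher to interpret, and whose end node is empty. A minimal sketch of what caret() returns, matching the unit test added below; the import path is an assumption, the first argument is unused, and the node IDs are just the id offset and id + 1:

    import { caret } from "./src/paca.mjs";

    const fragment = caret(null, 3);  // edge argument is ignored
    // fragment:
    // {
    //   start: 3,
    //   end: 4,
    //   nodes: {
    //     3: { direct: [], transitions: {}, meta: { op: "caret", to: 4 } },
    //     4: { direct: [], transitions: {} },
    //   },
    // }
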
diff --git a/tests/paca.mjs b/tests/paca.mjs
index 36ab2a7..6913b9e 100644
--- a/tests/paca.mjs
+++ b/tests/paca.mjs
@@ -34,6 +34,8 @@ import {
inRange,
characterClass,
wildcard,
+ caret,
+ dollar,
OPERATORS_FNS,
baseNFA,
buildNFAStep,
@@ -619,7 +621,7 @@ const test_ANCHOR_FNS = t => {
t.testing("caret operator gets added to output", () => {
const given = ANCHOR_FNS["^"]({ out: [ 1 ] }, null, 0, null);
const expected = {
- out: [ 1, { operator: "caret" } ],
+ out: [ 1, { meta: "^" }, { operator: "concat" } ],
state: undefined,
context: undefined,
};
@@ -634,7 +636,7 @@ const test_ANCHOR_FNS = t => {
undefined,
);
const expected = {
- out: [ 2, { operator: "dollar" } ],
+ out: [ 2, { meta: "$" } ],
state: undefined,
context: undefined,
};
@@ -682,8 +684,8 @@ const test_tokenizeRegexStep = t => {
const oparen = { operator: "(" };
const cparen = { operator: ")" };
const star = { operator: "*" };
- const caret = { operator: "caret" };
- const dollar = { operator: "dollar" };
+ const caret = { meta: "^" };
+ const dollar = { meta: "$" };
t.testing("when escaping we get whatever the char is", () => {
@@ -836,11 +838,11 @@ const test_tokenizeRegexStep = t => {
state: ConcatStep.ACCEPTING,
context: null,
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.ACCEPTING,
context: null,
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -850,7 +852,7 @@ const test_tokenizeRegexStep = t => {
set: [],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -860,7 +862,7 @@ const test_tokenizeRegexStep = t => {
set: ["b"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -870,7 +872,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -880,7 +882,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e", "h"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -890,7 +892,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e", "h", "i"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -900,7 +902,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e", "h", "i", "l"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -910,7 +912,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e", "h", "i", "l", "o"],
},
}, {
- out: [caret],
+ out: [caret, cat],
state: ConcatStep.CLASS,
context: {
range: {
@@ -920,7 +922,7 @@ const test_tokenizeRegexStep = t => {
set: ["b", "e", "h", "i", "l", "o", "s"],
},
}, {
- out: [caret, {
+ out: [caret, cat, {
meta: "class",
set: [ "b", "e", "h", "i", "l", "o", "s" ],
caret: false,
@@ -928,19 +930,19 @@ const test_tokenizeRegexStep = t => {
state: "accepting",
context: null,
}, {
- out: [caret, {
+ out: [caret, cat, {
meta: "class",
set: [ "b", "e", "h", "i", "l", "o", "s" ],
caret: false,
- }, star],
+ }, star, cat],
state: "accepting",
context: null,
}, {
- out: [caret, {
+ out: [caret, cat, {
meta: "class",
set: [ "b", "e", "h", "i", "l", "o", "s" ],
caret: false,
- }, star, dollar],
+ }, star, cat, dollar],
state: "accepting",
context: null,
}];
@@ -2281,6 +2283,58 @@ const test_wildcard = t => {
});
};
+const test_caret = t => {
+ t.start("caret()");
+
+ t.testing("we get the NFA with the caret meta attribute", () => {
+ const expected = {
+ start: 3,
+ end: 4,
+ nodes: {
+ 3: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "caret",
+ to: 4,
+ },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(caret("IGNORED", 3), expected);
+ });
+};
+
+const test_dollar = t => {
+ t.start("dollar()");
+
+ t.testing("we get the NFA that matches the end of string", () => {
+ const expected = {
+ start: 2,
+ end: 3,
+ nodes: {
+ 2: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "dollar",
+ to: 3,
+ },
+ },
+ 3: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(dollar("IGNORED", 2), expected);
+ });
+};
+
const test_OPERATORS_FNS = t => {
t.start("OPERATORS_FNS");
@@ -2600,6 +2654,69 @@ const test_buildNFA = t => {
expected,
);
});
+
+ t.testing("example with metacharacters", () => {
+ const regex = "^[behilos]*$";
+ const expected = {
+ start: 1,
+ end: 8,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "caret",
+ to: 2,
+ },
+ },
+ 2: {
+ direct: [5],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "includes",
+ to: 4,
+ matches: new Set([
+ "b", "e", "h", "i", "l",
+ "o", "s",
+ ]),
+ ranges: {},
+ },
+ },
+ 4: {
+ direct: [5],
+ transitions: {},
+ },
+ 5: {
+ direct: [3, 6],
+ transitions: {},
+ },
+ 6: {
+ direct: [7],
+ transitions: {},
+ },
+ 7: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "dollar",
+ to: 8,
+ },
+ },
+ 8: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(
+ buildNFA(toPostfix(tokenizeRegex(explode(regex)))),
+ expected,
+ );
+ });
};
const test_allDirects = t => {
@@ -2844,6 +2961,14 @@ const test_searchNFA = t => {
t.assertEq(searchNFA(nfa, "babac"), true);
t.assertEq(searchNFA(nfa, "babaca"), false);
});
+
+ t.testing("regex with metacharacters", () => {
+ const regex = "^[behilos]*";
+ const nfa = buildNFA(toPostfix(tokenizeRegex(explode(regex))));
+ t.assertEq(searchNFA(nfa, "helios"), true);
+ t.assertEq(searchNFA(nfa, "helios "), false);
+ t.assertEq(searchNFA(nfa, "abc"), false);
+ });
};
const test_nodeID = t => {
@@ -3522,6 +3647,8 @@ runTests([
test_inRange,
test_characterClass,
test_wildcard,
+ test_caret,
+ test_dollar,
test_OPERATORS_FNS,
test_baseNFA,
test_buildNFAStep,
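
Taken together, the change is exercised end to end by the new searchNFA test. A sketch of that usage, mirroring the assertions added above; the relative import path is an assumption, and the functions are the ones the tests already import:

    import {
    	explode,
    	tokenizeRegex,
    	toPostfix,
    	buildNFA,
    	searchNFA,
    } from "./src/paca.mjs";

    // Anchored prefix match, as checked in test_searchNFA.
    const nfa = buildNFA(toPostfix(tokenizeRegex(explode("^[behilos]*"))));
    searchNFA(nfa, "helios");   // => true
    searchNFA(nfa, "helios ");  // => false
    searchNFA(nfa, "abc");      // => false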