Only tolerate escaping of special chars

* src/paca.mjs (escapingStateStep): Return an error when escaping non-metacharacters. This way, sequences like \d (the syntax for [0-9], which will eventually be recognized) will not change behaviour later, going from a no-op escape of "d" to matching digits.
(operatorChars, isOperator): Hoist both of these up before their usage in `escapingStateStep()`.
* tests/paca.mjs (test_isOperator): Hoist its definition and its position inside the `runTests([...])` array to match src/paca.mjs.
(test_escapingStateStep): Adjust existing cases and add a test case for good/bad escapes.
(test_tokenizeRegexStep): Fix bad starting escape, which broke because it was escaping a non-metacharacter.
author: EuAndreh <eu@euandre.org> 2025-07-15 21:37:16 -0300
committer: EuAndreh <eu@euandre.org> 2025-07-15 21:44:55 -0300
commit: 1ce80e005a374488c186d0f545af33096f6523d5 (patch)
tree: 997f02f05a40d1fab2c9854dfdf5c687b8bb47c7
parent: Support tokenizing `.` wildcard operator. (diff)
download: paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.gz
paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.xz
2 files changed, 59 insertions, 38 deletions
diff --git a/src/paca.mjs b/src/paca.mjs
index 5d11b05..7a01407 100644
--- a/src/paca.mjs
+++ b/src/paca.mjs
@@ -24,19 +24,33 @@ const shouldConcat = (char, next) =>
 	char !== "{" &&
 	!nonConcatOperators.has(next);
 
+const operatorChars = new Set([...nonConcatOperators, "(", "."]);
+const isOperator = char =>
+	operatorChars.has(char);
+
 const numFromDigits = digits =>
 	digits.length === 0
 		? -1
 		: Number(digits.join(""));
 
-const escapingStateStep = ({ out, _state, context }, char, _index, next) => ({
-	out: out.concat(
-		char,
-		shouldConcat(null, next) ? [{ operator: "concat" }] : [],
-	),
-	state: ConcatStep.ACCEPTING,
-	context,
-});
+const escapingStateStep = ({ out, state, context }, char, _index, next) => 
+	!(isOperator(char) || char === "\\")
+		? reduced({
+			out,
+			state,
+			context,
+			error: new SyntaxError(
+				"unknown escape sequence: \\" + char,
+			),
+		})
+		: {
+			out: out.concat(
+				char,
+				shouldConcat(null, next) ? [{ operator: "concat" }] : [],
+			),
+			state: ConcatStep.ACCEPTING,
+			context,
+		};
 
 const rangeStateStep = ({ out, state, context }, char, _index, _next) => {
 	if (char === "}") {
@@ -313,10 +327,6 @@ const transitionChars = new Set(Object.keys(TRANSITION_FNS));
 const isTransition = char =>
 	transitionChars.has(char);
 
-const operatorChars = new Set([...nonConcatOperators, "(", "."]);
-const isOperator = char =>
-	operatorChars.has(char);
-
 const tokenizeRegexStep = chars => ({ out, state, context }, char, index) => {
 	const next = chars[index + 1];
 
diff --git a/tests/paca.mjs b/tests/paca.mjs
index bc6e1d0..789f959 100644
--- a/tests/paca.mjs
+++ b/tests/paca.mjs
@@ -5,6 +5,7 @@ import {
 	ValueError,
 	ConcatStep,
 	shouldConcat,
+	isOperator,
 	numFromDigits,
 	escapingStateStep,
 	rangeStateStep,
@@ -13,7 +14,6 @@ import {
 	ANCHOR_FNS,
 	isAnchor,
 	isTransition,
-	isOperator,
 	tokenizeRegexStep,
 	tokenizeRegexFn,
 	tokenizeRegex,
@@ -84,6 +84,25 @@ const test_shouldConcat = t => {
 	});
 };
 
+const test_isOperator = t => {
+	t.start("isOperator()");
+
+	t.testing("operators and open parens are true", () => {
+		t.assertEq(isOperator("*"), true);
+		t.assertEq(isOperator("|"), true);
+		t.assertEq(isOperator("+"), true);
+		t.assertEq(isOperator("?"), true);
+		t.assertEq(isOperator("("), true);
+		t.assertEq(isOperator(")"), true);
+		t.assertEq(isOperator("."), true);
+	});
+
+	t.testing("false for everyday non-meta chars", () => {
+		t.assertEq(isOperator("a"), false);
+		t.assertEq(isOperator("_"), false);
+	});
+};
+
 const test_numFromDigits = t => {
 	t.start("numFromDigits()");
 
@@ -105,12 +124,12 @@ const test_escapingStateStep = t => {
 	t.testing("add a concat when applicable", () => {
 		const given = escapingStateStep(
 			{ out: [ 1, 2, 3 ] },
-			"a",
+			"*",
 			null,
 			"b",
 		);
 		const expected = {
-			out:     [ 1, 2, 3, "a", { operator: "concat" } ],
+			out:     [ 1, 2, 3, "*", { operator: "concat" } ],
 			state:   "accepting",
 			context: undefined,
 		};
@@ -120,17 +139,28 @@ const test_escapingStateStep = t => {
 	t.testing("without a concat when not applicable", () => {
 		const given = escapingStateStep(
 			{ out: [ 1, 2, 3 ] },
-			"a",
+			"$",
 			null,
 			")",
 		);
 		const expected = {
-			out:     [ 1, 2, 3, "a" ],
+			out:     [ 1, 2, 3, "$" ],
 			state:   "accepting",
 			context: undefined,
 		};
 		t.assertEq(given, expected);
 	});
+
+	t.testing("error when escaping a non-escapeable char", () => {
+		const { value: { error }} = escapingStateStep(
+			{},
+			"a",
+			null,
+			null,
+		);
+		t.assertEq(error.message, "unknown escape sequence: \\a"),
+		t.assertEq(error instanceof SyntaxError, true);
+	});
 };
 
 const test_rangeStateStep = t => {
@@ -635,25 +665,6 @@ const test_isTransition = t => {
 	});
 };
 
-const test_isOperator = t => {
-	t.start("isOperator()");
-
-	t.testing("operators and open parens are true", () => {
-		t.assertEq(isOperator("*"), true);
-		t.assertEq(isOperator("|"), true);
-		t.assertEq(isOperator("+"), true);
-		t.assertEq(isOperator("?"), true);
-		t.assertEq(isOperator("("), true);
-		t.assertEq(isOperator(")"), true);
-		t.assertEq(isOperator("."), true);
-	});
-
-	t.testing("false for everyday non-meta chars", () => {
-		t.assertEq(isOperator("a"), false);
-		t.assertEq(isOperator("_"), false);
-	});
-};
-
 const test_tokenizeRegexStep = t => {
 	t.start("tokenizeRegexStep()");
 
@@ -671,7 +682,7 @@ const test_tokenizeRegexStep = t => {
 		const stepFn = tokenizeRegexStep(regex);
 		const steps = [{
 			out:     [],
-			state:   ConcatStep.ESCAPING,
+			state:   ConcatStep.ACCEPTING,
 			context: null,
 		}, {
 			out:     ["a", cat],
@@ -3041,6 +3052,7 @@ const test_compile = t => {
 
 runTests([
 	test_shouldConcat,
+	test_isOperator,
 	test_numFromDigits,
 	test_escapingStateStep,
 	test_rangeStateStep,
@@ -3049,7 +3061,6 @@ runTests([
 	test_ANCHOR_FNS,
 	test_isAnchor,
 	test_isTransition,
-	test_isOperator,
 	test_tokenizeRegexStep,
 	test_tokenizeRegexFn,
 	test_tokenizeRegex,
author	EuAndreh <eu@euandre.org>	2025-07-15 21:37:16 -0300
committer	EuAndreh <eu@euandre.org>	2025-07-15 21:44:55 -0300
commit	1ce80e005a374488c186d0f545af33096f6523d5 (patch)
tree	997f02f05a40d1fab2c9854dfdf5c687b8bb47c7
parent	Support tokenizing `.` wildcard operator. (diff)
download	paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.gz paca-1ce80e005a374488c186d0f545af33096f6523d5.tar.xz