author     EuAndreh <eu@euandre.org>  2025-07-16 10:32:04 -0300
committer  EuAndreh <eu@euandre.org>  2025-07-16 10:32:04 -0300
commit     69e36152600599126d95ae1085b0777a830bc97b (patch)
tree       43527bb37c38cde9921f579153e85edf915ce97f
parent     Differentiate an "operator" from a "meta" character (diff)
download   paca-69e36152600599126d95ae1085b0777a830bc97b.tar.gz
           paca-69e36152600599126d95ae1085b0777a830bc97b.tar.xz
Build NFA nodes for "." and "class" metacharacters
* src/paca.mjs (characterClass): Add a function that builds the NFA node for `{ meta: "class" }`. This node leaves the "direct" and "transitions" keys empty, and adds its data under the "meta" key. One option was to use an inline function that could be called directly during the search to check for a match, but I chose a data representation instead, in order to keep the NFA literal as obvious and self-representing as possible. Later, the searching part will have to interpret the data under "meta" properly, instead of blindly executing an opaque function. This does separate the compilation logic from the execution logic, but it keeps the NFA free of opaque closures.
(wildcard): Add a function that builds the NFA node for `{ meta: "." }`. Similar to `characterClass()`, the new "meta" key contains pure data that represents the execution of the metacharacter during the search.
(baseNFA, literal): Rename the existing `baseNFA()` to `literal()`, then add a new `baseNFA()` function that decides between a character literal and a metacharacter.
(buildNFAStep): Instead of checking the type of `token`, check whether `token` has the "operator" attribute, since we now have metacharacters that also aren't strings.
(classStateStep): Add the missing "caret" key to the final metacharacter output. It was already being detected, just not included in the result.
(escapingStateStep): Stick to 80 columns.
* tests/paca.mjs (test_characterClass, test_wildcard, test_baseNFA): Add the obligatory test cases.
(test_buildNFAStep): Include a test case for a metacharacter.
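To illustrate the intended split between compilation and execution, here is a minimal sketch of how a later search step could interpret the "meta" data built by `characterClass()` and `wildcard()`. The `stepMeta()` helper and its return convention are hypothetical and not part of this commit; they only show that the node data is enough to decide a match without any stored closure.

	// Hypothetical sketch: interpret the pure data under "meta" during
	// the search, instead of calling an opaque function kept in the NFA.
	const stepMeta = (meta, char) => {
		if (meta.op === true) {
			// wildcard(): "." matches any single character.
			return meta.to;
		}
		const code = char.charCodeAt(0);
		const inRanges = Object.entries(meta.ranges).some(
			([from, to]) => code >= Number(from) && code <= to,
		);
		const hit = meta.matches.has(char) || inRanges;
		// characterClass(): "includes" for [...], "excludes" for [^...].
		const matched = meta.op === "includes" ? hit : !hit;
		return matched ? meta.to : null;
	};

	// e.g. stepMeta({ op: "includes", to: 8, matches: new Set(),
	//                 ranges: { 97: 122 } }, "q") returns 8,
	// while the same call with "Q" returns null.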
-rw-r--r--  src/paca.mjs    |  82
-rw-r--r--  tests/paca.mjs  | 261
2 files changed, 305 insertions(+), 38 deletions(-)
diff --git a/src/paca.mjs b/src/paca.mjs
index ccff1ca..0625d5c 100644
--- a/src/paca.mjs
+++ b/src/paca.mjs
@@ -46,7 +46,9 @@ const escapingStateStep = ({ out, state, context }, char, _index, next) =>
: {
out: out.concat(
char,
- shouldConcat(null, next) ? [{ operator: "concat" }] : [],
+ shouldConcat(null, next)
+ ? [{ operator: "concat" }]
+ : [],
),
state: ConcatStep.ACCEPTING,
context,
@@ -167,8 +169,9 @@ const classStateStep = ({ out, state, context }, char, _index, _next) => {
return {
out: out.concat({
- meta: "class",
- set: context.set,
+ meta: "class",
+ set: context.set,
+ caret: !!context.caret,
}),
state: ConcatStep.ACCEPTING,
context: null,
@@ -495,7 +498,7 @@ const emptyNFA = () => {
};
};
-const baseNFA = (edge, id) => {
+const literal = (edge, id) => {
const startID = id + 0;
const endID = id + 1;
const nextID = id + 2;
@@ -620,6 +623,63 @@ const zeroOrOne = nfa => {
};
};
+const characterClass = ({ set, caret }, id) => {
+ const start = id + 0;
+ const end = id + 1;
+ const nextID = id + 2;
+ const { string, object } = Object.groupBy(set, x => typeof x);
+ const matches = new Set(string);
+ const ranges = Object.fromEntries(object.map(
+ ({ from, to }) => [ from.charCodeAt(0), to.charCodeAt(0) ],
+ ));
+ return {
+ start,
+ end,
+ nextID,
+ nodes: {
+ [start]: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: caret ? "excludes" : "includes",
+ to: end,
+ matches,
+ ranges,
+ },
+ },
+ [end]: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+};
+
+const wildcard = (_edge, id) => {
+ const start = id + 0;
+ const end = id + 1;
+ const nextID = id + 2;
+ return {
+ start,
+ end,
+ nextID,
+ nodes: {
+ [start]: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: true,
+ to: end,
+ },
+ },
+ [end]: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+};
+
const OPERATORS_FNS = ({
zeroOrMoreFn = zeroOrMore,
oneOrMoreFn = oneOrMore,
@@ -639,11 +699,21 @@ const OPERATORS_FNS = ({
last(stack),
)),
});
-
const OPERATORS = OPERATORS_FNS();
+const METACHARACTERS_FNS = {
+ "class": characterClass,
+ ".": wildcard,
+};
+
+const baseNFA = (token, id) => (
+ !token.meta
+ ? literal
+ : METACHARACTERS_FNS[token.meta]
+)(token, id);
+
const buildNFAStep = (stack, token) =>
- typeof token === "string"
+ !token.operator
? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
: OPERATORS[token.operator](stack);
diff --git a/tests/paca.mjs b/tests/paca.mjs
index 55185f7..06b3c8b 100644
--- a/tests/paca.mjs
+++ b/tests/paca.mjs
@@ -22,13 +22,16 @@ import {
toPostfixStep,
toPostfix,
emptyNFA,
- baseNFA,
+ literal,
concat,
union,
zeroOrMore,
oneOrMore,
zeroOrOne,
+ characterClass,
+ wildcard,
OPERATORS_FNS,
+ baseNFA,
buildNFAStep,
buildNFA,
allDirects,
@@ -355,8 +358,9 @@ const test_classStateStep = t => {
);
const expected = {
out: [ 1, 2, 3, {
- meta: "class",
- set: [ 4, 5, 6 ],
+ meta: "class",
+ set: [ 4, 5, 6 ],
+ caret: false,
}],
state: "accepting",
context: null,
@@ -911,22 +915,25 @@ const test_tokenizeRegexStep = t => {
},
}, {
out: [caret, {
- meta: "class",
- set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ meta: "class",
+ set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ caret: false,
}],
state: "accepting",
context: null,
}, {
out: [caret, {
- meta: "class",
- set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ meta: "class",
+ set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ caret: false,
}, star],
state: "accepting",
context: null,
}, {
out: [caret, {
- meta: "class",
- set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ meta: "class",
+ set: [ "b", "e", "h", "i", "l", "o", "s" ],
+ caret: false,
}, star, dollar],
state: "accepting",
context: null,
@@ -1768,8 +1775,8 @@ const test_emptyNFA = t => {
});
};
-const test_baseNFA = t => {
- t.start("baseNFA()");
+const test_literal = t => {
+ t.start("literal()");
t.testing("the given edge connects the nodes", () => {
const expected = {
@@ -1787,17 +1794,17 @@ const test_baseNFA = t => {
},
},
};
- t.assertEq(baseNFA("X", 123), expected);
+ t.assertEq(literal("X", 123), expected);
});
};
const test_concat = t => {
t.start("concat()");
- t.testing("on baseNFA() and on more complex NFAs", () => {
- const a = baseNFA("a", 1);
- const b = baseNFA("b", a.nextID);
- const a_ = baseNFA("a", b.nextID);
+ t.testing("on literal() and on more complex NFAs", () => {
+ const a = literal("a", 1);
+ const b = literal("b", a.nextID);
+ const a_ = literal("a", b.nextID);
const expected1 = {
start: 1,
end: 4,
@@ -1861,9 +1868,9 @@ const test_concat = t => {
const test_union = t => {
t.start("union()");
- t.testing("on baseNFA() and on more complex NFAs", () => {
- const a = baseNFA("a", 1);
- const b = baseNFA("b", a.nextID);
+ t.testing("on literal() and on more complex NFAs", () => {
+ const a = literal("a", 1);
+ const b = literal("b", a.nextID);
const expected1 = {
start: 5,
end: 6,
@@ -1897,7 +1904,7 @@ const test_union = t => {
};
t.assertEq(union(a, b), expected1);
- const c = baseNFA("c", expected1.nextID);
+ const c = literal("c", expected1.nextID);
const expected2 = {
start: 9,
end: 10,
@@ -1952,8 +1959,8 @@ const test_union = t => {
const test_zeroOrMore = t => {
t.start("zeroOrMore()");
- t.testing("on baseNFA()", () => {
- const a = baseNFA("a", 1);
+ t.testing("on literal()", () => {
+ const a = literal("a", 1);
const expected = {
start: 3,
end: 4,
@@ -1984,8 +1991,8 @@ const test_zeroOrMore = t => {
const test_oneOrMore = t => {
t.start("oneOrMore()");
- t.testing("on baseNFA()", () => {
- const a = baseNFA("a", 1);
+ t.testing("on literal()", () => {
+ const a = literal("a", 1);
const expected = {
start: 1,
end: 4,
@@ -2016,8 +2023,8 @@ const test_oneOrMore = t => {
const test_zeroOrOne = t => {
t.start("zeroOrOne()");
- t.testing("on baseNFA()", () => {
- const a = baseNFA("a", 1);
+ t.testing("on literal()", () => {
+ const a = literal("a", 1);
const expected = {
start: 3,
end: 4,
@@ -2045,6 +2052,109 @@ const test_zeroOrOne = t => {
});
};
+const test_characterClass = t => {
+ t.start("characterClass()");
+
+ t.testing("when caret is true, op is excludes", () => {
+ const expected = {
+ start: 3,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 3: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "excludes",
+ to: 4,
+ matches: new Set([
+ "-", "a", "b", "c",
+ ".", "_", "$", "^",
+ ]),
+ ranges: {
+ 102: 112,
+ 116: 122,
+ },
+ },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(characterClass({
+ set: [
+ "-", "a", "b", "c", { from: "f", to: "p" }, ".",
+ { from: "t", to: "z" }, "_", "$", "^", "a", "_",
+ ],
+ caret: true,
+ }, 3), expected);
+ });
+
+ t.testing("when caret is false, op is includes", () => {
+ const expected = {
+ start: 7,
+ end: 8,
+ nextID: 9,
+ nodes: {
+ 7: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: "includes",
+ to: 8,
+ matches: new Set([]),
+ ranges: {
+ 48: 57,
+ 65: 90,
+ 97: 122,
+ },
+ },
+ },
+ 8: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(characterClass({
+ set: [
+ { from: "a", to: "z" },
+ { from: "A", to: "Z" },
+ { from: "0", to: "9" },
+ ],
+ }, 7), expected);
+ });
+};
+
+const test_wildcard = t => {
+ t.start("wildcard()");
+
+ t.testing("the returned nfa always matches", () => {
+ const expected = {
+ start: 5,
+ end: 6,
+ nextID: 7,
+ nodes: {
+ 5: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: true,
+ to: 6,
+ }
+ },
+ 6: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(wildcard("IGNORED", 5), expected);
+ });
+};
+
const test_OPERATORS_FNS = t => {
t.start("OPERATORS_FNS");
@@ -2105,6 +2215,49 @@ const test_OPERATORS_FNS = t => {
});
};
+const test_baseNFA = t => {
+ t.start("baseNFA()");
+
+ t.testing("decide between metacharacter and literal", () => {
+ const expected1 = {
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ const expected2 = {
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: true,
+ to: 2,
+ },
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(baseNFA("a", 1), expected1);
+ t.assertEq(baseNFA({ meta: "." }, 1), expected2);
+ });
+};
+
const test_buildNFAStep = t => {
t.start("buildNFAStep()");
@@ -2129,7 +2282,7 @@ const test_buildNFAStep = t => {
});
t.testing("literal char with existing nextID", () => {
- const input = baseNFA("a", 1);
+ const input = [ literal("a", 1) ];
const expected = [{
start: 1,
end: 2,
@@ -2159,15 +2312,56 @@ const test_buildNFAStep = t => {
},
},
}];
- t.assertEq(buildNFAStep([input], "b"), expected);
+ t.assertEq(buildNFAStep(input, "b"), expected);
t.assertEq(
- buildNFAStep([input], "b"),
- [ baseNFA("a", 1), baseNFA("b", 3) ]
+ buildNFAStep(input, "b"),
+ [ literal("a", 1), literal("b", 3) ]
+ );
+ });
+
+ t.testing("metacharacter", () => {
+ const input = [ literal("a", 1) ];
+ const expected = [{
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ }, {
+ start: 3,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 3: {
+ direct: [],
+ transitions: {},
+ meta: {
+ op: true,
+ to: 4,
+ },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ }];
+ t.assertEq(
+ buildNFAStep(input, { meta: "." }),
+ expected,
);
});
t.testing("operator", () => {
- const input = [ baseNFA("a", 1), baseNFA("b", 3) ];
+ const input = [ literal("a", 1), literal("b", 3) ];
const expected = [{
start: 1,
end: 4,
@@ -3111,13 +3305,16 @@ runTests([
test_toPostfixStep,
test_toPostfix,
test_emptyNFA,
- test_baseNFA,
+ test_literal,
test_concat,
test_union,
test_zeroOrMore,
test_oneOrMore,
test_zeroOrOne,
+ test_characterClass,
+ test_wildcard,
test_OPERATORS_FNS,
+ test_baseNFA,
test_buildNFAStep,
test_buildNFA,
test_allDirects,