Compress character class when compiling NFA.

Do not change any observable behaviour outside of `characterClass()`, as the new output is 100% semantically compatible, but faster and most importantly, much smaller. * src/paca.mjs (characterClass): Leverage `compressCharacterRanges()` when processing the given set. Also use the same compressed range to filter out set matches when they are already within the range. (compareRange): Add function that sorts based on the first numeric element of the range. When they're equal, use the second element as the tie breaker. (compressRangeStep): Add function that takes ranges in sorted order tries to mush them together when the start of the second (`from`) is contained by the first (`curr[1]`). Since the ranges are given to us sorted, we already know that the start of the second is greater than or equal to the start of the first. When this is the case, we pick the largest ending to merge the ranges, otherwise we just place them one next to the other, sequentially. (compressCharacterRanges): Add function that sorts the ranges and reduces them with `compressRangeStep()`. Here we have to give an empty array as the initial value to prevent `compressRangeStep()` to be given ranges as `[m, n]`, instead of either `[]` or `[[m, n]]`. This is also why there's a check for `!curr` in `compressRangeStep()` - to plug the other side of this adaptation. (inRange): Add function that checks if the given character is contained by any of the ranges of the given object. At the end, instead of looking through every `from`, all we need is a single match, so we use `.some()` instead. * tests/paca.mjs (test_characterClass): Add test case for a range that collapses many literal character matches and many ranges into a single range. (test_{compareRange,compressRangeStep,compressCharacterRanges,inRange): Add routine sample-based test cases.
author: EuAndreh <eu@euandre.org> 2025-07-16 17:37:25 -0300
committer: EuAndreh <eu@euandre.org> 2025-07-16 17:55:51 -0300
commit: d7247c4e860d407248633874e24c65328c0f375b (patch)
tree: 54e403bb53f43688fd17587a7dcc5bc8deb5a6e1
parent: Build NFA nodes for "." and "class" metacharacters (diff)
download: paca-d7247c4e860d407248633874e24c65328c0f375b.tar.gz
paca-d7247c4e860d407248633874e24c65328c0f375b.tar.xz
2 files changed, 174 insertions, 3 deletions
diff --git a/src/paca.mjs b/src/paca.mjs
index 0625d5c..77a7a4b 100644
--- a/src/paca.mjs
+++ b/src/paca.mjs
@@ -623,14 +623,44 @@ const zeroOrOne = nfa => {
 	};
 };
 
+const compareRange = (lhs, rhs) =>
+	lhs[0] !== rhs[0] ? lhs[0] - rhs[0] : lhs[1] - rhs[1];
+
+const compressRangeStep = (ranges, [from, to]) => {
+	const curr = last(ranges);
+	if (!curr) {
+		return [[from, to]];
+	}
+	return from <= curr[1]
+		? butlast(ranges).concat([[curr[0], max(to, curr[1])]])
+		: ranges.concat([[from, to]]);
+};
+
+const compressCharacterRanges = ranges =>
+	ranges.toSorted(compareRange).reduce(compressRangeStep, []);
+
+const inRange = (ranges, char) => {
+	const code = char.charCodeAt(0);
+	const froms = Object.keys(ranges).map(Number).filter(
+		from => from <= code,
+	);
+	return froms.some(from => code <= ranges[from]);
+};
+
 const characterClass = ({ set, caret }, id) => {
 	const start  = id + 0;
 	const end    = id + 1;
 	const nextID = id + 2;
 	const { string, object } = Object.groupBy(set, x => typeof x);
-	const matches = new Set(string);
-	const ranges  = Object.fromEntries(object.map(
-		({ from, to }) => [ from.charCodeAt(0), to.charCodeAt(0) ],
+	const ranges = Object.fromEntries(
+		compressCharacterRanges((object || []).map(
+			({ from, to }) => [
+				from.charCodeAt(0), to.charCodeAt(0)
+			],
+		),
+	));
+	const matches = new Set((string || []).filter(
+		char => !inRange(ranges, char),
 	));
 	return {
 		start,
diff --git a/tests/paca.mjs b/tests/paca.mjs
index 06b3c8b..efee446 100644
--- a/tests/paca.mjs
+++ b/tests/paca.mjs
@@ -28,6 +28,10 @@ import {
 	zeroOrMore,
 	oneOrMore,
 	zeroOrOne,
+	compareRange,
+	compressRangeStep,
+	compressCharacterRanges,
+	inRange,
 	characterClass,
 	wildcard,
 	OPERATORS_FNS,
@@ -2052,6 +2056,102 @@ const test_zeroOrOne = t => {
 	});
 };
 
+const test_compareRange = t => {
+	t.start("compareRange()");
+
+	t.testing("sorts ranges in ascending order", () => {
+		const input = [
+			[3, 5],
+			[1, 3],
+			[1, 5],
+			[1, 2],
+			[2, 3],
+			[2, 1],
+			[5, 3],
+			[5, 1],
+		];
+		const expected = [
+			[1, 2],
+			[1, 3],
+			[1, 5],
+			[2, 1],
+			[2, 3],
+			[3, 5],
+			[5, 1],
+			[5, 3],
+		];
+		t.assertEq(input.toSorted(compareRange), expected);
+	});
+};
+
+const test_compressRangeStep = t => {
+	t.start("compressRangeStep()");
+
+	t.testing("concat when not overlapping", () => {
+		t.assertEq(
+			compressRangeStep([[1, 10]], [20, 30]),
+			[[1, 10], [20, 30]],
+		);
+	});
+
+	t.testing("merge ranges when overlapping", () => {
+		t.assertEq(
+			compressRangeStep([[1, 10]], [10, 30]),
+			[[1, 30]],
+		);
+	});
+
+	t.testing("merge with the largest intervals", () => {
+		t.assertEq(
+			compressRangeStep([[1, 10]], [2, 8]),
+			[[1, 10]],
+		);
+	});
+};
+
+const test_compressCharacterRanges = t => {
+	t.start("compressCharacterRanges()");
+
+	t.testing("noop when no overlaps", () => {
+		t.assertEq(
+			compressCharacterRanges([[1, 5], [11, 15], [7, 9]]),
+			[[1, 5], [7, 9], [11, 15]],
+		);
+	});
+
+	t.testing("smash all ranges into a single one", () => {
+		t.assertEq(
+			compressCharacterRanges(
+				[[1, 5], [11, 15], [7, 9], [5, 7], [9, 11]],
+			),
+			[[1, 15]],
+		);
+	});
+};
+
+const test_inRange = t => {
+	t.start("inRange()");
+
+	t.testing("always false on empty ranges", () => {
+		t.assertEq(inRange({}, "a"), false);
+	});
+
+	t.testing("false when outside range", () => {
+		t.assertEq(inRange({ 10: 11  }, "a"), false);
+		t.assertEq(inRange({ 1:  96  }, "a"), false);
+		t.assertEq(inRange({ 96: 96  }, "a"), false);
+		t.assertEq(inRange({ 98: 98  }, "a"), false);
+		t.assertEq(inRange({ 98: 127 }, "a"), false);
+	});
+
+	t.testing("true when within range", () => {
+		t.assertEq(inRange({ 48: 57  }, "0"), true);
+		t.assertEq(inRange({ 65: 90  }, "X"), true);
+		t.assertEq(inRange({ 97: 122 }, "a"), true);
+		t.assertEq(inRange({ 97: 97  }, "a"), true);
+	});
+};
+
 const test_characterClass = t => {
 	t.start("characterClass()");
 
@@ -2126,6 +2226,43 @@ const test_characterClass = t => {
 			],
 		}, 7), expected);
 	});
+
+	t.testing("overlapping ranges collapse into a single one", () => {
+		const expected = {
+			start:  3,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				3: {
+					direct:      [],
+					transitions: {},
+					meta: {
+						op:      "excludes",
+						to:      4,
+						matches: new Set(["-"]),
+						ranges: {
+							97: 122,
+						},
+					},
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(characterClass({
+			set: [
+				"-", "a", "b", "c",
+				{ from: "f", to: "p" },
+				{ from: "t", to: "z" },
+				"a", "d", "f", "x", "y", "z",
+				{ from: "p", to: "t" },
+				{ from: "a", to: "f" },
+			],
+			caret: true,
+		}, 3), expected);
+	});
 };
 
 const test_wildcard = t => {
@@ -3311,6 +3448,10 @@ runTests([
 	test_zeroOrMore,
 	test_oneOrMore,
 	test_zeroOrOne,
+	test_compareRange,
+	test_compressRangeStep,
+	test_compressCharacterRanges,
+	test_inRange,
 	test_characterClass,
 	test_wildcard,
 	test_OPERATORS_FNS,
author	EuAndreh <eu@euandre.org>	2025-07-16 17:37:25 -0300
committer	EuAndreh <eu@euandre.org>	2025-07-16 17:55:51 -0300
commit	d7247c4e860d407248633874e24c65328c0f375b (patch)
tree	54e403bb53f43688fd17587a7dcc5bc8deb5a6e1
parent	Build NFA nodes for "." and "class" metacharacters (diff)
download	paca-d7247c4e860d407248633874e24c65328c0f375b.tar.gz paca-d7247c4e860d407248633874e24c65328c0f375b.tar.xz