diff options
author | EuAndreh <eu@euandre.org> | 2025-07-16 17:37:25 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2025-07-16 17:55:51 -0300 |
commit | d7247c4e860d407248633874e24c65328c0f375b (patch) | |
tree | 54e403bb53f43688fd17587a7dcc5bc8deb5a6e1 | |
parent | Build NFA nodes for "." and "class" metacharacters (diff) | |
download | paca-d7247c4e860d407248633874e24c65328c0f375b.tar.gz paca-d7247c4e860d407248633874e24c65328c0f375b.tar.xz |
Compress character class when compiling NFA.
Do not change any observable behaviour outside of `characterClass()`, as
the new output is 100% semantically compatible, but faster and most
importantly, much smaller.
* src/paca.mjs
(characterClass): Leverage `compressCharacterRanges()` when processing
the given set. Also use the same compressed range to filter out set
matches when they are already within the range.
(compareRange): Add function that sorts based on the first numeric
element of the range. When they're equal, use the second element as
the tie breaker.
(compressRangeStep): Add function that takes ranges in sorted order
tries to mush them together when the start of the second (`from`) is
contained by the first (`curr[1]`). Since the ranges are given to
us sorted, we already know that the start of the second is greater
than or equal to the start of the first. When this is the case, we
pick the largest ending to merge the ranges, otherwise we just place
them one next to the other, sequentially.
(compressCharacterRanges): Add function that sorts the ranges and
reduces them with `compressRangeStep()`. Here we have to give an
empty array as the initial value to prevent `compressRangeStep()` to
be given ranges as `[m, n]`, instead of either `[]` or `[[m, n]]`.
This is also why there's a check for `!curr` in
`compressRangeStep()` - to plug the other side of this adaptation.
(inRange): Add function that checks if the given character is
contained by any of the ranges of the given object. At the end,
instead of looking through every `from`, all we need is a single
match, so we use `.some()` instead.
* tests/paca.mjs
(test_characterClass): Add test case for a range that collapses many
literal character matches and many ranges into a single range.
(test_{compareRange,compressRangeStep,compressCharacterRanges,inRange):
Add routine sample-based test cases.
-rw-r--r-- | src/paca.mjs | 36 | ||||
-rw-r--r-- | tests/paca.mjs | 141 |
2 files changed, 174 insertions, 3 deletions
diff --git a/src/paca.mjs b/src/paca.mjs index 0625d5c..77a7a4b 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -623,14 +623,44 @@ const zeroOrOne = nfa => { }; }; +const compareRange = (lhs, rhs) => + lhs[0] !== rhs[0] ? lhs[0] - rhs[0] : lhs[1] - rhs[1]; + +const compressRangeStep = (ranges, [from, to]) => { + const curr = last(ranges); + if (!curr) { + return [[from, to]]; + } + return from <= curr[1] + ? butlast(ranges).concat([[curr[0], max(to, curr[1])]]) + : ranges.concat([[from, to]]); +}; + +const compressCharacterRanges = ranges => + ranges.toSorted(compareRange).reduce(compressRangeStep, []); + +const inRange = (ranges, char) => { + const code = char.charCodeAt(0); + const froms = Object.keys(ranges).map(Number).filter( + from => from <= code, + ); + return froms.some(from => code <= ranges[from]); +}; + const characterClass = ({ set, caret }, id) => { const start = id + 0; const end = id + 1; const nextID = id + 2; const { string, object } = Object.groupBy(set, x => typeof x); - const matches = new Set(string); - const ranges = Object.fromEntries(object.map( - ({ from, to }) => [ from.charCodeAt(0), to.charCodeAt(0) ], + const ranges = Object.fromEntries( + compressCharacterRanges((object || []).map( + ({ from, to }) => [ + from.charCodeAt(0), to.charCodeAt(0) + ], + ), + )); + const matches = new Set((string || []).filter( + char => !inRange(ranges, char), )); return { start, diff --git a/tests/paca.mjs b/tests/paca.mjs index 06b3c8b..efee446 100644 --- a/tests/paca.mjs +++ b/tests/paca.mjs @@ -28,6 +28,10 @@ import { zeroOrMore, oneOrMore, zeroOrOne, + compareRange, + compressRangeStep, + compressCharacterRanges, + inRange, characterClass, wildcard, OPERATORS_FNS, @@ -2052,6 +2056,102 @@ const test_zeroOrOne = t => { }); }; +const test_compareRange = t => { + t.start("compareRange()"); + + t.testing("sorts ranges in ascending order", () => { + const input = [ + [3, 5], + [1, 3], + [1, 5], + [1, 2], + [2, 3], + [2, 1], + [5, 3], + [5, 1], + ]; + const expected = [ + [1, 2], + [1, 3], + [1, 5], + [2, 1], + [2, 3], + [3, 5], + [5, 1], + [5, 3], + ]; + t.assertEq(input.toSorted(compareRange), expected); + }); +}; + +const test_compressRangeStep = t => { + t.start("compressRangeStep()"); + + t.testing("concat when not overlapping", () => { + t.assertEq( + compressRangeStep([[1, 10]], [20, 30]), + [[1, 10], [20, 30]], + ); + }); + + t.testing("merge ranges when overlapping", () => { + t.assertEq( + compressRangeStep([[1, 10]], [10, 30]), + [[1, 30]], + ); + }); + + t.testing("merge with the largest intervals", () => { + t.assertEq( + compressRangeStep([[1, 10]], [2, 8]), + [[1, 10]], + ); + }); +}; + +const test_compressCharacterRanges = t => { + t.start("compressCharacterRanges()"); + + t.testing("noop when no overlaps", () => { + t.assertEq( + compressCharacterRanges([[1, 5], [11, 15], [7, 9]]), + [[1, 5], [7, 9], [11, 15]], + ); + }); + + t.testing("smash all ranges into a single one", () => { + t.assertEq( + compressCharacterRanges( + [[1, 5], [11, 15], [7, 9], [5, 7], [9, 11]], + ), + [[1, 15]], + ); + }); +}; + +const test_inRange = t => { + t.start("inRange()"); + + t.testing("always false on empty ranges", () => { + t.assertEq(inRange({}, "a"), false); + }); + + t.testing("false when outside range", () => { + t.assertEq(inRange({ 10: 11 }, "a"), false); + t.assertEq(inRange({ 1: 96 }, "a"), false); + t.assertEq(inRange({ 96: 96 }, "a"), false); + t.assertEq(inRange({ 98: 98 }, "a"), false); + t.assertEq(inRange({ 98: 127 }, "a"), false); + }); + + t.testing("true when within range", () => { + t.assertEq(inRange({ 48: 57 }, "0"), true); + t.assertEq(inRange({ 65: 90 }, "X"), true); + t.assertEq(inRange({ 97: 122 }, "a"), true); + t.assertEq(inRange({ 97: 97 }, "a"), true); + }); +}; + const test_characterClass = t => { t.start("characterClass()"); @@ -2126,6 +2226,43 @@ const test_characterClass = t => { ], }, 7), expected); }); + + t.testing("overlapping ranges collapse into a single one", () => { + const expected = { + start: 3, + end: 4, + nextID: 5, + nodes: { + 3: { + direct: [], + transitions: {}, + meta: { + op: "excludes", + to: 4, + matches: new Set(["-"]), + ranges: { + 97: 122, + }, + }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(characterClass({ + set: [ + "-", "a", "b", "c", + { from: "f", to: "p" }, + { from: "t", to: "z" }, + "a", "d", "f", "x", "y", "z", + { from: "p", to: "t" }, + { from: "a", to: "f" }, + ], + caret: true, + }, 3), expected); + }); }; const test_wildcard = t => { @@ -3311,6 +3448,10 @@ runTests([ test_zeroOrMore, test_oneOrMore, test_zeroOrOne, + test_compareRange, + test_compressRangeStep, + test_compressCharacterRanges, + test_inRange, test_characterClass, test_wildcard, test_OPERATORS_FNS, |