Implement v0 version of NFA and DFA; WIP tests

author: EuAndreh <eu@euandre.org> 2025-07-01 13:45:29 -0300
committer: EuAndreh <eu@euandre.org> 2025-07-07 06:09:16 -0300
commit: 6b626f2e0408e52f771e0895648b4b75060ec082 (patch)
tree: c2641eddadbaaf211208c50a4a9ee0172382f95b
parent: Initial empty commit (diff)
download: paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz
paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz
6 files changed, 1898 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bf8600d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/node_modules/
+/src/*.exported.mjs
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..aa02a58
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,105 @@
+.POSIX:
+DATE         = 1970-01-01
+VERSION      = 0.1.0
+NAME         = paca
+NAME_UC      = $(NAME)
+## Installation prefix.  Defaults to "/usr".
+PREFIX       = /usr
+BINDIR       = $(PREFIX)/bin
+LIBDIR       = $(PREFIX)/lib
+JSLIBDIR     = $(LIBDIR)/javascript
+INCLUDEDIR   = $(PREFIX)/include
+SRCDIR       = $(PREFIX)/src/$(NAME)
+SHAREDIR     = $(PREFIX)/share
+LOCALEDIR    = $(SHAREDIR)/locale
+MANDIR       = $(SHAREDIR)/man
+EXEC         = ./
+## Where to store the installation.  Empty by default.
+DESTDIR      =
+LDLIBS       =
+JSIMPL       = node
+
+
+
+.SUFFIXES:
+.SUFFIXES: .mjs .mjs-check
+
+
+
+all:
+include deps.mk
+
+sources = \
+	src/$(NAME).mjs                 \
+
+
+derived-assets = \
+	src/$(NAME).exported.mjs       \
+
+side-assets = \
+
+
+
+## Default target.  Builds all artifacts required for testing
+## and installation.
+all: $(derived-assets)
+
+
+src/$(NAME).exported.mjs: src/$(NAME).mjs Makefile
+	cp src/$(NAME).mjs $@
+	printf '\n\nexport {\n'                                   >> $@
+	awk '/^const / { printf "\t%s,\n", $$2 }' src/$(NAME).mjs >> $@
+	printf '}\n'                                              >> $@
+
+
+
+tests/$(NAME).mjs-check: src/$(NAME).exported.mjs
+	$(JSIMPL) $*.mjs
+
+check-unit: tests/$(NAME).mjs-check
+
+
+integration-tests = \
+
+.PRECIOUS: $(integration-tests)
+$(integration-tests): ALWAYS
+	sh $@
+
+check-integration: $(integration-tests)
+
+
+## Run all tests.  Each test suite is isolated, so that a parallel
+## build can run tests at the same time.  The required artifacts
+## are created if missing.
+check: check-unit check-integration
+
+
+
+## Remove *all* derived artifacts produced during the build.
+## A dedicated test asserts that this is always true.
+clean:
+	rm -rf $(derived-assets) $(side-assets)
+
+
+## Installs into $(DESTDIR)$(PREFIX).  Its dependency target
+## ensures that all installable artifacts are crafted beforehand.
+install: all
+	mkdir -p \
+		'$(DESTDIR)$(BINDIR)'     \
+		'$(DESTDIR)$(JSLIBDIR)'   \
+		'$(DESTDIR)$(SRCDIR)'     \
+
+	cp src/$(NAME).mjs    '$(DESTDIR)$(JSLIBDIR)'
+	cp $(sources)        '$(DESTDIR)$(SRCDIR)'
+
+## Uninstalls from $(DESTDIR)$(PREFIX).  This is a perfect mirror
+## of the "install" target, and removes *all* that was installed.
+## A dedicated test asserts that this is always true.
+uninstall:
+	rm -rf \
+		'$(DESTDIR)$(JSLIBDIR)'/$(NAME).mjs    \
+		'$(DESTDIR)$(SRCDIR)'                 \
+
+
+
+ALWAYS:
diff --git a/deps.mk b/deps.mk
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deps.mk
diff --git a/mkdeps.sh b/mkdeps.sh
new file mode 100755
index 0000000..e5606ff
--- /dev/null
+++ b/mkdeps.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+set -eu
+
+export LANG=POSIX.UTF-8
diff --git a/src/paca.mjs b/src/paca.mjs
new file mode 100644
index 0000000..a0e607b
--- /dev/null
+++ b/src/paca.mjs
@@ -0,0 +1,436 @@
+import {
+	butlast, explode, last, mapValues, max, reduce, reduced, repeat,
+} from "sjs";
+
+
+
+export class SyntaxError extends Error {}
+
+const ConcatStep = {
+	ACCEPTING: "accepting",
+	ESCAPING:  "escaping",
+};
+
+const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);
+
+const shouldConcat = (char, next) =>
+	next !== undefined &&
+	char !== "(" &&
+	char !== "|" &&
+	!nonConcatOperators.has(next);
+
+const isOperator = char =>
+	nonConcatOperators.has(char) || char == "(";
+
+const tokenizeRegexStep = chars => ({ out, state }, char, index) => {
+	const next = chars[index + 1];
+	const maybeConcat = shouldConcat(char, next)
+		? [{operator: "concat"}]
+		: [];
+
+	if (state === ConcatStep.ESCAPING) {
+		return {
+			out: out.concat(char, maybeConcat),
+			state: ConcatStep.ACCEPTING,
+		};
+	}
+
+	if (char === "\\") {
+		if (next === undefined) {
+			throw new SyntaxError(
+				`bad trailing escape character: ${chars}`,
+			);
+		}
+
+		return {
+			out,
+			state: ConcatStep.ESCAPING,
+		};
+	}
+
+	const op = isOperator(char) ? { operator: char } : char;
+	return {
+		out: out.concat(op, maybeConcat),
+		state,
+	};
+};
+
+const tokenizeRegexFn = chars =>
+	chars.reduce(tokenizeRegexStep(chars), {
+		out:   [],
+		state: ConcatStep.ACCEPTING,
+	});
+
+const tokenizeRegex = chars =>
+	tokenizeRegexFn(chars).out;
+
+const PRECEDENCE = {
+	"(":      4,
+	"*":      3,
+	"+":      3,
+	"?":      3,
+	"concat": 2,
+	"|":      1,
+};
+
+const shouldPush = (stack, token) =>
+	stack.length      === 0 ||
+	token.operator    === "(" ||
+	stack[0].operator === "(";
+
+const toPostfixStep = ({ out, stack }, token, _index, tokens) => {
+	if (typeof token === "string") {
+		return {
+			out: out.concat(token),
+			stack,
+		};
+	}
+
+	if (shouldPush(stack, token)) {
+		return {
+			out,
+			stack: [token].concat(stack),
+		};
+	}
+
+	if (token.operator === ")") {
+		const openParenIndex = stack.findIndex(o => o.operator === "(");
+		if (openParenIndex === -1) {
+			throw new SyntaxError(
+				`close parens without opening: ${tokens}`,
+			);
+		}
+
+		const ops = stack.slice(0, openParenIndex);
+		const leftover = stack.slice(openParenIndex + 1);
+		return {
+			out:   out.concat(ops),
+			stack: leftover,
+		};
+	}
+
+	const idx = stack.findIndex(el =>
+		el.operator === "(" ||
+			PRECEDENCE[el.operator] < PRECEDENCE[token.operator],
+	);
+
+	if (idx < 0) {
+		return {
+			out:   out.concat(stack),
+			stack: [token],
+		};
+	} else {
+		const ops = stack.slice(0, idx);
+		const leftover = stack.slice(idx);
+		return {
+			out:   out.concat(ops),
+			stack: [token].concat(leftover),
+		};
+	}
+};
+
+const toPostfix = tokens => {
+	const { out, stack } = tokens.reduce(toPostfixStep, {
+		out:   [],
+		stack: [],
+	});
+	return out.concat(stack);
+};
+
+const emptyNFA = () => {
+	const startID = 1;
+	const endID   = 2;
+	const nextID  = 3;
+	return {
+		start: startID,
+		end:   endID,
+		nextID,
+		nodes: {
+			[startID]: {
+				direct:      [ endID ],
+				transitions: {},
+			},
+			[endID]:   {
+				direct:      [],
+				transitions: {},
+			},
+		}
+	};
+};
+
+const baseNFA = (edge, id) => {
+	const startID = id + 0;
+	const endID   = id + 1;
+	const nextID  = id + 2;
+	return {
+		start: startID,
+		end:   endID,
+		nextID,
+		nodes: {
+			[startID]: {
+				direct:      [],
+				transitions: {
+					[edge]: endID,
+				},
+			},
+			[endID]: {
+				direct:      [],
+				transitions: {},
+			},
+		},
+	};
+};
+
+const concat = (lhs, rhs) => ({
+	start:  lhs.start,
+	end:    rhs.end,
+	nextID: max(lhs.nextID, rhs.nextID),
+	nodes: {
+		...lhs.nodes,
+		...rhs.nodes,
+		[lhs.end]: {
+			...lhs.nodes[lhs.end],
+			direct: lhs.nodes[lhs.end].direct.concat([
+				rhs.start,
+			]),
+		}
+	},
+});
+
+const union = (lhs, rhs) => {
+	const startID = max(lhs.nextID, rhs.nextID);
+	const endID   = startID + 1;
+	const nextID  = startID + 2;
+	return {
+		start: startID,
+		end:   endID,
+		nextID,
+		nodes: {
+			...lhs.nodes,
+			...rhs.nodes,
+			[lhs.end]: {
+				...lhs.nodes[lhs.end],
+				direct: lhs.nodes[lhs.end].direct.concat([
+					endID,
+				]),
+			},
+			[rhs.end]: {
+				...rhs.nodes[rhs.end],
+				direct: lhs.nodes[lhs.end].direct.concat([
+					endID,
+				]),
+			},
+			[startID]: {
+				direct:      [lhs.start, rhs.start],
+				transitions: {},
+			},
+			[endID]: {
+				direct:      [],
+				transitions: {},
+			},
+		},
+	};
+};
+
+const zeroOrMore = nfa => {
+	const startID = nfa.nextID;
+	const endID   = startID + 1;
+	const nextID  = startID + 2;
+	return {
+		start: startID,
+		end:   endID,
+		nextID,
+		nodes: {
+			...nfa.nodes,
+			[nfa.end]: {
+				...nfa.nodes[nfa.end],
+				direct: nfa.nodes[nfa.end].direct.concat([
+					nfa.start,
+					endID,
+				]),
+			},
+			[startID]: {
+				direct:      [ nfa.start, endID ],
+				transitions: {},
+			},
+			[endID]: {
+				direct:      [],
+				transitions: {},
+			},
+		},
+	};
+};
+
+const oneOrMore = nfa =>
+	concat(nfa, zeroOrMore(nfa));
+
+const zeroOrOne = nfa => {
+	const startID = nfa.nextID;
+	const endID   = startID + 1;
+	const nextID  = startID + 2;
+	return {
+		start: startID,
+		end:   endID,
+		nextID,
+		nodes: {
+			...nfa.nodes,
+			[nfa.end]: {
+				...nfa.nodes[nfa.end],
+				direct: nfa.nodes[nfa.end].direct.concat([
+					endID,
+				]),
+			},
+			[startID]: {
+				direct:      [ nfa.start, endID ],
+				transitions: {},
+			},
+			[endID]: {
+				direct:      [],
+				transitions: {},
+			},
+		},
+	};
+};
+
+const OPERATORS_FNS = ({
+	zeroOrMoreFn = zeroOrMore,
+	oneOrMoreFn  = oneOrMore,
+	zeroOrOneFn  = zeroOrOne,
+	unionFn      = union,
+	concatFn     = concat,
+} = {}) => ({
+	"*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))),
+	"+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))),
+	"?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))),
+	"|": stack => butlast(butlast(stack)).concat(unionFn(
+		last(butlast(stack)),
+		last(stack),
+	)),
+	"concat": stack => butlast(butlast(stack)).concat(concatFn(
+		last(butlast(stack)),
+		last(stack),
+	)),
+});
+
+const OPERATORS = OPERATORS_FNS();
+
+const buildNFAStep = (stack, token) =>
+	typeof token === "string"
+		? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
+		: OPERATORS[token.operator](stack);
+
+const buildNFA = tokens =>
+	tokens.length === 0
+		? emptyNFA()
+		: last(tokens.reduce(buildNFAStep, []));
+
+const allDirects = (stateID, nodes) =>
+	nodes[stateID].direct.length === 0
+		? [ stateID ]
+		: [...new Set(
+			nodes[stateID].direct.map(
+				id => allDirects(id, nodes),
+			).flat(),
+		)].toSorted();
+
+const searchNFAStep = nodes => (directStates, char) =>
+	[...new Set(
+		directStates
+			.map(id => nodes[id].transitions[char])
+			.filter(x => !!x)
+			.map(id => allDirects(id, nodes))
+			.flat()
+	)];
+
+const searchNFAFn = (nfa, string) =>
+	explode(string).reduce(
+		searchNFAStep(nfa.nodes),
+		allDirects(nfa.start, nfa.nodes),
+	);
+
+const searchNFA = (nfa, string) =>
+	searchNFAFn(nfa, string).includes(nfa.end);
+
+const nodeID = states =>
+	[...new Set(states)]
+		.toSorted((a, b) => a - b)
+		.map(x => x.toString())
+		.join("-");
+
+const unNodeID = stateID =>
+	stateID.split("-").map(Number);
+
+const collectTransitions = (nfaNodes, states) => {
+	const transitions = states.map(
+		id => Object.entries(nfaNodes[id].transitions),
+	).filter(t => t.length !== 0);
+	if (!transitions.every(t => t.length === 1)) {
+		throw new Error("unexpected transitions with more than 1 key");
+	}
+	const groupedTransitions = Object.groupBy(
+		transitions.map(t => t[0]),
+		([key, stateID]) => key,
+	);
+	return mapValues(
+		groupedTransitions,
+		entry =>
+			entry.map(([_, id]) =>
+				allDirects(id, nfaNodes)).flat(),
+	);
+};
+
+const computePending = (pending, transitions) =>
+	[...new Set(
+		pending.slice(1).concat(transitions)
+			.map(nodeID))
+	].map(unNodeID);
+
+const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => {
+	if (pending.length === 0) {
+		return dfaNodes;
+	}
+
+	const states  = pending[0];
+	const stateID = nodeID(states);
+	if (!!dfaNodes[stateID]) {
+		return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes);
+	}
+
+	const transitions = collectTransitions(nfaNodes, states);
+	const newPending = computePending(pending, Object.values(transitions));
+	return buildDFANodes(end, nfaNodes, newPending, {
+		...dfaNodes,
+		[stateID]: {
+			end: states.includes(end),
+			transitions: mapValues(transitions, nodeID),
+		},
+	});
+};
+
+const buildDFA = nfa => {
+	const start = allDirects(nfa.start, nfa.nodes);
+	return {
+		start: nodeID(start),
+		nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}),
+	};
+};
+
+const searchDFAStep = nodes => (state, char) =>
+	nodes[state].transitions[char] || reduced(false);
+
+const searchDFAFn = (dfa, string) =>
+	reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start);
+
+const searchDFA = (dfa, string) =>
+	!!dfa.nodes[searchDFAFn(dfa, string)]?.end
+
+const compileNFA = regex =>
+	buildNFA(toPostfix(tokenizeRegex(explode(regex))));
+
+const compileDFA = regex =>
+	buildDFA(compileNFA(regex));
+
+export const compile = regex => {
+	const nfa = compileDFA(regex);
+	return string => searchDFA(nfa, string);
+};
diff --git a/tests/paca.mjs b/tests/paca.mjs
new file mode 100644
index 0000000..6390204
--- /dev/null
+++ b/tests/paca.mjs
@@ -0,0 +1,1351 @@
+import { explode, runTests } from "sjs";
+
+import {
+	SyntaxError,
+	ConcatStep,
+	shouldConcat,
+	isOperator,
+	tokenizeRegexStep,
+	tokenizeRegexFn,
+	tokenizeRegex,
+	shouldPush,
+	toPostfixStep,
+	toPostfix,
+	emptyNFA,
+	baseNFA,
+	concat,
+	union,
+	zeroOrMore,
+	oneOrMore,
+	zeroOrOne,
+	OPERATORS_FNS,
+	buildNFAStep,
+	buildNFA,
+	allDirects,
+	searchNFAStep,
+	searchNFAFn,
+	searchNFA,
+	nodeID,
+	unNodeID,
+	collectTransitions,
+	computePending,
+	buildDFANodes,
+	buildDFA,
+	searchDFAStep,
+	searchDFAFn,
+	searchDFA,
+	compileNFA,
+	compileDFA,
+	compile,
+} from "../src/paca.exported.mjs";
+
+
+
+const test_shouldConcat = t => {
+	t.start("shouldConcat()");
+
+	t.testing("do concat literal chars", () => {
+		t.assertEq(shouldConcat("a", "b"), true);
+	});
+
+	t.testing("but not when they're the last one", () => {
+		t.assertEq(shouldConcat("a", undefined), false);
+		t.assertEq(shouldConcat("b", undefined), false);
+	});
+
+	t.testing("do not concat when starting a parens group", () => {
+		t.assertEq(shouldConcat("(", "a"), false);
+		t.assertEq(shouldConcat("(", "b"), false);
+	});
+
+	t.testing("do not concat when starting a union", () => {
+		t.assertEq(shouldConcat("|", "a"), false);
+		t.assertEq(shouldConcat("|", "b"), false);
+	});
+
+	t.testing("do not concat when next is special", () => {
+		t.assertEq(shouldConcat("a", "*"), false);
+		t.assertEq(shouldConcat("a", "|"), false);
+		t.assertEq(shouldConcat("a", ")"), false);
+	});
+};
+
+const test_isOperator = t => {
+	t.start("isOperator()");
+
+	t.testing("operators and open parens are true", () => {
+		t.assertEq(isOperator("*"), true);
+		t.assertEq(isOperator("|"), true);
+		t.assertEq(isOperator("+"), true);
+		t.assertEq(isOperator("?"), true);
+		t.assertEq(isOperator("("), true);
+		t.assertEq(isOperator(")"), true);
+	});
+
+	t.testing("false for everyday non-meta chars", () => {
+		t.assertEq(isOperator("a"), false);
+		t.assertEq(isOperator("_"), false);
+	});
+};
+
+const test_tokenizeRegexStep = t => {
+	t.start("tokenizeRegexStep()");
+
+	t.testing("when escaping we get whatever the char is", () => {
+		const stepFn = tokenizeRegexStep("ab|(cd)*");
+		const steps = [{
+			char: "a",
+			index: 0,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["a", { operator: "concat" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "b",
+			index: 1,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["b"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "|",
+			index: 2,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["|"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "(",
+			index: 3,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["("],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "c",
+			index: 4,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["c", {operator: "concat"}],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "d",
+			index: 5,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["d"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: ")",
+			index: 6,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   [")"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "*",
+			index: 7,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["*"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}];
+		for (const step of steps) {
+			t.assertEq(
+				stepFn(step.in, step.char, step.index),
+				step.out,
+			);
+		}
+	});
+
+	t.testing("trailing escape errors", () => {
+		try {
+			t.assertEq(tokenizeRegexStep("\\")({
+				out:   [],
+				index: 0,
+				state: ConcatStep.ACCEPTING,
+			}, "\\"), null);
+			t.assertEq(true, false);
+		} catch (e) {
+			const message = "bad trailing escape character: ";
+			t.assertEq(
+				e.message.substring(0, message.length),
+				message,
+			);
+			t.assertEq(e instanceof SyntaxError, true);
+		}
+	});
+
+	t.testing("escape makes it enter escaping mode", () => {
+		const stepFn = tokenizeRegexStep("\\a\\*");
+		const steps = [{
+			char: "\\",
+			index: 0,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+		}, {
+			char: "a",
+			index: 1,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["a", { operator: "concat" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "\\",
+			index: 2,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+		}, {
+			char: "*",
+			index: 3,
+			in: {
+				out:   [],
+				state: ConcatStep.ESCAPING,
+			},
+			out: {
+				out:   ["*"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}];
+		for (const step of steps) {
+			t.assertEq(
+				stepFn(step.in, step.char, step.index),
+				step.out,
+			);
+		}
+	});
+
+	t.testing("operators get detected as such", () => {
+		const stepFn = tokenizeRegexStep("ab|(cd)*");
+		const steps = [{
+			char: "a",
+			index: 0,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   ["a", { operator: "concat" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "b",
+			index: 1,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   ["b"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "|",
+			index: 2,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [{ operator: "|" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "(",
+			index: 3,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [{ operator: "(" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "c",
+			index: 4,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   ["c", { operator: "concat" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "d",
+			index: 5,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   ["d"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: ")",
+			index: 6,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [{ operator: ")" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			char: "*",
+			index: 7,
+			in: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+			out: {
+				out:   [{ operator: "*" }],
+				state: ConcatStep.ACCEPTING,
+			},
+		}];
+		for (const step of steps) {
+			t.assertEq(
+				stepFn(step.in, step.char, step.index),
+				step.out,
+			);
+		}
+	});
+};
+
+const test_tokenizeRegexFn = t => {
+	t.start("tokenizeRegexFn");
+
+	t.testing("regex table", () => {
+		const concat = { operator: "concat" };
+		const star   = { operator: "*" };
+		const table = [{
+			in:       "",
+			expected: {
+				out:   [],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			in:       "a",
+			expected: {
+				out:   ["a"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			in:       "a*",
+			expected: {
+				out:   ["a", star],
+				state: ConcatStep.ACCEPTING,
+			},
+		}, {
+			in:       "a*b",
+			expected: {
+				out:   ["a", star, concat, "b"],
+				state: ConcatStep.ACCEPTING,
+			},
+		}];
+		for (const test of table) {
+			t.assertEq(
+				tokenizeRegexFn(explode(test.in)),
+				test.expected,
+			);
+		}
+	});
+};
+
+const test_tokenizeRegex = t => {
+	t.start("tokenizeRegex()");
+
+	t.testing("regex table", () => {
+		const concat = { operator: "concat" };
+		const star   = { operator: "*" };
+		const table = [{
+			in:       "",
+			expected: []
+		}, {
+			in:       "a",
+			expected: [ "a" ],
+		}, {
+			in:       "ab",
+			expected: [ "a", concat, "b" ],
+		}, {
+			in:       "abc",
+			expected: [ "a", concat, "b", concat, "c" ],
+		}, {
+			in:       "a*",
+			expected: [ "a", star],
+		}, {
+			in:       "ab*",
+			expected: [ "a", concat, "b", star],
+		}, {
+			in:       "a*b*",
+			expected: [ "a", star, concat, "b", star],
+		}, {
+			in:       "a|b",
+			expected: [ "a", { operator: "|" }, "b" ],
+		}, {
+			in:       "a(bc(d))ef",
+			expected: [
+				"a",
+				concat,
+				{ operator: "(" },
+				"b",
+				concat,
+				"c",
+				concat,
+				{ operator: "(" },
+				"d",
+				{ operator: ")" },
+				{ operator: ")" },
+				concat,
+				"e",
+				concat,
+				"f",
+			],
+		}, {
+			in:       "ab|(cd)*",
+			expected: [
+				"a",
+				concat,
+				"b",
+				{ operator: "|" },
+				{ operator: "(" },
+				"c",
+				concat,
+				"d",
+				{ operator: ")" },
+				star,
+			],
+		}, {
+			in:       "(a|b)*c",
+			expected: [
+				{ operator: "(" },
+				"a",
+				{ operator: "|" },
+				"b",
+				{ operator: ")" },
+				star,
+				concat,
+				"c",
+			],
+		}];
+		for (const test of table) {
+			t.assertEq(
+				tokenizeRegex(explode(test.in)),
+				test.expected,
+			);
+		}
+	});
+};
+
+const test_shouldPush = t => {
+	t.start("shouldPush()");
+
+	t.testing("always push on empty stack", () => {
+		t.assertEq(shouldPush([], "a"), true);
+		t.assertEq(shouldPush([], null), true);
+	});
+
+	t.testing(`push when operator or stack top is "("`, () => {
+		t.assertEq(shouldPush([{}], { operator: "(" }), true);
+		t.assertEq(shouldPush([{ operator: "("}], {}), true);
+	});
+
+	t.testing("don't push otherwise", () => {
+		t.assertEq(shouldPush([{}], {}), false);
+	});
+};
+
+const test_toPostfixStep = t => {
+	t.start("toPostfixStep()");
+
+	t.testing("string literals go directly to out", () => {
+		t.assertEq(
+			toPostfixStep({ out: ["a"], stack: "stack" }, "t"),
+			{ out: ["a", "t"], stack: "stack" },
+		);
+	});
+
+	t.testing("parens put things on the stack", () => {
+		t.assertEq(
+			toPostfixStep({
+				out: "out",
+				stack: [{ operator: "(" }],
+			}, { operator: "X" }),
+			{ out: "out", stack: [
+				{ operator: "X" },
+				{ operator: "(" },
+			]},
+		);
+	});
+
+	// FIXME: more tests
+};
+
+const test_toPostfix = t => {
+	t.start("toPostfix()");
+
+	t.testing("regex table", () => {
+		const concat = { operator: "concat" };
+		const table = [{
+			in:       [],
+			expected: []
+		}, {
+			in:       [ "a" ],
+			expected: [ "a" ],
+		}, {
+			in:       [ "a", concat, "b", concat, "c" ],
+			expected: [ "a", "b", concat, "c", concat ],
+		}, {
+			in:       [ "a", { operator: "*" }],
+			expected: [ "a", { operator: "*" }],
+		}, {
+			in:       [ "a", { operator: "|" }, "b" ],
+			expected: [ "a", "b", { operator: "|" } ],
+		}, {
+			in: [
+				"a",
+				concat,
+				"b",
+				{ operator: "|" },
+				{ operator: "(" },
+				"c",
+				concat,
+				"d",
+				{ operator: ")" },
+				{ operator: "*" },
+			],
+			expected: [
+				"a",
+				"b",
+				concat,
+				"c",
+				"d",
+				concat,
+				{ operator: "*" },
+				{ operator: "|" },
+			],
+		}, {
+			in: [
+				{ operator: "(" },
+				"a",
+				{ operator: "|" },
+				"b",
+				{ operator: ")" },
+				{ operator: "*" },
+				concat,
+				"c",
+			],
+			expected: [
+				"a",
+				"b",
+				{ operator: "|" },
+				{ operator: "*" },
+				"c",
+				concat,
+			],
+		}];
+		for (const test of table) {
+			t.assertEq(
+				toPostfix(test.in),
+				test.expected,
+			);
+		}
+	});
+};
+
+const test_emptyNFA = t => {
+	t.start("emptyNFA()");
+
+	t.testing("we deterministically build out of the ID", () => {
+		const expected = {
+			start:  1,
+			end:    2,
+			nextID: 3,
+			nodes: {
+				1: {
+					direct: [ 2 ],
+					transitions: {},
+				},
+				2: {
+					direct: [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(emptyNFA(), expected);
+	});
+};
+
+const test_baseNFA = t => {
+	t.start("baseNFA()");
+
+	t.testing("the given edge connects the nodes", () => {
+		const expected = {
+			start:  123,
+			end:    124,
+			nextID: 125,
+			nodes:  {
+				123: {
+					direct:      [],
+					transitions: { X: 124 },
+				},
+				124: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(baseNFA("X", 123), expected);
+	});
+};
+
+const test_concat = t => {
+	t.start("concat()");
+
+	t.testing("on baseNFA() and on more complex NFAs", () => {
+		const a  = baseNFA("a", 1);
+		const b  = baseNFA("b", a.nextID);
+		const a_ = baseNFA("a", b.nextID);
+		const expected1 = {
+			start:  1,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 3 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(concat(a, b), expected1);
+
+		const expected2 = {
+			start:  1,
+			end:    6,
+			nextID: 7,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 3 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [ 5 ],
+					transitions: {},
+				},
+				5: {
+					direct:      [],
+					transitions: { a: 6 },
+				},
+				6: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(concat(concat(a, b), a_), expected2);
+	});
+};
+
+const test_union = t => {
+	t.start("union()");
+
+	t.testing("on baseNFA() and on more complex NFAs", () => {
+		const a = baseNFA("a", 1);
+		const b = baseNFA("b", a.nextID);
+		const expected1 = {
+			start:  5,
+			end:    6,
+			nextID: 7,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				5: {
+					direct:      [ 1, 3 ],
+					transitions: {},
+				},
+				6: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(union(a, b), expected1);
+
+		const c = baseNFA("c", expected1.nextID);
+		const expected2 = {
+			start:  9,
+			end:    10,
+			nextID: 11,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				5: {
+					direct:      [ 1, 3 ],
+					transitions: {},
+				},
+				6: {
+					direct:      [ 10 ],
+					transitions: {},
+				},
+				7: {
+					direct:      [],
+					transitions: { c: 8 },
+				},
+				8: {
+					direct:      [ 10 ],
+					transitions: {},
+				},
+				9: {
+					direct:      [ 5, 7 ],
+					transitions: {},
+				},
+				10: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(union(union(a, b), c), expected2);
+	});
+};
+
+const test_zeroOrMore = t => {
+	t.start("zeroOrMore()");
+
+	t.testing("on baseNFA()", () => {
+		const a = baseNFA("a", 1);
+		const expected = {
+			start:  3,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 1, 4 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [ 1, 4 ],
+					transitions: {},
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(zeroOrMore(a), expected);
+	});
+};
+
+const test_oneOrMore = t => {
+	t.start("oneOrMore()");
+
+	t.testing("on baseNFA()", () => {
+		const a = baseNFA("a", 1);
+		const expected = {
+			start:  1,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 3 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [ 1, 4 ],
+					transitions: {},
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(oneOrMore(a), expected);
+	});
+};
+
+const test_zeroOrOne = t => {
+	t.start("zeroOrOne()");
+
+	t.testing("on baseNFA()", () => {
+		const a = baseNFA("a", 1);
+		const expected = {
+			start:  3,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 4 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [ 1, 4 ],
+					transitions: {},
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(zeroOrOne(a), expected);
+	});
+};
+
+const test_OPERATORS_FNS = t => {
+	t.start("OPERATORS_FNS");
+
+	const arr = [];
+	const fn = ret => (...args) => {
+		arr.push({
+			args,
+			ret,
+		});
+		return ret;
+	};
+	const input = [1, 2, 3];
+
+
+	t.testing("star", () => {
+		arr.splice(0);
+		t.assertEq(
+			OPERATORS_FNS({ zeroOrMoreFn: fn(111) })["*"](input),
+			[1, 2, 111],
+		);
+		t.assertEq(arr, [{ args: [3], ret: 111 }]);
+	});
+
+	t.testing("plus", () => {
+		arr.splice(0);
+		t.assertEq(
+			OPERATORS_FNS({ oneOrMoreFn: fn(222) })["+"](input),
+			[1, 2, 222],
+		);
+		t.assertEq(arr, [{ args: [3], ret: 222 }]);
+	});
+
+	t.testing("question mark", () => {
+		arr.splice(0);
+		t.assertEq(
+			OPERATORS_FNS({ zeroOrOneFn: fn(333) })["?"](input),
+			[1, 2, 333],
+		);
+		t.assertEq(arr, [{ args: [3], ret: 333 }]);
+	});
+
+	t.testing("pipe", () => {
+		arr.splice(0);
+		t.assertEq(
+			OPERATORS_FNS({ unionFn: fn(444) })["|"](input),
+			[1, 444],
+		);
+		t.assertEq(arr, [{ args: [2, 3], ret: 444 }]);
+	});
+
+	t.testing("concat", () => {
+		arr.splice(0);
+		t.assertEq(
+			OPERATORS_FNS({ concatFn: fn(555) })["concat"](input),
+			[1, 555],
+		);
+		t.assertEq(arr, [{ args: [2, 3], ret: 555 }]);
+	});
+};
+
+const test_buildNFAStep = t => {
+	t.start("buildNFAStep()");
+
+	t.testing("literal char without existing nextID", () => {
+		const expected = [{
+			start:  1,
+			end:    2,
+			nextID: 3,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { t: 2 },
+				},
+				2: {
+					direct:      [],
+					transitions: {},
+				},
+
+			},
+		}];
+		t.assertEq(buildNFAStep([], "t"), expected);
+	});
+
+	t.testing("literal char with existing nextID", () => {
+		const input = baseNFA("a", 1);
+		const expected = [{
+			start:  1,
+			end:    2,
+			nextID: 3,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		}, {
+			start:  3,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		}];
+		t.assertEq(buildNFAStep([input], "b"), expected);
+		t.assertEq(
+			buildNFAStep([input], "b"),
+			[ baseNFA("a", 1), baseNFA("b", 3) ]
+		);
+	});
+
+	t.testing("operator", () => {
+		const input = [ baseNFA("a", 1), baseNFA("b", 3) ];
+		const expected = [{
+			start:  1,
+			end:    4,
+			nextID: 5,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 3 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		}];
+		t.assertEq(
+			buildNFAStep(input, { operator: "concat" }),
+			expected,
+		);
+	});
+};
+
+const test_buildNFA = t => {
+	t.start("buildNFA()");
+
+	t.testing("empty regex pattern", () => {
+		const expected = {
+			start:  1,
+			end:    2,
+			nextID: 3,
+			nodes: {
+				1: {
+					direct:      [ 2 ],
+					transitions: {},
+				},
+				2: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(buildNFA([]), expected);
+	});
+
+	t.testing("sample composed regex", () => {
+		const regex = "(a|b)*cde";
+		const expected = {
+			start:  7,
+			end:    14,
+			nextID: 15,
+			nodes: {
+				1: {
+					direct:      [],
+					transitions: { a: 2 },
+				},
+				2: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				3: {
+					direct:      [],
+					transitions: { b: 4 },
+				},
+				4: {
+					direct:      [ 6 ],
+					transitions: {},
+				},
+				5: {
+					direct:      [ 1, 3 ],
+					transitions: {},
+				},
+				6: {
+					direct:      [ 5, 8 ],
+					transitions: {},
+				},
+				7: {
+					direct:      [ 5, 8 ],
+					transitions: {},
+				},
+				8: {
+					direct:      [ 9 ],
+					transitions: {},
+				},
+				9: {
+					direct:      [],
+					transitions: { c: 10 },
+				},
+				10: {
+					direct:      [ 11 ],
+					transitions: {},
+				},
+				11: {
+					direct:      [],
+					transitions: { d: 12 },
+				},
+				12: {
+					direct:      [ 13 ],
+					transitions: {},
+				},
+				13: {
+					direct:      [],
+					transitions: { e: 14 },
+				},
+				14: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(
+			buildNFA(toPostfix(tokenizeRegex(explode(regex)))),
+			expected,
+		);
+	});
+};
+
+const test_allDirects = t => {
+	t.start("allDirects()");
+
+	t.testing("empty directs", () => {
+	});
+
+	t.testing("single level", () => {
+	});
+
+	t.testing("deep recursion", () => {
+	});
+
+	t.testing("plenty of duplicates", () => {
+	});
+};
+
+const test_searchNFAStep = t => {
+	t.start("searchNFAStep()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_searchNFAFn = t => {
+	t.start("searchNFAFn()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_searchNFA = t => {
+	t.start("searchNFA()");
+
+	t.testing("on composed sample regex", () => {
+		const regex = "(a|b)*c";
+		const nfa = buildNFA(toPostfix(tokenizeRegex(explode(regex))));
+		t.assertEq(searchNFA(nfa, "c"), true);
+		t.assertEq(searchNFA(nfa, "cc"), false);
+		t.assertEq(searchNFA(nfa, "aaabbbc"), true);
+	});
+};
+
+const test_nodeID = t => {
+	t.start("nodeID()");
+
+	t.testing("single number", () => {
+		t.assertEq(nodeID([1]), "1");
+	});
+
+	t.testing("plenty of duplicates", () => {
+		t.assertEq(nodeID([1, 1, 1, 1, 1, 1]), "1");
+	});
+
+	t.testing("unsorted input", () => {
+		t.assertEq(
+			nodeID([5, 5, 11, 11, 11, 3, 4, 5, 123]),
+			"3-4-5-11-123",
+		);
+	});
+};
+
+const test_unNodeID = t => {
+	t.start("unNodeID()");
+
+	t.testing("no hyphes", () => {
+		t.assertEq(unNodeID("123"), [123]);
+	});
+
+	t.testing("with hyphens", () => {
+		t.assertEq(unNodeID("1-2"), [1, 2]);
+		t.assertEq(unNodeID("11-3-0"), [11, 3, 0]);
+	});
+};
+
+const test_collectTransitions = t => {
+	t.start("collectTransitions()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_computePending = t => {
+	t.start("computePending()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_buildDFANodes = t => {
+	t.start("buildDFANodes()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_buildDFA = t => {
+	t.start("buildDFA()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_searchDFAStep = t => {
+	t.start("searchDFAStep()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_searchDFAFn = t => {
+	t.start("searchDFAFn()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_searchDFA = t => {
+	t.start("searchDFA()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_compileNFA = t => {
+	t.start("compileNFA()");
+
+	t.testing("empty regex", () => {
+		const expected = {
+			start:  1,
+			end:    2,
+			nextID: 3,
+			nodes: {
+				1: {
+					direct:      [ 2 ],
+					transitions: {},
+				},
+				2: {
+					direct:      [],
+					transitions: {},
+				},
+			},
+		};
+		t.assertEq(compileNFA(""), expected);
+	});
+
+	t.testing("composed sample regex", () => {
+		const nfa = compileNFA("(a|b)*c");
+		t.assertEq(searchNFA(nfa, "ababac"), true);
+		t.assertEq(searchNFA(nfa, "babac"), true);
+		t.assertEq(searchNFA(nfa, "babaca"), false);
+	});
+};
+
+const test_compileDFA = t => {
+	t.start("compileDFA()");
+
+	t.testing("FIXME", () => {});
+};
+
+const test_compile = t => {
+	t.start("compile()");
+
+	t.testing("composed sample regex", () => {
+		const matcher = compile("(a|b)*c");
+		t.assertEq(matcher("ababac"),  true);
+		t.assertEq(matcher("ababaca"), false);
+	});
+};
+
+
+
+runTests([
+	test_shouldConcat,
+	test_isOperator,
+	test_tokenizeRegexStep,
+	test_tokenizeRegexFn,
+	test_tokenizeRegex,
+	test_shouldPush,
+	test_toPostfixStep,
+	test_toPostfix,
+	test_emptyNFA,
+	test_baseNFA,
+	test_concat,
+	test_union,
+	test_zeroOrMore,
+	test_oneOrMore,
+	test_zeroOrOne,
+	test_OPERATORS_FNS,
+	test_buildNFAStep,
+	test_buildNFA,
+	test_allDirects,
+	test_searchNFAStep,
+	test_searchNFAFn,
+	test_searchNFA,
+	test_nodeID,
+	test_unNodeID,
+	test_collectTransitions,
+	test_computePending,
+	test_buildDFANodes,
+	test_buildDFA,
+	test_searchDFAStep,
+	test_searchDFAFn,
+	test_searchDFA,
+	test_compileNFA,
+	test_compileDFA,
+	test_compile,
+]);
author	EuAndreh <eu@euandre.org>	2025-07-01 13:45:29 -0300
committer	EuAndreh <eu@euandre.org>	2025-07-07 06:09:16 -0300
commit	6b626f2e0408e52f771e0895648b4b75060ec082 (patch)
tree	c2641eddadbaaf211208c50a4a9ee0172382f95b
parent	Initial empty commit (diff)
download	paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz