diff options
| author | EuAndreh <eu@euandre.org> | 2025-07-01 13:45:29 -0300 |
|---|---|---|
| committer | EuAndreh <eu@euandre.org> | 2025-07-07 06:09:16 -0300 |
| commit | 6b626f2e0408e52f771e0895648b4b75060ec082 (patch) | |
| tree | c2641eddadbaaf211208c50a4a9ee0172382f95b /src | |
| parent | Initial empty commit (diff) | |
| download | paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz | |
Implement v0 version of NFA and DFA; WIP tests
Diffstat (limited to 'src')
| -rw-r--r-- | src/paca.mjs | 436 |
1 file changed, 436 insertions, 0 deletions
import {
	butlast, explode, last, mapValues, max, reduce, reduced, repeat,
} from "sjs";



// Thrown on malformed regular expressions: bad trailing escapes and
// unbalanced parentheses.
export class SyntaxError extends Error {}

// Tokenizer states: ESCAPING means the previous character was "\" and the
// current character must be taken as a literal.
const ConcatStep = {
	ACCEPTING: "accepting",
	ESCAPING: "escaping",
};

// Operators that must NOT have an implicit concat emitted before them.
const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);

// Should an implicit "concat" operator be emitted between char and next?
// Not after "(" or "|" (they expect an operand on their right), and not
// before postfix/closing operators or "|".
const shouldConcat = (char, next) =>
	next !== undefined &&
	char !== "(" &&
	char !== "|" &&
	!nonConcatOperators.has(next);

const isOperator = char =>
	nonConcatOperators.has(char) || char === "(";

// Reducer step over the exploded regex: literal chars stay strings,
// operators become { operator } objects, and explicit { operator: "concat" }
// tokens are inserted so the shunting-yard pass only deals with operators.
const tokenizeRegexStep = chars => ({ out, state }, char, index) => {
	const next = chars[index + 1];
	const maybeConcat = shouldConcat(char, next)
		? [{operator: "concat"}]
		: [];

	if (state === ConcatStep.ESCAPING) {
		// An escaped character is always a literal, even when it
		// looks like an operator (e.g. "\(" or "\|"), so the concat
		// decision must skip shouldConcat()'s special cases for
		// `char` itself; previously "\\(a" emitted no concat and
		// matched only "a".
		const literalConcat =
			next !== undefined && !nonConcatOperators.has(next)
				? [{operator: "concat"}]
				: [];
		return {
			out: out.concat(char, literalConcat),
			state: ConcatStep.ACCEPTING,
		};
	}

	if (char === "\\") {
		if (next === undefined) {
			throw new SyntaxError(
				`bad trailing escape character: ${chars}`,
			);
		}

		return {
			out,
			state: ConcatStep.ESCAPING,
		};
	}

	const op = isOperator(char) ? { operator: char } : char;
	return {
		out: out.concat(op, maybeConcat),
		state,
	};
};

const tokenizeRegexFn = chars =>
	chars.reduce(tokenizeRegexStep(chars), {
		out: [],
		state: ConcatStep.ACCEPTING,
	});

// Exploded regex chars -> infix token array.
const tokenizeRegex = chars =>
	tokenizeRegexFn(chars).out;

// Operator binding strength for the shunting-yard conversion; "(" is
// matched explicitly below and never compared by precedence.
const PRECEDENCE = {
	"(": 4,
	"*": 3,
	"+": 3,
	"?": 3,
	"concat": 2,
	"|": 1,
};

// Push without popping when the stack is empty, the token opens a group,
// or the top of the stack is an open paren (which acts as a barrier).
const shouldPush = (stack, token) =>
	stack.length === 0 ||
	token.operator === "(" ||
	stack[0].operator === "(";

const toPostfixStep = ({ out, stack }, token, _index, tokens) => {
	if (typeof token === "string") {
		return {
			out: out.concat(token),
			stack,
		};
	}

	// ")" must be handled BEFORE shouldPush(): when the top of the
	// stack is "(", shouldPush() is true and previously pushed the
	// ")" instead of closing the group, so "(a)" leaked a stray ")"
	// token into the postfix output and crashed buildNFA.
	if (token.operator === ")") {
		const openParenIndex = stack.findIndex(o => o.operator === "(");
		if (openParenIndex === -1) {
			throw new SyntaxError(
				`close parens without opening: ${tokens}`,
			);
		}

		const ops = stack.slice(0, openParenIndex);
		const leftover = stack.slice(openParenIndex + 1);
		return {
			out: out.concat(ops),
			stack: leftover,
		};
	}

	if (shouldPush(stack, token)) {
		return {
			out,
			stack: [token].concat(stack),
		};
	}

	// Pop every stacked operator of greater-or-equal precedence,
	// stopping at an open paren.
	const idx = stack.findIndex(el =>
		el.operator === "(" ||
		PRECEDENCE[el.operator] < PRECEDENCE[token.operator],
	);

	if (idx < 0) {
		return {
			out: out.concat(stack),
			stack: [token],
		};
	} else {
		const ops = stack.slice(0, idx);
		const leftover = stack.slice(idx);
		return {
			out: out.concat(ops),
			stack: [token].concat(leftover),
		};
	}
};

// Shunting-yard: infix token array -> postfix (RPN) token array.
const toPostfix = tokens => {
	const { out, stack } = tokens.reduce(toPostfixStep, {
		out: [],
		stack: [],
	});
	// A leftover "(" means an unclosed group; previously it was
	// silently flushed to the output and crashed later in buildNFA.
	if (stack.some(o => o.operator === "(")) {
		throw new SyntaxError(
			`open parens without closing: ${tokens}`,
		);
	}
	return out.concat(stack);
};

// NFA node shape: { direct: [ids…] (ε-transitions),
//                   transitions: { char: id } }.
// NFA for the empty regex: start ε-connects straight to end.
const emptyNFA = () => {
	const startID = 1;
	const endID = 2;
	const nextID = 3;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			[startID]: {
				direct: [ endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		}
	};
};

// Two-node NFA matching exactly one literal character `edge`, allocating
// node IDs starting at `id`.
const baseNFA = (edge, id) => {
	const startID = id + 0;
	const endID = id + 1;
	const nextID = id + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			[startID]: {
				direct: [],
				transitions: {
					[edge]: endID,
				},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Thompson construction for "lhs then rhs": ε-edge from lhs's end into
// rhs's start.
const concat = (lhs, rhs) => ({
	start: lhs.start,
	end: rhs.end,
	nextID: max(lhs.nextID, rhs.nextID),
	nodes: {
		...lhs.nodes,
		...rhs.nodes,
		[lhs.end]: {
			...lhs.nodes[lhs.end],
			direct: lhs.nodes[lhs.end].direct.concat([
				rhs.start,
			]),
		}
	},
});

// Thompson construction for "lhs | rhs": a fresh start ε-branches into
// both operands, and both operand ends ε-join into a fresh end.
const union = (lhs, rhs) => {
	const startID = max(lhs.nextID, rhs.nextID);
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...lhs.nodes,
			...rhs.nodes,
			[lhs.end]: {
				...lhs.nodes[lhs.end],
				direct: lhs.nodes[lhs.end].direct.concat([
					endID,
				]),
			},
			[rhs.end]: {
				...rhs.nodes[rhs.end],
				// FIX: extend rhs's own ε-edges; this was a
				// copy-paste of the [lhs.end] entry reading
				// lhs.nodes[lhs.end].direct.
				direct: rhs.nodes[rhs.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [lhs.start, rhs.start],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Thompson construction for "nfa*": fresh start/end, with ε-edges that
// allow skipping the operand entirely or looping back through it.
const zeroOrMore = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...nfa.nodes[nfa.end],
				direct: nfa.nodes[nfa.end].direct.concat([
					nfa.start,
					endID,
				]),
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// "nfa+" == "nfa" followed by "nfa*" (sharing the operand's nodes).
const oneOrMore = nfa =>
	concat(nfa, zeroOrMore(nfa));

// Thompson construction for "nfa?": like zeroOrMore but without the
// loop-back ε-edge from the operand's end to its start.
const zeroOrOne = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...nfa.nodes[nfa.end],
				direct: nfa.nodes[nfa.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Operator -> stack transformer table, with injectable constructors
// (presumably for the WIP tests — TODO confirm).
const OPERATORS_FNS = ({
	zeroOrMoreFn = zeroOrMore,
	oneOrMoreFn = oneOrMore,
	zeroOrOneFn = zeroOrOne,
	unionFn = union,
	concatFn = concat,
} = {}) => ({
	"*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))),
	"+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))),
	"?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))),
	"|": stack => butlast(butlast(stack)).concat(unionFn(
		last(butlast(stack)),
		last(stack),
	)),
	"concat": stack => butlast(butlast(stack)).concat(concatFn(
		last(butlast(stack)),
		last(stack),
	)),
});

const OPERATORS = OPERATORS_FNS();

// Evaluate one postfix token against the NFA stack: literals push a
// baseNFA (IDs continue from the newest stack entry), operators combine.
const buildNFAStep = (stack, token) =>
	typeof token === "string"
		? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
		: OPERATORS[token.operator](stack);

// Postfix token array -> NFA.
const buildNFA = tokens =>
	tokens.length === 0
		? emptyNFA()
		: last(tokens.reduce(buildNFAStep, []));

// ε-closure of stateID, keeping only states with no further ε-edges.
// `visited` guards against ε-cycles (e.g. "(a*)*", where zeroOrMore's
// loop-back edges form a cycle of ε-only nodes), which previously
// recursed without bound.  A revisited state contributes [] — its
// reachable terminals were already emitted on the first visit and
// propagate up through the shared call tree.
const allDirects = (stateID, nodes, visited = new Set()) => {
	if (visited.has(stateID)) {
		return [];
	}
	visited.add(stateID);
	if (nodes[stateID].direct.length === 0) {
		return [ stateID ];
	}
	// Numeric comparator: plain toSorted() ordered the numeric IDs
	// lexicographically ([1, 10, 2]).
	return [...new Set(
		nodes[stateID].direct.map(
			id => allDirects(id, nodes, visited),
		).flat(),
	)].toSorted((a, b) => a - b);
};

// One character of NFA simulation: follow `char` out of every current
// state, then take the ε-closure of each target.
const searchNFAStep = nodes => (directStates, char) =>
	[...new Set(
		directStates
			.map(id => nodes[id].transitions[char])
			.filter(x => !!x)
			.map(id => allDirects(id, nodes))
			.flat()
	)];

const searchNFAFn = (nfa, string) =>
	explode(string).reduce(
		searchNFAStep(nfa.nodes),
		allDirects(nfa.start, nfa.nodes),
	);

// Does the NFA accept `string`?
const searchNFA = (nfa, string) =>
	searchNFAFn(nfa, string).includes(nfa.end);

// Canonical DFA state name for a set of NFA state IDs, e.g. "1-4-7".
const nodeID = states =>
	[...new Set(states)]
		.toSorted((a, b) => a - b)
		.map(x => x.toString())
		.join("-");

const unNodeID = stateID =>
	stateID.split("-").map(Number);

// Merge the character transitions of a set of NFA states into
// { char: [closure state IDs…] }.  baseNFA nodes carry at most one
// transition key, hence the sanity check.
const collectTransitions = (nfaNodes, states) => {
	const transitions = states.map(
		id => Object.entries(nfaNodes[id].transitions),
	).filter(t => t.length !== 0);
	if (!transitions.every(t => t.length === 1)) {
		throw new Error("unexpected transitions with more than 1 key");
	}
	const groupedTransitions = Object.groupBy(
		transitions.map(t => t[0]),
		([key, stateID]) => key,
	);
	return mapValues(
		groupedTransitions,
		entry =>
			entry.map(([_, id]) =>
				allDirects(id, nfaNodes)).flat(),
	);
};

// Drop the state set just processed and enqueue the newly discovered
// ones, deduplicated by canonical name.
const computePending = (pending, transitions) =>
	[...new Set(
		pending.slice(1).concat(transitions)
			.map(nodeID))
	].map(unNodeID);

// Subset construction: recursively turn pending NFA-state sets into DFA
// nodes keyed by their canonical name.
const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => {
	if (pending.length === 0) {
		return dfaNodes;
	}

	const states = pending[0];
	const stateID = nodeID(states);
	if (!!dfaNodes[stateID]) {
		return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes);
	}

	const transitions = collectTransitions(nfaNodes, states);
	const newPending = computePending(pending, Object.values(transitions));
	return buildDFANodes(end, nfaNodes, newPending, {
		...dfaNodes,
		[stateID]: {
			end: states.includes(end),
			transitions: mapValues(transitions, nodeID),
		},
	});
};

// NFA -> DFA via subset construction, starting from the ε-closure of
// the NFA's start state.
const buildDFA = nfa => {
	const start = allDirects(nfa.start, nfa.nodes);
	return {
		start: nodeID(start),
		nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}),
	};
};

// One character of DFA simulation; a missing transition short-circuits
// the fold with a non-state sentinel.
const searchDFAStep = nodes => (state, char) =>
	nodes[state].transitions[char] || reduced(false);

const searchDFAFn = (dfa, string) =>
	reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start);

// Does the DFA accept `string`?  A reduced(false) result indexes to
// undefined, so ?.end collapses to false.
const searchDFA = (dfa, string) =>
	!!dfa.nodes[searchDFAFn(dfa, string)]?.end;

const compileNFA = regex =>
	buildNFA(toPostfix(tokenizeRegex(explode(regex))));

const compileDFA = regex =>
	buildDFA(compileNFA(regex));

// Compile `regex` down to a DFA once and return a matcher closure.
export const compile = regex => {
	const dfa = compileDFA(regex);
	return string => searchDFA(dfa, string);
};
