summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorEuAndreh <eu@euandre.org>2025-07-01 13:45:29 -0300
committerEuAndreh <eu@euandre.org>2025-07-07 06:09:16 -0300
commit6b626f2e0408e52f771e0895648b4b75060ec082 (patch)
treec2641eddadbaaf211208c50a4a9ee0172382f95b /src
parentInitial empty commit (diff)
downloadpaca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz
paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz
Implement v0 version of NFA and DFA; WIP tests
Diffstat (limited to 'src')
-rw-r--r--src/paca.mjs436
1 files changed, 436 insertions, 0 deletions
diff --git a/src/paca.mjs b/src/paca.mjs
new file mode 100644
index 0000000..a0e607b
--- /dev/null
+++ b/src/paca.mjs
@@ -0,0 +1,436 @@
+import {
+ butlast, explode, last, mapValues, max, reduce, reduced, repeat,
+} from "sjs";
+
+
+
// Error thrown on malformed patterns (trailing escape, unbalanced
// parentheses).
// NOTE(review): this shadows the global SyntaxError built-in inside
// this module; consider renaming (e.g. RegexSyntaxError) — confirm no
// caller relies on the exported name.
export class SyntaxError extends Error {}
+
// Tokenizer states: ESCAPING means the previous character was a
// backslash and the current character must be emitted as a literal.
const ConcatStep = {
	ACCEPTING: "accepting",
	ESCAPING: "escaping",
};

// Operator characters that cannot START a new atom; no implicit
// concatenation token is inserted before any of these.
const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);
+
// Whether an implicit concat token must be inserted after `char`:
// something must follow, `char` must be able to END an atom (so not
// "(" or "|"), and `next` must be able to START one (so not a
// postfix/closing operator).
const shouldConcat = (char, next) => {
	if (next === undefined) {
		return false;
	}
	if (char === "(" || char === "|") {
		return false;
	}
	return !nonConcatOperators.has(next);
};
+
// Whether `char` is a regex operator token (postfix operator,
// alternation, or parenthesis) rather than a literal character.
// Fixed: use strict equality (`===`) instead of loose `==`.
const isOperator = char =>
	nonConcatOperators.has(char) || char === "(";
+
// Reducer step for tokenizing a regex character array into a token
// stream: literal characters pass through as strings, operators become
// `{operator}` objects, and explicit `{operator: "concat"}` tokens are
// inserted between adjacent concatenated atoms.  A backslash escapes
// the following character into a literal.
//
// Throws SyntaxError on a trailing backslash.
//
// Fixed: while ESCAPING, `char` is a literal — the old code reused
// shouldConcat(char, next), which wrongly suppressed the concat token
// when the escaped literal happened to be "(" or "|" (e.g. "\\(a").
const tokenizeRegexStep = chars => ({ out, state }, char, index) => {
	const next = chars[index + 1];

	if (state === ConcatStep.ESCAPING) {
		// Concatenation after an escaped literal depends only on
		// whether the NEXT character can start a new atom.
		const literalConcat =
			next !== undefined && !nonConcatOperators.has(next)
				? [{operator: "concat"}]
				: [];
		return {
			out: out.concat(char, literalConcat),
			state: ConcatStep.ACCEPTING,
		};
	}

	const maybeConcat = shouldConcat(char, next)
		? [{operator: "concat"}]
		: [];

	if (char === "\\") {
		if (next === undefined) {
			throw new SyntaxError(
				`bad trailing escape character: ${chars}`,
			);
		}

		// Emit nothing now; the next character is the literal.
		return {
			out,
			state: ConcatStep.ESCAPING,
		};
	}

	const op = isOperator(char) ? { operator: char } : char;
	return {
		out: out.concat(op, maybeConcat),
		state,
	};
};
+
// Folds the character array through the tokenizer, starting in the
// ACCEPTING state with an empty output, and returns the final
// { out, state } accumulator.
const tokenizeRegexFn = chars => {
	const initial = {
		out: [],
		state: ConcatStep.ACCEPTING,
	};
	return chars.reduce(tokenizeRegexStep(chars), initial);
};
+
// Tokenizes a regex character array, keeping only the token stream and
// discarding the final tokenizer state.
const tokenizeRegex = chars => {
	const { out } = tokenizeRegexFn(chars);
	return out;
};
+
// Operator binding strength for the shunting-yard conversion; higher
// binds tighter.  "(" is given the highest value but is treated
// specially: it is never popped by precedence comparison, only by a
// matching ")".
const PRECEDENCE = {
	"(": 4,
	"*": 3,
	"+": 3,
	"?": 3,
	"concat": 2,
	"|": 1,
};
+
// Whether `token` should be pushed directly onto the operator stack.
//
// Fixed: a close parenthesis is NEVER pushed — it must either pop back
// to its matching "(" or be reported as unbalanced by toPostfixStep.
// The old code pushed ")" whenever the stack was empty or "(" was on
// top, leaking ")"/"(" operators into the postfix output for inputs
// like "(a)".  Everything else is pushed when the stack is empty, the
// token is "(", or "(" is on top (operators never pop past a "(").
const shouldPush = (stack, token) =>
	token.operator !== ")" && (
		stack.length === 0 ||
		token.operator === "(" ||
		stack[0].operator === "("
	);
+
// One shunting-yard step converting infix tokens to postfix.
// `out` is the postfix output so far; `stack` is the operator stack
// with its TOP at index 0.  `tokens` is only used for error messages.
const toPostfixStep = ({ out, stack }, token, _index, tokens) => {
	// Literal characters go straight to the output.
	if (typeof token === "string") {
		return {
			out: out.concat(token),
			stack,
		};
	}

	// Unconditional pushes: empty stack, "(", or "(" on top.
	if (shouldPush(stack, token)) {
		return {
			out,
			stack: [token].concat(stack),
		};
	}

	// ")" pops every operator down to the matching "(", which is
	// discarded; neither paren reaches the output.
	if (token.operator === ")") {
		const openParenIndex = stack.findIndex(o => o.operator === "(");
		if (openParenIndex === -1) {
			throw new SyntaxError(
				`close parens without opening: ${tokens}`,
			);
		}

		const ops = stack.slice(0, openParenIndex);
		const leftover = stack.slice(openParenIndex + 1);
		return {
			out: out.concat(ops),
			stack: leftover,
		};
	}

	// Pop operators of greater-or-equal precedence (left-assoc),
	// stopping at "(" or at the first lower-precedence operator.
	const idx = stack.findIndex(el =>
		el.operator === "(" ||
		PRECEDENCE[el.operator] < PRECEDENCE[token.operator],
	);

	if (idx < 0) {
		// Everything on the stack binds at least as tightly:
		// flush it all, then push the new operator.
		return {
			out: out.concat(stack),
			stack: [token],
		};
	} else {
		const ops = stack.slice(0, idx);
		const leftover = stack.slice(idx);
		return {
			out: out.concat(ops),
			stack: [token].concat(leftover),
		};
	}
};
+
// Converts a tokenized infix regex to postfix (reverse Polish) order
// via the shunting-yard algorithm, flushing any operators left on the
// stack at the end.
//
// Fixed: an unmatched "(" used to be silently concatenated into the
// output (e.g. for "(a"), crashing later during NFA construction; it
// now raises SyntaxError, mirroring the unmatched-")" case.
const toPostfix = tokens => {
	const { out, stack } = tokens.reduce(toPostfixStep, {
		out: [],
		stack: [],
	});
	if (stack.some(op => op.operator === "(")) {
		throw new SyntaxError(
			`open parens without closing: ${tokens}`,
		);
	}
	return out.concat(stack);
};
+
// NFA accepting only the empty string: the start state (1) has a
// single direct (epsilon) edge into the accepting state (2), and the
// next free node ID is 3.
const emptyNFA = () => ({
	start: 1,
	end: 2,
	nextID: 3,
	nodes: {
		1: {
			direct: [ 2 ],
			transitions: {},
		},
		2: {
			direct: [],
			transitions: {},
		},
	},
});
+
// Two-state NFA matching exactly the single character `edge`, using
// the consecutive node IDs `id` (start) and `id + 1` (accepting); the
// next free ID is `id + 2`.
const baseNFA = (edge, id) => ({
	start: id,
	end: id + 1,
	nextID: id + 2,
	nodes: {
		[id]: {
			direct: [],
			transitions: { [edge]: id + 1 },
		},
		[id + 1]: {
			direct: [],
			transitions: {},
		},
	},
});
+
// Sequences two NFAs: lhs's accepting state gains a direct (epsilon)
// edge into rhs's start, so the combined machine runs lhs then rhs.
// Node tables are merged; the next free ID is the larger of the two.
const concat = (lhs, rhs) => {
	const lhsEnd = lhs.nodes[lhs.end];
	return {
		start: lhs.start,
		end: rhs.end,
		nextID: max(lhs.nextID, rhs.nextID),
		nodes: {
			...lhs.nodes,
			...rhs.nodes,
			[lhs.end]: {
				...lhsEnd,
				direct: [...lhsEnd.direct, rhs.start],
			},
		},
	};
};
+
// Alternation of two NFAs: a fresh start state epsilon-branches into
// both operands, and both accepting states gain an epsilon edge into a
// fresh shared accepting state.
//
// Fixed: the rhs.end entry previously copied lhs's direct list
// (`lhs.nodes[lhs.end].direct`) instead of rhs's — a copy-paste bug
// that corrupted rhs's accepting state whenever lhs.end already had
// direct edges.
const union = (lhs, rhs) => {
	// Fresh IDs start past both operands' allocations.
	const startID = max(lhs.nextID, rhs.nextID);
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...lhs.nodes,
			...rhs.nodes,
			[lhs.end]: {
				...lhs.nodes[lhs.end],
				direct: lhs.nodes[lhs.end].direct.concat([
					endID,
				]),
			},
			[rhs.end]: {
				...rhs.nodes[rhs.end],
				direct: rhs.nodes[rhs.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [lhs.start, rhs.start],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// Kleene star: wraps `nfa` with a fresh start and accepting state.
// The fresh start can skip the operand entirely or enter it; the
// operand's old accepting state can loop back to its start or exit to
// the fresh accepting state.
const zeroOrMore = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const oldEnd = nfa.nodes[nfa.end];
	return {
		start: startID,
		end: endID,
		nextID: startID + 2,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...oldEnd,
				direct: [...oldEnd.direct, nfa.start, endID],
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// "+" expressed as "one occurrence, then zero or more": x+ == x x*.
const oneOrMore = nfa =>
	concat(nfa, zeroOrMore(nfa));
+
// Optional match ("?"): like zeroOrMore but WITHOUT the loop edge —
// the fresh start may skip the operand or run it exactly once, and the
// operand's accepting state exits to the fresh accepting state.
const zeroOrOne = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const oldEnd = nfa.nodes[nfa.end];
	return {
		start: startID,
		end: endID,
		nextID: startID + 2,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...oldEnd,
				direct: [...oldEnd.direct, endID],
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// Stack-rewriting functions for each postfix operator, used while
// folding a postfix token stream into an NFA.  Unary operators replace
// the top NFA on the stack; binary ones replace the top two (the
// deeper element is the LEFT operand).  The combinators are injectable
// parameters to ease testing.
// NOTE(review): butlast/last come from "sjs"; assumed to mean "all but
// the final element" / "the final element" — confirm.
const OPERATORS_FNS = ({
	zeroOrMoreFn = zeroOrMore,
	oneOrMoreFn = oneOrMore,
	zeroOrOneFn = zeroOrOne,
	unionFn = union,
	concatFn = concat,
} = {}) => ({
	"*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))),
	"+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))),
	"?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))),
	"|": stack => butlast(butlast(stack)).concat(unionFn(
		last(butlast(stack)),
		last(stack),
	)),
	"concat": stack => butlast(butlast(stack)).concat(concatFn(
		last(butlast(stack)),
		last(stack),
	)),
});

// Default operator table consumed by buildNFAStep.
const OPERATORS = OPERATORS_FNS();
+
// One step of postfix NFA evaluation: a literal pushes a fresh
// two-state NFA (allocating IDs from the previous top's nextID, or
// starting at 1 when the stack is empty — nextID is never 0, so `||`
// is safe here); an operator token rewrites the stack via OPERATORS.
const buildNFAStep = (stack, token) =>
	typeof token === "string"
		? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
		: OPERATORS[token.operator](stack);
+
// Evaluates a postfix token stream into a single NFA.  An empty
// pattern yields the empty-string NFA; otherwise the final stack top
// is the fully assembled machine.
const buildNFA = tokens => {
	if (tokens.length === 0) {
		return emptyNFA();
	}
	return last(tokens.reduce(buildNFAStep, []));
};
+
+const allDirects = (stateID, nodes) =>
+ nodes[stateID].direct.length === 0
+ ? [ stateID ]
+ : [...new Set(
+ nodes[stateID].direct.map(
+ id => allDirects(id, nodes),
+ ).flat(),
+ )].toSorted();
+
// One NFA simulation step: from the current set of states, follow the
// `char` transition where one exists and expand each target to its
// epsilon-closure, deduplicating the result.
const searchNFAStep = nodes => (directStates, char) => {
	const reached = directStates.flatMap(id => {
		const target = nodes[id].transitions[char];
		return target ? allDirects(target, nodes) : [];
	});
	return [...new Set(reached)];
};
+
// Runs `string` through the NFA, returning the set of states reachable
// after consuming every character, starting from the epsilon-closure
// of the start state.
// NOTE(review): `explode` comes from "sjs"; assumed to split a string
// into an array of characters — confirm.
const searchNFAFn = (nfa, string) =>
	explode(string).reduce(
		searchNFAStep(nfa.nodes),
		allDirects(nfa.start, nfa.nodes),
	);

// Whether the NFA accepts `string`: the accepting state must be among
// the states reachable after all input is consumed.
const searchNFA = (nfa, string) =>
	searchNFAFn(nfa, string).includes(nfa.end);
+
// Canonical DFA state name for a set of NFA state IDs: deduplicated,
// numerically sorted, dash-joined (e.g. [3, 1, 3] -> "1-3").
const nodeID = states => {
	const unique = [...new Set(states)];
	unique.sort((a, b) => a - b);
	return unique.map(String).join("-");
};
+
// Inverse of nodeID: parses "1-2-10" back into [1, 2, 10].
const unNodeID = stateID => {
	const parts = stateID.split("-");
	return parts.map(part => Number(part));
};
+
// For a set of NFA states (a prospective DFA state), gathers all
// outgoing character transitions, mapping each character to the merged
// epsilon-closures of its target states.  By construction every NFA
// node carries at most one transition key; the throw guards that
// invariant.
// NOTE(review): `mapValues` comes from "sjs"; assumed to map a
// function over an object's values keeping keys — confirm.
const collectTransitions = (nfaNodes, states) => {
	const transitions = states.map(
		id => Object.entries(nfaNodes[id].transitions),
	).filter(t => t.length !== 0);
	if (!transitions.every(t => t.length === 1)) {
		throw new Error("unexpected transitions with more than 1 key");
	}
	// Each element is a single [char, targetID] entry; group by char.
	const groupedTransitions = Object.groupBy(
		transitions.map(t => t[0]),
		([key, stateID]) => key,
	);
	return mapValues(
		groupedTransitions,
		entry =>
			entry.map(([_, id]) =>
				allDirects(id, nfaNodes)).flat(),
	);
};
+
+const computePending = (pending, transitions) =>
+ [...new Set(
+ pending.slice(1).concat(transitions)
+ .map(nodeID))
+ ].map(unNodeID);
+
// Subset-construction worklist: materializes the DFA node table by
// processing pending NFA-state sets one at a time until the queue is
// empty.  `end` is the NFA accepting state; a DFA node is accepting
// when its set contains it.  Recursion depth grows with the number of
// distinct DFA states discovered.
const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => {
	if (pending.length === 0) {
		return dfaNodes;
	}

	const states = pending[0];
	const stateID = nodeID(states);
	// Already materialized: just drop it from the queue.
	if (!!dfaNodes[stateID]) {
		return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes);
	}

	// New DFA node: record its per-character transitions (as
	// canonical state names) and enqueue the sets they reach.
	const transitions = collectTransitions(nfaNodes, states);
	const newPending = computePending(pending, Object.values(transitions));
	return buildDFANodes(end, nfaNodes, newPending, {
		...dfaNodes,
		[stateID]: {
			end: states.includes(end),
			transitions: mapValues(transitions, nodeID),
		},
	});
};
+
// Converts an NFA to a DFA via subset construction, seeding the
// worklist with the epsilon-closure of the NFA start state.
const buildDFA = nfa => {
	const start = allDirects(nfa.start, nfa.nodes);
	return {
		start: nodeID(start),
		nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}),
	};
};
+
// One DFA step: follow the transition for `char`, or abort the fold
// with `false` when there is none.
// NOTE(review): `reduce`/`reduced` come from "sjs"; assumed to
// implement a short-circuiting fold a la Clojure — confirm.
const searchDFAStep = nodes => (state, char) =>
	nodes[state].transitions[char] || reduced(false);

// Folds the string through the DFA, yielding the final state name (or
// false when some character had no transition).
const searchDFAFn = (dfa, string) =>
	reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start);

// Whether the DFA accepts `string`: the final state must exist and be
// accepting (`false` indexes to undefined, hence the ?. guard).
const searchDFA = (dfa, string) =>
	!!dfa.nodes[searchDFAFn(dfa, string)]?.end
+
// Pipeline: regex string -> characters -> tokens -> postfix -> NFA.
const compileNFA = regex =>
	buildNFA(toPostfix(tokenizeRegex(explode(regex))));

// Pipeline: regex string -> NFA -> DFA.
const compileDFA = regex =>
	buildDFA(compileNFA(regex));
+
// Compiles `regex` once into a DFA and returns a matcher predicate
// over strings.
export const compile = regex => {
	const dfa = compileDFA(regex);
	return string => searchDFA(dfa, string);
};