diff options
| author | EuAndreh <eu@euandre.org> | 2025-07-01 13:45:29 -0300 |
|---|---|---|
| committer | EuAndreh <eu@euandre.org> | 2025-07-07 06:09:16 -0300 |
| commit | 6b626f2e0408e52f771e0895648b4b75060ec082 (patch) | |
| tree | c2641eddadbaaf211208c50a4a9ee0172382f95b /src | |
| parent | Initial empty commit (diff) | |
| download | paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz | |
Implement v0 version of NFA and DFA; WIP tests
Diffstat (limited to 'src')
| -rw-r--r-- | src/paca.mjs | 436 |
1 file changed, 436 insertions, 0 deletions
import {
	butlast, explode, last, mapValues, max, reduce, reduced, repeat,
} from "sjs";



// Thrown on malformed regular expressions: bad trailing escapes and
// unbalanced parentheses.
export class SyntaxError extends Error {}

// Tokenizer states: ESCAPING means the previous character was "\" and the
// current character must be taken as a literal.
const ConcatStep = {
	ACCEPTING: "accepting",
	ESCAPING: "escaping",
};

// Operators that must NOT have an implicit concat emitted before them.
const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);

// Should an implicit "concat" operator be emitted between char and next?
// Not after "(" or "|" (they expect an operand on their right), and not
// before postfix/closing operators or "|".
const shouldConcat = (char, next) =>
	next !== undefined &&
	char !== "(" &&
	char !== "|" &&
	!nonConcatOperators.has(next);

const isOperator = char =>
	nonConcatOperators.has(char) || char === "(";

// Reducer step over the exploded regex: literal chars stay strings,
// operators become { operator } objects, and explicit { operator: "concat" }
// tokens are inserted so the shunting-yard pass only deals with operators.
const tokenizeRegexStep = chars => ({ out, state }, char, index) => {
	const next = chars[index + 1];
	const maybeConcat = shouldConcat(char, next)
		? [{operator: "concat"}]
		: [];

	if (state === ConcatStep.ESCAPING) {
		// An escaped character is always a literal, even when it
		// looks like an operator (e.g. "\(" or "\|"), so the concat
		// decision must skip shouldConcat()'s special cases for
		// `char` itself; previously "\\(a" emitted no concat and
		// matched only "a".
		const literalConcat =
			next !== undefined && !nonConcatOperators.has(next)
				? [{operator: "concat"}]
				: [];
		return {
			out: out.concat(char, literalConcat),
			state: ConcatStep.ACCEPTING,
		};
	}

	if (char === "\\") {
		if (next === undefined) {
			throw new SyntaxError(
				`bad trailing escape character: ${chars}`,
			);
		}

		return {
			out,
			state: ConcatStep.ESCAPING,
		};
	}

	const op = isOperator(char) ? { operator: char } : char;
	return {
		out: out.concat(op, maybeConcat),
		state,
	};
};

const tokenizeRegexFn = chars =>
	chars.reduce(tokenizeRegexStep(chars), {
		out: [],
		state: ConcatStep.ACCEPTING,
	});

// Exploded regex chars -> infix token array.
const tokenizeRegex = chars =>
	tokenizeRegexFn(chars).out;

// Operator binding strength for the shunting-yard conversion; "(" is
// matched explicitly below and never compared by precedence.
const PRECEDENCE = {
	"(": 4,
	"*": 3,
	"+": 3,
	"?": 3,
	"concat": 2,
	"|": 1,
};

// Push without popping when the stack is empty, the token opens a group,
// or the top of the stack is an open paren (which acts as a barrier).
const shouldPush = (stack, token) =>
	stack.length === 0 ||
	token.operator === "(" ||
	stack[0].operator === "(";

const toPostfixStep = ({ out, stack }, token, _index, tokens) => {
	if (typeof token === "string") {
		return {
			out: out.concat(token),
			stack,
		};
	}

	// ")" must be handled BEFORE shouldPush(): when the top of the
	// stack is "(", shouldPush() is true and previously pushed the
	// ")" instead of closing the group, so "(a)" leaked a stray ")"
	// token into the postfix output and crashed buildNFA.
	if (token.operator === ")") {
		const openParenIndex = stack.findIndex(o => o.operator === "(");
		if (openParenIndex === -1) {
			throw new SyntaxError(
				`close parens without opening: ${tokens}`,
			);
		}

		const ops = stack.slice(0, openParenIndex);
		const leftover = stack.slice(openParenIndex + 1);
		return {
			out: out.concat(ops),
			stack: leftover,
		};
	}

	if (shouldPush(stack, token)) {
		return {
			out,
			stack: [token].concat(stack),
		};
	}

	// Pop every stacked operator of greater-or-equal precedence,
	// stopping at an open paren.
	const idx = stack.findIndex(el =>
		el.operator === "(" ||
		PRECEDENCE[el.operator] < PRECEDENCE[token.operator],
	);

	if (idx < 0) {
		return {
			out: out.concat(stack),
			stack: [token],
		};
	} else {
		const ops = stack.slice(0, idx);
		const leftover = stack.slice(idx);
		return {
			out: out.concat(ops),
			stack: [token].concat(leftover),
		};
	}
};

// Shunting-yard: infix token array -> postfix (RPN) token array.
const toPostfix = tokens => {
	const { out, stack } = tokens.reduce(toPostfixStep, {
		out: [],
		stack: [],
	});
	// A leftover "(" means an unclosed group; previously it was
	// silently flushed to the output and crashed later in buildNFA.
	if (stack.some(o => o.operator === "(")) {
		throw new SyntaxError(
			`open parens without closing: ${tokens}`,
		);
	}
	return out.concat(stack);
};

// NFA node shape: { direct: [ids…] (ε-transitions),
//                   transitions: { char: id } }.
// NFA for the empty regex: start ε-connects straight to end.
const emptyNFA = () => {
	const startID = 1;
	const endID = 2;
	const nextID = 3;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			[startID]: {
				direct: [ endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		}
	};
};

// Two-node NFA matching exactly one literal character `edge`, allocating
// node IDs starting at `id`.
const baseNFA = (edge, id) => {
	const startID = id + 0;
	const endID = id + 1;
	const nextID = id + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			[startID]: {
				direct: [],
				transitions: {
					[edge]: endID,
				},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Thompson construction for "lhs then rhs": ε-edge from lhs's end into
// rhs's start.
const concat = (lhs, rhs) => ({
	start: lhs.start,
	end: rhs.end,
	nextID: max(lhs.nextID, rhs.nextID),
	nodes: {
		...lhs.nodes,
		...rhs.nodes,
		[lhs.end]: {
			...lhs.nodes[lhs.end],
			direct: lhs.nodes[lhs.end].direct.concat([
				rhs.start,
			]),
		}
	},
});

// Thompson construction for "lhs | rhs": a fresh start ε-branches into
// both operands, and both operand ends ε-join into a fresh end.
const union = (lhs, rhs) => {
	const startID = max(lhs.nextID, rhs.nextID);
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...lhs.nodes,
			...rhs.nodes,
			[lhs.end]: {
				...lhs.nodes[lhs.end],
				direct: lhs.nodes[lhs.end].direct.concat([
					endID,
				]),
			},
			[rhs.end]: {
				...rhs.nodes[rhs.end],
				// FIX: extend rhs's own ε-edges; this was a
				// copy-paste of the [lhs.end] entry reading
				// lhs.nodes[lhs.end].direct.
				direct: rhs.nodes[rhs.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [lhs.start, rhs.start],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Thompson construction for "nfa*": fresh start/end, with ε-edges that
// allow skipping the operand entirely or looping back through it.
const zeroOrMore = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...nfa.nodes[nfa.end],
				direct: nfa.nodes[nfa.end].direct.concat([
					nfa.start,
					endID,
				]),
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// "nfa+" == "nfa" followed by "nfa*" (sharing the operand's nodes).
const oneOrMore = nfa =>
	concat(nfa, zeroOrMore(nfa));

// Thompson construction for "nfa?": like zeroOrMore but without the
// loop-back ε-edge from the operand's end to its start.
const zeroOrOne = nfa => {
	const startID = nfa.nextID;
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...nfa.nodes[nfa.end],
				direct: nfa.nodes[nfa.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};

// Operator -> stack transformer table, with injectable constructors
// (presumably for the WIP tests — TODO confirm).
const OPERATORS_FNS = ({
	zeroOrMoreFn = zeroOrMore,
	oneOrMoreFn = oneOrMore,
	zeroOrOneFn = zeroOrOne,
	unionFn = union,
	concatFn = concat,
} = {}) => ({
	"*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))),
	"+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))),
	"?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))),
	"|": stack => butlast(butlast(stack)).concat(unionFn(
		last(butlast(stack)),
		last(stack),
	)),
	"concat": stack => butlast(butlast(stack)).concat(concatFn(
		last(butlast(stack)),
		last(stack),
	)),
});

const OPERATORS = OPERATORS_FNS();

// Evaluate one postfix token against the NFA stack: literals push a
// baseNFA (IDs continue from the newest stack entry), operators combine.
const buildNFAStep = (stack, token) =>
	typeof token === "string"
		? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
		: OPERATORS[token.operator](stack);

// Postfix token array -> NFA.
const buildNFA = tokens =>
	tokens.length === 0
		? emptyNFA()
		: last(tokens.reduce(buildNFAStep, []));

// ε-closure of stateID, keeping only states with no further ε-edges.
// `visited` guards against ε-cycles (e.g. "(a*)*", where zeroOrMore's
// loop-back edges form a cycle of ε-only nodes), which previously
// recursed without bound.  A revisited state contributes [] — its
// reachable terminals were already emitted on the first visit and
// propagate up through the shared call tree.
const allDirects = (stateID, nodes, visited = new Set()) => {
	if (visited.has(stateID)) {
		return [];
	}
	visited.add(stateID);
	if (nodes[stateID].direct.length === 0) {
		return [ stateID ];
	}
	// Numeric comparator: plain toSorted() ordered the numeric IDs
	// lexicographically ([1, 10, 2]).
	return [...new Set(
		nodes[stateID].direct.map(
			id => allDirects(id, nodes, visited),
		).flat(),
	)].toSorted((a, b) => a - b);
};

// One character of NFA simulation: follow `char` out of every current
// state, then take the ε-closure of each target.
const searchNFAStep = nodes => (directStates, char) =>
	[...new Set(
		directStates
			.map(id => nodes[id].transitions[char])
			.filter(x => !!x)
			.map(id => allDirects(id, nodes))
			.flat()
	)];

const searchNFAFn = (nfa, string) =>
	explode(string).reduce(
		searchNFAStep(nfa.nodes),
		allDirects(nfa.start, nfa.nodes),
	);

// Does the NFA accept `string`?
const searchNFA = (nfa, string) =>
	searchNFAFn(nfa, string).includes(nfa.end);

// Canonical DFA state name for a set of NFA state IDs, e.g. "1-4-7".
const nodeID = states =>
	[...new Set(states)]
		.toSorted((a, b) => a - b)
		.map(x => x.toString())
		.join("-");

const unNodeID = stateID =>
	stateID.split("-").map(Number);

// Merge the character transitions of a set of NFA states into
// { char: [closure state IDs…] }.  baseNFA nodes carry at most one
// transition key, hence the sanity check.
const collectTransitions = (nfaNodes, states) => {
	const transitions = states.map(
		id => Object.entries(nfaNodes[id].transitions),
	).filter(t => t.length !== 0);
	if (!transitions.every(t => t.length === 1)) {
		throw new Error("unexpected transitions with more than 1 key");
	}
	const groupedTransitions = Object.groupBy(
		transitions.map(t => t[0]),
		([key, stateID]) => key,
	);
	return mapValues(
		groupedTransitions,
		entry =>
			entry.map(([_, id]) =>
				allDirects(id, nfaNodes)).flat(),
	);
};

// Drop the state set just processed and enqueue the newly discovered
// ones, deduplicated by canonical name.
const computePending = (pending, transitions) =>
	[...new Set(
		pending.slice(1).concat(transitions)
			.map(nodeID))
	].map(unNodeID);

// Subset construction: recursively turn pending NFA-state sets into DFA
// nodes keyed by their canonical name.
const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => {
	if (pending.length === 0) {
		return dfaNodes;
	}

	const states = pending[0];
	const stateID = nodeID(states);
	if (!!dfaNodes[stateID]) {
		return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes);
	}

	const transitions = collectTransitions(nfaNodes, states);
	const newPending = computePending(pending, Object.values(transitions));
	return buildDFANodes(end, nfaNodes, newPending, {
		...dfaNodes,
		[stateID]: {
			end: states.includes(end),
			transitions: mapValues(transitions, nodeID),
		},
	});
};

// NFA -> DFA via subset construction, starting from the ε-closure of
// the NFA's start state.
const buildDFA = nfa => {
	const start = allDirects(nfa.start, nfa.nodes);
	return {
		start: nodeID(start),
		nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}),
	};
};

// One character of DFA simulation; a missing transition short-circuits
// the fold with a non-state sentinel.
const searchDFAStep = nodes => (state, char) =>
	nodes[state].transitions[char] || reduced(false);

const searchDFAFn = (dfa, string) =>
	reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start);

// Does the DFA accept `string`?  A reduced(false) result indexes to
// undefined, so ?.end collapses to false.
const searchDFA = (dfa, string) =>
	!!dfa.nodes[searchDFAFn(dfa, string)]?.end;

const compileNFA = regex =>
	buildNFA(toPostfix(tokenizeRegex(explode(regex))));

const compileDFA = regex =>
	buildDFA(compileNFA(regex));

// Compile `regex` down to a DFA once and return a matcher closure.
export const compile = regex => {
	const dfa = compileDFA(regex);
	return string => searchDFA(dfa, string);
};
