import { butlast, explode, last, mapValues, max, reduce, reduced, repeat, } from "sjs"; export class SyntaxError extends Error {} const ConcatStep = { ACCEPTING: "accepting", ESCAPING: "escaping", }; const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]); const shouldConcat = (char, next) => next !== undefined && char !== "(" && char !== "|" && !nonConcatOperators.has(next); const isOperator = char => nonConcatOperators.has(char) || char == "("; const tokenizeRegexStep = chars => ({ out, state }, char, index) => { const next = chars[index + 1]; const maybeConcat = shouldConcat(char, next) ? [{operator: "concat"}] : []; if (state === ConcatStep.ESCAPING) { return { out: out.concat(char, maybeConcat), state: ConcatStep.ACCEPTING, }; } if (char === "\\") { if (next === undefined) { throw new SyntaxError( `bad trailing escape character: ${chars}`, ); } return { out, state: ConcatStep.ESCAPING, }; } const op = isOperator(char) ? { operator: char } : char; return { out: out.concat(op, maybeConcat), state, }; }; const tokenizeRegexFn = chars => chars.reduce(tokenizeRegexStep(chars), { out: [], state: ConcatStep.ACCEPTING, }); const tokenizeRegex = chars => tokenizeRegexFn(chars).out; const PRECEDENCE = { "(": 4, "*": 3, "+": 3, "?": 3, "concat": 2, "|": 1, }; const shouldPush = (stack, token) => stack.length === 0 || token.operator === "(" || (stack[0].operator === "(" && token.operator !== ")"); const findLowerPrecedenceItem = (stack, token) => stack.findIndex(el => el.operator === "(" || PRECEDENCE[el.operator] < PRECEDENCE[token.operator], ); const toPostfixStep = ({ out, stack }, token, _index, tokens) => { if (typeof token === "string") { return { out: out.concat(token), stack, }; } if (shouldPush(stack, token)) { return { out, stack: [token].concat(stack), }; } if (token.operator === ")") { const openParenIndex = stack.findIndex(o => o.operator === "("); if (openParenIndex === -1) { throw new SyntaxError( `close parens without opening: ${tokens}`, ); } const ops = stack.slice(0, openParenIndex); const leftover = stack.slice(openParenIndex + 1); return { out: out.concat(ops), stack: leftover, }; } const idx = findLowerPrecedenceItem(stack, token); if (idx < 0) { return { out: out.concat(stack), stack: [token], }; } else { const ops = stack.slice(0, idx); const leftover = stack.slice(idx); return { out: out.concat(ops), stack: [token].concat(leftover), }; } }; const toPostfix = tokens => { const { out, stack } = tokens.reduce(toPostfixStep, { out: [], stack: [], }); return out.concat(stack); }; const emptyNFA = () => { const startID = 1; const endID = 2; const nextID = 3; return { start: startID, end: endID, nextID, nodes: { [startID]: { direct: [ endID ], transitions: {}, }, [endID]: { direct: [], transitions: {}, }, } }; }; const baseNFA = (edge, id) => { const startID = id + 0; const endID = id + 1; const nextID = id + 2; return { start: startID, end: endID, nextID, nodes: { [startID]: { direct: [], transitions: { [edge]: endID, }, }, [endID]: { direct: [], transitions: {}, }, }, }; }; const concat = (lhs, rhs) => ({ start: lhs.start, end: rhs.end, nextID: max(lhs.nextID, rhs.nextID), nodes: { ...lhs.nodes, ...rhs.nodes, [lhs.end]: { ...lhs.nodes[lhs.end], direct: lhs.nodes[lhs.end].direct.concat([ rhs.start, ]), } }, }); const union = (lhs, rhs) => { const startID = max(lhs.nextID, rhs.nextID); const endID = startID + 1; const nextID = startID + 2; return { start: startID, end: endID, nextID, nodes: { ...lhs.nodes, ...rhs.nodes, [lhs.end]: { ...lhs.nodes[lhs.end], direct: lhs.nodes[lhs.end].direct.concat([ endID, ]), }, [rhs.end]: { ...rhs.nodes[rhs.end], direct: lhs.nodes[lhs.end].direct.concat([ endID, ]), }, [startID]: { direct: [lhs.start, rhs.start], transitions: {}, }, [endID]: { direct: [], transitions: {}, }, }, }; }; const zeroOrMore = nfa => { const startID = nfa.nextID; const endID = startID + 1; const nextID = startID + 2; return { start: startID, end: endID, nextID, nodes: { ...nfa.nodes, [nfa.end]: { ...nfa.nodes[nfa.end], direct: nfa.nodes[nfa.end].direct.concat([ startID, ]), }, [startID]: { direct: [ nfa.start, endID ], transitions: {}, }, [endID]: { direct: [], transitions: {}, }, }, }; }; const oneOrMore = nfa => concat(nfa, zeroOrMore(nfa)); const zeroOrOne = nfa => { const startID = nfa.nextID; const endID = startID + 1; const nextID = startID + 2; return { start: startID, end: endID, nextID, nodes: { ...nfa.nodes, [nfa.end]: { ...nfa.nodes[nfa.end], direct: nfa.nodes[nfa.end].direct.concat([ endID, ]), }, [startID]: { direct: [ nfa.start, endID ], transitions: {}, }, [endID]: { direct: [], transitions: {}, }, }, }; }; const OPERATORS_FNS = ({ zeroOrMoreFn = zeroOrMore, oneOrMoreFn = oneOrMore, zeroOrOneFn = zeroOrOne, unionFn = union, concatFn = concat, } = {}) => ({ "*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))), "+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))), "?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))), "|": stack => butlast(butlast(stack)).concat(unionFn( last(butlast(stack)), last(stack), )), "concat": stack => butlast(butlast(stack)).concat(concatFn( last(butlast(stack)), last(stack), )), }); const OPERATORS = OPERATORS_FNS(); const buildNFAStep = (stack, token) => typeof token === "string" ? stack.concat(baseNFA(token, last(stack)?.nextID || 1)) : OPERATORS[token.operator](stack); const buildNFA = tokens => tokens.length === 0 ? emptyNFA() : last(tokens.reduce(buildNFAStep, [])); const allDirects = (stateID, nodes) => nodes[stateID].direct.length === 0 ? [ stateID ] : [...new Set( nodes[stateID].direct.map( id => allDirects(id, nodes), ).flat(), )].toSorted(); const searchNFAStep = nodes => (directStates, char) => [...new Set( directStates .map(id => nodes[id].transitions[char]) .filter(x => !!x) .map(id => allDirects(id, nodes)) .flat() )].toSorted(); const searchNFAFn = (nfa, string) => explode(string).reduce( searchNFAStep(nfa.nodes), allDirects(nfa.start, nfa.nodes), ); const searchNFA = (nfa, string) => searchNFAFn(nfa, string).includes(nfa.end); const nodeID = states => [...new Set(states)] .toSorted((a, b) => a - b) .map(x => x.toString()) .join("-"); const unNodeID = stateID => stateID.split("-").map(Number); const collectTransitions = (nfaNodes, states) => { const transitions = states.map( id => Object.entries(nfaNodes[id].transitions), ).filter(t => t.length !== 0); if (!transitions.every(t => t.length === 1)) { throw new Error("unexpected transitions with more than 1 key"); } const groupedTransitions = Object.groupBy( transitions.map(t => t[0]), ([key, stateID]) => key, ); return mapValues( groupedTransitions, entry => [ ...new Set(entry.map(([_, id]) => allDirects(id, nfaNodes)).flat()), ].toSorted(), ); }; const computePending = (pending, transitions) => [...new Set( pending.slice(1).concat(transitions) .map(nodeID)) ].toSorted().map(unNodeID); const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => { if (pending.length === 0) { return dfaNodes; } const states = pending[0]; const stateID = nodeID(states); if (!!dfaNodes[stateID]) { return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes); } const transitions = collectTransitions(nfaNodes, states); const newPending = computePending(pending, Object.values(transitions)); return buildDFANodes(end, nfaNodes, newPending, { ...dfaNodes, [stateID]: { end: states.includes(end), transitions: mapValues(transitions, nodeID), }, }); }; const buildDFA = nfa => { const start = allDirects(nfa.start, nfa.nodes); return { start: nodeID(start), nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}), }; }; const searchDFAStep = nodes => (state, char) => nodes[state].transitions[char] || reduced(false); const searchDFAFn = (dfa, string) => reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start); const searchDFA = (dfa, string) => !!dfa.nodes[searchDFAFn(dfa, string)]?.end const compileNFA = regex => buildNFA(toPostfix(tokenizeRegex(explode(regex)))); const compileDFA = regex => buildDFA(compileNFA(regex)); export const compile = regex => { const nfa = compileDFA(regex); return string => searchDFA(nfa, string); };