import { butlast, dissoc, explode, isNumeric, last, mapValues, max, reduce, reduced, } from "sjs"; export class SyntaxError extends Error {} export class ValueError extends Error {} export class InternalError extends Error {} const ConcatStep = { ACCEPTING: "accepting", ESCAPING: "escaping", RANGE: "range", CLASS: "class", }; const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]); const shouldConcat = (char, next) => next !== undefined && char !== "(" && char !== "|" && char !== "{" && !nonConcatOperators.has(next); const operatorChars = new Set([...nonConcatOperators, "(", "$"]); const isOperator = char => operatorChars.has(char); const numFromDigits = digits => digits.length === 0 ? -1 : Number(digits.join("")); const escapingStateStep = ({ out, state, context }, char, _index, next) => !(isOperator(char) || char === "\\") ? reduced({ out, state, context, error: new SyntaxError( "unknown escape sequence: \\" + char, ), }) : { out: out.concat( char, shouldConcat(null, next) ? [{ operator: "concat" }] : [], ), state: ConcatStep.ACCEPTING, context, }; const rangeStateStep = ({ out, state, context }, char, _index, _next) => { if (char === "}") { if (context.where !== "to") { return reduced({ out, state, context, error: new SyntaxError( "missing comma in range operator", ), }); } const from = numFromDigits(context.from); const to = numFromDigits(context.to); if (from > to && to != -1) { return reduced({ out, state, context, error: new ValueError( `bad range values: {${from},${to}}`, ), }); } return { out: out.concat({ operator: "range", from, to, }), state: ConcatStep.ACCEPTING, context: null, }; } if (char === ",") { if (context.where === "to") { return reduced({ out, state, context, error: new SyntaxError( "extraneous comma in range expression", ), }); } else { return { out, state, context: { ...context, where: "to", }, }; } } if (!isNumeric(char)) { return reduced({ out, state, context, error: new SyntaxError( "bad char in range expression: " + char, ), }); } return { out, state, context: { ...context, [context.where]: context[context.where].concat(char), }, }; }; const classStateStep = ({ out, state, context }, char, _index, _next) => { if (context.escaping) { return { out, state, context: dissoc({ ...context, set: context.set.concat(char), }, "escaping"), }; } if (char === "]") { if (context.range.where === "to") { return reduced({ out, state, context, error: new SyntaxError( "unfinished character class range", ), }); } if (context.set.length === 0) { return reduced({ out, state, context, error: new ValueError("empty character class"), }); } return { out: out.concat({ meta: "class", set: context.set, caret: !!context.caret, }), state: ConcatStep.ACCEPTING, context: null, }; } if (char === "\\") { return { out, state, context: { ...context, escaping: true, }, }; } if (context.range.where === "to") { const from = context.range.from; const to = char; if (from.charCodeAt(0) > to.charCodeAt(0)) { return reduced({ out, state, context, error: new ValueError( "bad class range values: " + `[${from}-${to}]`, ), }); } return { out, state, context: { ...context, set: context.set.concat({ from, to }), range: { from: null, where: "from", }, }, }; } if (char === "-" && context.set.length !== 0) { return { out, state, context: { ...context, set: butlast(context.set), range: { from: last(context.set), where: "to", }, }, }; } if (char === "^" && context.set.length === 0) { return { out, state, context: { ...context, caret: true, }, }; } return { out, state, context: { ...context, set: context.set.concat(char), }, }; }; const STATE_FNS = { [ConcatStep.ESCAPING]: escapingStateStep, [ConcatStep.RANGE ]: rangeStateStep, [ConcatStep.CLASS ]: classStateStep, }; const TRANSITION_FNS = { "\\": ({ out, _state, context }, _char, _index, _next) => ({ out, state: ConcatStep.ESCAPING, context, }), "{": ({ out, _state, _context }, _char, _index, _next) => ({ out, state: ConcatStep.RANGE, context: { from: [], to: [], where: "from", }, }), "[": ({ out, _state, _context }, _char, _index, _next) => ({ out, state: ConcatStep.CLASS, context: { set: [], range: { from: null, where: "from", }, }, }), }; const ANCHOR_FNS = { "^": ({ out, state, context }, _char, index, _next) => index !== 0 ? reduced({ out, state, context, error: new SyntaxError( "^ not at the start of the expression", ), }) : { out: out.concat([ { meta: "^" }, { operator: "concat" }, ]), state, context, }, "$": ({ out, state, context }, _char, _index, next) => next !== undefined ? reduced({ out, state, context, error: new SyntaxError( "$ not at the end of the expression", ), }) : { out: out.concat([{ meta: "$" }]), state, context, }, }; const anchors = new Set(Object.keys(ANCHOR_FNS)); const isAnchor = char => anchors.has(char); const transitionChars = new Set(Object.keys(TRANSITION_FNS)); const isTransition = char => transitionChars.has(char); const metaChars = new Set(["."]); const isMeta = char => metaChars.has(char); const opFor = char => isOperator(char) ? { operator: char } : isMeta(char) ? { meta: char } : char; const tokenizeRegexStep = chars => ({ out, state, context }, char, index) => { const next = chars[index + 1]; if (state !== ConcatStep.ACCEPTING) { return STATE_FNS[state]( { out, state, context }, char, index, next, ); } if (isTransition(char)) { return TRANSITION_FNS[char]( { out, state, context }, char, index, next, ); } if (isAnchor(char)) { return ANCHOR_FNS[char]( { out, state, context }, char, index, next, ); } return { out: out.concat( opFor(char), shouldConcat(char, next) ? [{ operator: "concat" }] : [], ), state, context, }; }; const tokenizeRegexFn = chars => reduce(chars, tokenizeRegexStep(chars), { out: [], state: ConcatStep.ACCEPTING, context: null, }); const tokenizeRegex = chars => { const tokens = tokenizeRegexFn(chars); if (!!tokens.error) { throw tokens.error; } if (tokens.state !== ConcatStep.ACCEPTING) { throw new SyntaxError("bad ending state: " + tokens.state); } return tokens.out; }; const PRECEDENCE = { "(": 4, "*": 3, "+": 3, "?": 3, "range": 3, "concat": 2, "|": 1, }; const shouldPush = (stack, token) => stack.length === 0 || token.operator === "(" || (stack[0].operator === "(" && token.operator !== ")"); const findLowerPrecedenceItem = (stack, token) => stack.findIndex(el => el.operator === "(" || PRECEDENCE[el.operator] < PRECEDENCE[token.operator], ); const toPostfixStep = ({ out, stack }, token, _index, tokens) => { if (!token.operator) { return { out: out.concat(token), stack, }; } if (shouldPush(stack, token)) { return { out, stack: [token].concat(stack), }; } if (token.operator === ")") { const openParenIndex = stack.findIndex(o => o.operator === "("); if (openParenIndex === -1) { throw new SyntaxError( `close parens without opening: ${tokens}`, ); } const ops = stack.slice(0, openParenIndex); const leftover = stack.slice(openParenIndex + 1); return { out: out.concat(ops), stack: leftover, }; } const idx = findLowerPrecedenceItem(stack, token); if (idx < 0) { return { out: out.concat(stack), stack: [token], }; } else { const ops = stack.slice(0, idx); const leftover = stack.slice(idx); return { out: out.concat(ops), stack: [token].concat(leftover), }; } }; const toPostfix = tokens => { const { out, stack } = tokens.reduce(toPostfixStep, { out: [], stack: [], }); return out.concat(stack); }; const emptyNFA = () => { const start = 1; const end = 2; return { start, end, nodes: { [start]: { direct: [end], transitions: {}, }, [end]: { direct: [], transitions: {}, }, } }; }; const literal = (edge, id) => { const start = id + 0; const end = id + 1; return { start, end, nodes: { [start]: { direct: [], transitions: { [edge]: end, }, }, [end]: { direct: [], transitions: {}, }, }, }; }; const concat = (lhs, rhs) => ({ start: lhs.start, end: rhs.end, nodes: { ...lhs.nodes, ...rhs.nodes, [lhs.end]: { ...lhs.nodes[lhs.end], direct: lhs.nodes[lhs.end].direct.concat(rhs.start), } }, }); const union = (lhs, rhs) => { const start = max(lhs.end, rhs.end) + 1; const end = start + 1; return { start, end, nodes: { ...lhs.nodes, ...rhs.nodes, [lhs.end]: { ...lhs.nodes[lhs.end], direct: lhs.nodes[lhs.end].direct.concat(end), }, [rhs.end]: { ...rhs.nodes[rhs.end], direct: lhs.nodes[lhs.end].direct.concat(end), }, [start]: { direct: [lhs.start, rhs.start], transitions: {}, }, [end]: { direct: [], transitions: {}, }, }, }; }; const zeroOrMore = nfa => { const start = nfa.end + 1; const end = nfa.end + 2; return { start, end, nodes: { ...nfa.nodes, [nfa.end]: { ...nfa.nodes[nfa.end], direct: nfa.nodes[nfa.end].direct.concat(start), }, [start]: { direct: [nfa.start, end], transitions: {}, }, [end]: { direct: [], transitions: {}, }, }, }; }; const oneOrMore = nfa => concat(nfa, zeroOrMore(nfa)); const zeroOrOne = nfa => { const start = nfa.end + 1; const end = nfa.end + 2; return { start, end, nodes: { ...nfa.nodes, [nfa.end]: { ...nfa.nodes[nfa.end], direct: nfa.nodes[nfa.end].direct.concat(end), }, [start]: { direct: [nfa.start, end], transitions: {}, }, [end]: { direct: [], transitions: {}, }, }, }; }; const compareRange = (lhs, rhs) => lhs[0] !== rhs[0] ? lhs[0] - rhs[0] : lhs[1] - rhs[1]; const compressRangeStep = (ranges, [from, to]) => { const curr = last(ranges); if (!curr) { return [[from, to]]; } return from <= curr[1] ? butlast(ranges).concat([[curr[0], max(to, curr[1])]]) : ranges.concat([[from, to]]); }; const compressCharacterRanges = ranges => ranges.toSorted(compareRange).reduce(compressRangeStep, []); const inRange = (ranges, char) => { const code = char.charCodeAt(0); const froms = Object.keys(ranges).map(Number).filter( from => from <= code, ); return froms.some(from => code <= ranges[from]); }; const characterClass = ({ set, caret }, id) => { const start = id + 0; const end = id + 1; const { string, object } = Object.groupBy(set, x => typeof x); const ranges = Object.fromEntries( compressCharacterRanges((object || []).map( ({ from, to }) => [ from.charCodeAt(0), to.charCodeAt(0) ], ), )); const matches = new Set((string || []).filter( char => !inRange(ranges, char), )); return { start, end, nodes: { [start]: { direct: [], transitions: {}, meta: { op: caret ? "excludes" : "includes", to: end, matches, ranges, }, }, [end]: { direct: [], transitions: {}, }, }, }; }; const wildcard = (_edge, id) => { const start = id + 0; const end = id + 1; return { start, end, nodes: { [start]: { direct: [], transitions: {}, meta: { op: true, to: end, }, }, [end]: { direct: [], transitions: {}, }, }, }; }; const caret = (_edge, id) => { const start = id + 0; const end = id + 1; return { start, end, nodes: { [start]: { direct: [], transitions: {}, meta: { op: "caret", to: end, }, }, [end]: { direct: [], transitions: {}, }, }, }; }; const dollar = (_edge, id) => { const start = id + 0; const end = id + 1; return { start, end, nodes: { [start]: { direct: [], transitions: {}, meta: { op: "dollar", to: end, }, }, [end]: { direct: [], transitions: {}, }, }, }; }; const OPERATORS_FNS = ({ zeroOrMoreFn = zeroOrMore, oneOrMoreFn = oneOrMore, zeroOrOneFn = zeroOrOne, unionFn = union, concatFn = concat, } = {}) => ({ "*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))), "+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))), "?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))), "|": stack => butlast(butlast(stack)).concat(unionFn( last(butlast(stack)), last(stack), )), "concat": stack => butlast(butlast(stack)).concat(concatFn( last(butlast(stack)), last(stack), )), }); const OPERATORS = OPERATORS_FNS(); const METACHARACTERS_FNS = { "class": characterClass, ".": wildcard, "^": caret, "$": dollar, }; const baseNFA = (token, id) => ( !token.meta ? literal : METACHARACTERS_FNS[token.meta] )(token, id); const buildNFAStep = (stack, token) => !token.operator ? stack.concat(baseNFA(token, (last(stack)?.end || 0) + 1)) : OPERATORS[token.operator](stack); const buildNFA = tokens => tokens.length === 0 ? emptyNFA() : last(tokens.reduce(buildNFAStep, [])); const allDirects = (stateID, nodes) => nodes[stateID].direct.length === 0 ? [ stateID ] : [...new Set( nodes[stateID].direct.map( id => allDirects(id, nodes), ).flat(), )].toSorted(); const VALID_OPS = new Set([ "caret", "dollar", true, "includes", "excludes", ]); const isValidOp = op => VALID_OPS.has(op); const interpretMetacharacter = ( { op, to, matches, ranges }, char, index, nodes, ) => { if (!op) { return []; } if (!isValidOp(op)) { throw new InternalError("Unsupported op: " + op); } if (op === "caret") { return index === 0 ? searchNFAStep(nodes)( allDirects(to, nodes), char, index, ) : []; } if (op === "dollar") { return []; } if (op === true) { return [to]; } if (matches.has(char)) { return op === "includes" ? [to] : []; } if (inRange(ranges, char)) { return op === "includes" ? [to] : []; } return op === "excludes" ? [to] : []; }; const performTransition = (nodes, char, index) => id => (nodes[id].transitions[char] && [nodes[id].transitions[char]]) || interpretMetacharacter(nodes[id].meta || {}, char, index, nodes); const searchNFAStep = nodes => (directStates, char, index) => [...new Set( directStates .map(performTransition(nodes, char, index)) .flat() .map(id => allDirects(id, nodes)) .flat() )].toSorted(); const anchorDollar = (nfa, ids) => nfa.nodes[last(ids)]?.meta?.op === "dollar" ? allDirects(nfa.nodes[last(ids)].meta.to, nfa.nodes) : ids; const searchNFAFn = (nfa, string) => anchorDollar(nfa, explode(string).reduce( searchNFAStep(nfa.nodes), allDirects(nfa.start, nfa.nodes), )); const searchNFA = (nfa, string) => searchNFAFn(nfa, string).includes(nfa.end); const nodeID = states => [...new Set(states)] .toSorted((a, b) => a - b) .map(x => x.toString()) .join("-"); const unNodeID = stateID => stateID.split("-").map(Number); const collectTransitions = (nfaNodes, states) => { const transitions = states.map( id => Object.entries(nfaNodes[id].transitions), ).filter(t => t.length !== 0); if (!transitions.every(t => t.length === 1)) { throw new InternalError( "unexpected transitions with more than 1 key", ); } const groupedTransitions = Object.groupBy( transitions.map(t => t[0]), ([key, stateID]) => key, ); return mapValues( groupedTransitions, entry => [ ...new Set(entry.map(([_, id]) => allDirects(id, nfaNodes)).flat()), ].toSorted(), ); }; const computePending = (pending, transitions) => [...new Set( pending.slice(1).concat(transitions) .map(nodeID)) ].toSorted().map(unNodeID); const toDFANodes = (end, nfaNodes, pending, dfaNodes) => { if (pending.length === 0) { return dfaNodes; } const states = pending[0]; const stateID = nodeID(states); if (!!dfaNodes[stateID]) { return toDFANodes(end, nfaNodes, pending.slice(1), dfaNodes); } const transitions = collectTransitions(nfaNodes, states); const newPending = computePending(pending, Object.values(transitions)); return toDFANodes(end, nfaNodes, newPending, { ...dfaNodes, [stateID]: { end: states.includes(end), transitions: mapValues(transitions, nodeID), }, }); }; const toDFA = nfa => { const start = allDirects(nfa.start, nfa.nodes); return { start: nodeID(start), nodes: toDFANodes(nfa.end, nfa.nodes, [start], {}), }; }; const searchDFAStep = nodes => (state, char) => nodes[state].transitions[char] || reduced(false); const searchDFAFn = (dfa, string) => reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start); const searchDFA = (dfa, string) => !!dfa.nodes[searchDFAFn(dfa, string)]?.end const parse = regex => toPostfix(tokenizeRegex(explode(regex))); const compileNFA = regex => buildNFA(parse(regex)); const compileDFA = regex => toDFA(compileNFA(regex)); export const compile = regex => { const nfa = compileDFA(regex); return string => searchDFA(nfa, string); };