diff options
author | EuAndreh <eu@euandre.org> | 2025-07-01 13:45:29 -0300 |
---|---|---|
committer | EuAndreh <eu@euandre.org> | 2025-07-07 06:09:16 -0300 |
commit | 6b626f2e0408e52f771e0895648b4b75060ec082 (patch) | |
tree | c2641eddadbaaf211208c50a4a9ee0172382f95b | |
parent | Initial empty commit (diff) | |
download | paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz |
Implement v0 version of NFA and DFA; WIP tests
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 105 | ||||
-rw-r--r-- | deps.mk | 0 | ||||
-rwxr-xr-x | mkdeps.sh | 4 | ||||
-rw-r--r-- | src/paca.mjs | 436 | ||||
-rw-r--r-- | tests/paca.mjs | 1351 |
6 files changed, 1898 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bf8600d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/node_modules/ +/src/*.exported.mjs diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..aa02a58 --- /dev/null +++ b/Makefile @@ -0,0 +1,105 @@ +.POSIX: +DATE = 1970-01-01 +VERSION = 0.1.0 +NAME = paca +NAME_UC = $(NAME) +## Installation prefix. Defaults to "/usr". +PREFIX = /usr +BINDIR = $(PREFIX)/bin +LIBDIR = $(PREFIX)/lib +JSLIBDIR = $(LIBDIR)/javascript +INCLUDEDIR = $(PREFIX)/include +SRCDIR = $(PREFIX)/src/$(NAME) +SHAREDIR = $(PREFIX)/share +LOCALEDIR = $(SHAREDIR)/locale +MANDIR = $(SHAREDIR)/man +EXEC = ./ +## Where to store the installation. Empty by default. +DESTDIR = +LDLIBS = +JSIMPL = node + + + +.SUFFIXES: +.SUFFIXES: .mjs .mjs-check + + + +all: +include deps.mk + +sources = \ + src/$(NAME).mjs \ + + +derived-assets = \ + src/$(NAME).exported.mjs \ + +side-assets = \ + + + +## Default target. Builds all artifacts required for testing +## and installation. +all: $(derived-assets) + + +src/$(NAME).exported.mjs: src/$(NAME).mjs Makefile + cp src/$(NAME).mjs $@ + printf '\n\nexport {\n' >> $@ + awk '/^const / { printf "\t%s,\n", $$2 }' src/$(NAME).mjs >> $@ + printf '}\n' >> $@ + + + +tests/$(NAME).mjs-check: src/$(NAME).exported.mjs + $(JSIMPL) $*.mjs + +check-unit: tests/$(NAME).mjs-check + + +integration-tests = \ + +.PRECIOUS: $(integration-tests) +$(integration-tests): ALWAYS + sh $@ + +check-integration: $(integration-tests) + + +## Run all tests. Each test suite is isolated, so that a parallel +## build can run tests at the same time. The required artifacts +## are created if missing. +check: check-unit check-integration + + + +## Remove *all* derived artifacts produced during the build. +## A dedicated test asserts that this is always true. +clean: + rm -rf $(derived-assets) $(side-assets) + + +## Installs into $(DESTDIR)$(PREFIX). Its dependency target +## ensures that all installable artifacts are crafted beforehand. +install: all + mkdir -p \ + '$(DESTDIR)$(BINDIR)' \ + '$(DESTDIR)$(JSLIBDIR)' \ + '$(DESTDIR)$(SRCDIR)' \ + + cp src/$(NAME).mjs '$(DESTDIR)$(JSLIBDIR)' + cp $(sources) '$(DESTDIR)$(SRCDIR)' + +## Uninstalls from $(DESTDIR)$(PREFIX). This is a perfect mirror +## of the "install" target, and removes *all* that was installed. +## A dedicated test asserts that this is always true. +uninstall: + rm -rf \ + '$(DESTDIR)$(JSLIBDIR)'/$(NAME).mjs \ + '$(DESTDIR)$(SRCDIR)' \ + + + +ALWAYS: diff --git a/mkdeps.sh b/mkdeps.sh new file mode 100755 index 0000000..e5606ff --- /dev/null +++ b/mkdeps.sh @@ -0,0 +1,4 @@ +#!/bin/sh +set -eu + +export LANG=POSIX.UTF-8 diff --git a/src/paca.mjs b/src/paca.mjs new file mode 100644 index 0000000..a0e607b --- /dev/null +++ b/src/paca.mjs @@ -0,0 +1,436 @@ +import { + butlast, explode, last, mapValues, max, reduce, reduced, repeat, +} from "sjs"; + + + +export class SyntaxError extends Error {} + +const ConcatStep = { + ACCEPTING: "accepting", + ESCAPING: "escaping", +}; + +const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]); + +const shouldConcat = (char, next) => + next !== undefined && + char !== "(" && + char !== "|" && + !nonConcatOperators.has(next); + +const isOperator = char => + nonConcatOperators.has(char) || char == "("; + +const tokenizeRegexStep = chars => ({ out, state }, char, index) => { + const next = chars[index + 1]; + const maybeConcat = shouldConcat(char, next) + ? [{operator: "concat"}] + : []; + + if (state === ConcatStep.ESCAPING) { + return { + out: out.concat(char, maybeConcat), + state: ConcatStep.ACCEPTING, + }; + } + + if (char === "\\") { + if (next === undefined) { + throw new SyntaxError( + `bad trailing escape character: ${chars}`, + ); + } + + return { + out, + state: ConcatStep.ESCAPING, + }; + } + + const op = isOperator(char) ? { operator: char } : char; + return { + out: out.concat(op, maybeConcat), + state, + }; +}; + +const tokenizeRegexFn = chars => + chars.reduce(tokenizeRegexStep(chars), { + out: [], + state: ConcatStep.ACCEPTING, + }); + +const tokenizeRegex = chars => + tokenizeRegexFn(chars).out; + +const PRECEDENCE = { + "(": 4, + "*": 3, + "+": 3, + "?": 3, + "concat": 2, + "|": 1, +}; + +const shouldPush = (stack, token) => + stack.length === 0 || + token.operator === "(" || + stack[0].operator === "("; + +const toPostfixStep = ({ out, stack }, token, _index, tokens) => { + if (typeof token === "string") { + return { + out: out.concat(token), + stack, + }; + } + + if (shouldPush(stack, token)) { + return { + out, + stack: [token].concat(stack), + }; + } + + if (token.operator === ")") { + const openParenIndex = stack.findIndex(o => o.operator === "("); + if (openParenIndex === -1) { + throw new SyntaxError( + `close parens without opening: ${tokens}`, + ); + } + + const ops = stack.slice(0, openParenIndex); + const leftover = stack.slice(openParenIndex + 1); + return { + out: out.concat(ops), + stack: leftover, + }; + } + + const idx = stack.findIndex(el => + el.operator === "(" || + PRECEDENCE[el.operator] < PRECEDENCE[token.operator], + ); + + if (idx < 0) { + return { + out: out.concat(stack), + stack: [token], + }; + } else { + const ops = stack.slice(0, idx); + const leftover = stack.slice(idx); + return { + out: out.concat(ops), + stack: [token].concat(leftover), + }; + } +}; + +const toPostfix = tokens => { + const { out, stack } = tokens.reduce(toPostfixStep, { + out: [], + stack: [], + }); + return out.concat(stack); +}; + +const emptyNFA = () => { + const startID = 1; + const endID = 2; + const nextID = 3; + return { + start: startID, + end: endID, + nextID, + nodes: { + [startID]: { + direct: [ endID ], + transitions: {}, + }, + [endID]: { + direct: [], + transitions: {}, + }, + } + }; +}; + +const baseNFA = (edge, id) => { + const startID = id + 0; + const endID = id + 1; + const nextID = id + 2; + return { + start: startID, + end: endID, + nextID, + nodes: { + [startID]: { + direct: [], + transitions: { + [edge]: endID, + }, + }, + [endID]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const concat = (lhs, rhs) => ({ + start: lhs.start, + end: rhs.end, + nextID: max(lhs.nextID, rhs.nextID), + nodes: { + ...lhs.nodes, + ...rhs.nodes, + [lhs.end]: { + ...lhs.nodes[lhs.end], + direct: lhs.nodes[lhs.end].direct.concat([ + rhs.start, + ]), + } + }, +}); + +const union = (lhs, rhs) => { + const startID = max(lhs.nextID, rhs.nextID); + const endID = startID + 1; + const nextID = startID + 2; + return { + start: startID, + end: endID, + nextID, + nodes: { + ...lhs.nodes, + ...rhs.nodes, + [lhs.end]: { + ...lhs.nodes[lhs.end], + direct: lhs.nodes[lhs.end].direct.concat([ + endID, + ]), + }, + [rhs.end]: { + ...rhs.nodes[rhs.end], + direct: lhs.nodes[lhs.end].direct.concat([ + endID, + ]), + }, + [startID]: { + direct: [lhs.start, rhs.start], + transitions: {}, + }, + [endID]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const zeroOrMore = nfa => { + const startID = nfa.nextID; + const endID = startID + 1; + const nextID = startID + 2; + return { + start: startID, + end: endID, + nextID, + nodes: { + ...nfa.nodes, + [nfa.end]: { + ...nfa.nodes[nfa.end], + direct: nfa.nodes[nfa.end].direct.concat([ + nfa.start, + endID, + ]), + }, + [startID]: { + direct: [ nfa.start, endID ], + transitions: {}, + }, + [endID]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const oneOrMore = nfa => + concat(nfa, zeroOrMore(nfa)); + +const zeroOrOne = nfa => { + const startID = nfa.nextID; + const endID = startID + 1; + const nextID = startID + 2; + return { + start: startID, + end: endID, + nextID, + nodes: { + ...nfa.nodes, + [nfa.end]: { + ...nfa.nodes[nfa.end], + direct: nfa.nodes[nfa.end].direct.concat([ + endID, + ]), + }, + [startID]: { + direct: [ nfa.start, endID ], + transitions: {}, + }, + [endID]: { + direct: [], + transitions: {}, + }, + }, + }; +}; + +const OPERATORS_FNS = ({ + zeroOrMoreFn = zeroOrMore, + oneOrMoreFn = oneOrMore, + zeroOrOneFn = zeroOrOne, + unionFn = union, + concatFn = concat, +} = {}) => ({ + "*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))), + "+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))), + "?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))), + "|": stack => butlast(butlast(stack)).concat(unionFn( + last(butlast(stack)), + last(stack), + )), + "concat": stack => butlast(butlast(stack)).concat(concatFn( + last(butlast(stack)), + last(stack), + )), +}); + +const OPERATORS = OPERATORS_FNS(); + +const buildNFAStep = (stack, token) => + typeof token === "string" + ? stack.concat(baseNFA(token, last(stack)?.nextID || 1)) + : OPERATORS[token.operator](stack); + +const buildNFA = tokens => + tokens.length === 0 + ? emptyNFA() + : last(tokens.reduce(buildNFAStep, [])); + +const allDirects = (stateID, nodes) => + nodes[stateID].direct.length === 0 + ? [ stateID ] + : [...new Set( + nodes[stateID].direct.map( + id => allDirects(id, nodes), + ).flat(), + )].toSorted(); + +const searchNFAStep = nodes => (directStates, char) => + [...new Set( + directStates + .map(id => nodes[id].transitions[char]) + .filter(x => !!x) + .map(id => allDirects(id, nodes)) + .flat() + )]; + +const searchNFAFn = (nfa, string) => + explode(string).reduce( + searchNFAStep(nfa.nodes), + allDirects(nfa.start, nfa.nodes), + ); + +const searchNFA = (nfa, string) => + searchNFAFn(nfa, string).includes(nfa.end); + +const nodeID = states => + [...new Set(states)] + .toSorted((a, b) => a - b) + .map(x => x.toString()) + .join("-"); + +const unNodeID = stateID => + stateID.split("-").map(Number); + +const collectTransitions = (nfaNodes, states) => { + const transitions = states.map( + id => Object.entries(nfaNodes[id].transitions), + ).filter(t => t.length !== 0); + if (!transitions.every(t => t.length === 1)) { + throw new Error("unexpected transitions with more than 1 key"); + } + const groupedTransitions = Object.groupBy( + transitions.map(t => t[0]), + ([key, stateID]) => key, + ); + return mapValues( + groupedTransitions, + entry => + entry.map(([_, id]) => + allDirects(id, nfaNodes)).flat(), + ); +}; + +const computePending = (pending, transitions) => + [...new Set( + pending.slice(1).concat(transitions) + .map(nodeID)) + ].map(unNodeID); + +const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => { + if (pending.length === 0) { + return dfaNodes; + } + + const states = pending[0]; + const stateID = nodeID(states); + if (!!dfaNodes[stateID]) { + return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes); + } + + const transitions = collectTransitions(nfaNodes, states); + const newPending = computePending(pending, Object.values(transitions)); + return buildDFANodes(end, nfaNodes, newPending, { + ...dfaNodes, + [stateID]: { + end: states.includes(end), + transitions: mapValues(transitions, nodeID), + }, + }); +}; + +const buildDFA = nfa => { + const start = allDirects(nfa.start, nfa.nodes); + return { + start: nodeID(start), + nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}), + }; +}; + +const searchDFAStep = nodes => (state, char) => + nodes[state].transitions[char] || reduced(false); + +const searchDFAFn = (dfa, string) => + reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start); + +const searchDFA = (dfa, string) => + !!dfa.nodes[searchDFAFn(dfa, string)]?.end + +const compileNFA = regex => + buildNFA(toPostfix(tokenizeRegex(explode(regex)))); + +const compileDFA = regex => + buildDFA(compileNFA(regex)); + +export const compile = regex => { + const nfa = compileDFA(regex); + return string => searchDFA(nfa, string); +}; diff --git a/tests/paca.mjs b/tests/paca.mjs new file mode 100644 index 0000000..6390204 --- /dev/null +++ b/tests/paca.mjs @@ -0,0 +1,1351 @@ +import { explode, runTests } from "sjs"; + +import { + SyntaxError, + ConcatStep, + shouldConcat, + isOperator, + tokenizeRegexStep, + tokenizeRegexFn, + tokenizeRegex, + shouldPush, + toPostfixStep, + toPostfix, + emptyNFA, + baseNFA, + concat, + union, + zeroOrMore, + oneOrMore, + zeroOrOne, + OPERATORS_FNS, + buildNFAStep, + buildNFA, + allDirects, + searchNFAStep, + searchNFAFn, + searchNFA, + nodeID, + unNodeID, + collectTransitions, + computePending, + buildDFANodes, + buildDFA, + searchDFAStep, + searchDFAFn, + searchDFA, + compileNFA, + compileDFA, + compile, +} from "../src/paca.exported.mjs"; + + + +const test_shouldConcat = t => { + t.start("shouldConcat()"); + + t.testing("do concat literal chars", () => { + t.assertEq(shouldConcat("a", "b"), true); + }); + + t.testing("but not when they're the last one", () => { + t.assertEq(shouldConcat("a", undefined), false); + t.assertEq(shouldConcat("b", undefined), false); + }); + + t.testing("do not concat when starting a parens group", () => { + t.assertEq(shouldConcat("(", "a"), false); + t.assertEq(shouldConcat("(", "b"), false); + }); + + t.testing("do not concat when starting a union", () => { + t.assertEq(shouldConcat("|", "a"), false); + t.assertEq(shouldConcat("|", "b"), false); + }); + + t.testing("do not concat when next is special", () => { + t.assertEq(shouldConcat("a", "*"), false); + t.assertEq(shouldConcat("a", "|"), false); + t.assertEq(shouldConcat("a", ")"), false); + }); +}; + +const test_isOperator = t => { + t.start("isOperator()"); + + t.testing("operators and open parens are true", () => { + t.assertEq(isOperator("*"), true); + t.assertEq(isOperator("|"), true); + t.assertEq(isOperator("+"), true); + t.assertEq(isOperator("?"), true); + t.assertEq(isOperator("("), true); + t.assertEq(isOperator(")"), true); + }); + + t.testing("false for everyday non-meta chars", () => { + t.assertEq(isOperator("a"), false); + t.assertEq(isOperator("_"), false); + }); +}; + +const test_tokenizeRegexStep = t => { + t.start("tokenizeRegexStep()"); + + t.testing("when escaping we get whatever the char is", () => { + const stepFn = tokenizeRegexStep("ab|(cd)*"); + const steps = [{ + char: "a", + index: 0, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["a", { operator: "concat" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "b", + index: 1, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["b"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "|", + index: 2, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["|"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "(", + index: 3, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["("], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "c", + index: 4, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["c", {operator: "concat"}], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "d", + index: 5, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["d"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: ")", + index: 6, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: [")"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "*", + index: 7, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["*"], + state: ConcatStep.ACCEPTING, + }, + }]; + for (const step of steps) { + t.assertEq( + stepFn(step.in, step.char, step.index), + step.out, + ); + } + }); + + t.testing("trailing escape errors", () => { + try { + t.assertEq(tokenizeRegexStep("\\")({ + out: [], + index: 0, + state: ConcatStep.ACCEPTING, + }, "\\"), null); + t.assertEq(true, false); + } catch (e) { + const message = "bad trailing escape character: "; + t.assertEq( + e.message.substring(0, message.length), + message, + ); + t.assertEq(e instanceof SyntaxError, true); + } + }); + + t.testing("escape makes it enter escaping mode", () => { + const stepFn = tokenizeRegexStep("\\a\\*"); + const steps = [{ + char: "\\", + index: 0, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [], + state: ConcatStep.ESCAPING, + }, + }, { + char: "a", + index: 1, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["a", { operator: "concat" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "\\", + index: 2, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [], + state: ConcatStep.ESCAPING, + }, + }, { + char: "*", + index: 3, + in: { + out: [], + state: ConcatStep.ESCAPING, + }, + out: { + out: ["*"], + state: ConcatStep.ACCEPTING, + }, + }]; + for (const step of steps) { + t.assertEq( + stepFn(step.in, step.char, step.index), + step.out, + ); + } + }); + + t.testing("operators get detected as such", () => { + const stepFn = tokenizeRegexStep("ab|(cd)*"); + const steps = [{ + char: "a", + index: 0, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: ["a", { operator: "concat" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "b", + index: 1, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: ["b"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "|", + index: 2, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [{ operator: "|" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "(", + index: 3, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [{ operator: "(" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "c", + index: 4, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: ["c", { operator: "concat" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "d", + index: 5, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: ["d"], + state: ConcatStep.ACCEPTING, + }, + }, { + char: ")", + index: 6, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [{ operator: ")" }], + state: ConcatStep.ACCEPTING, + }, + }, { + char: "*", + index: 7, + in: { + out: [], + state: ConcatStep.ACCEPTING, + }, + out: { + out: [{ operator: "*" }], + state: ConcatStep.ACCEPTING, + }, + }]; + for (const step of steps) { + t.assertEq( + stepFn(step.in, step.char, step.index), + step.out, + ); + } + }); +}; + +const test_tokenizeRegexFn = t => { + t.start("tokenizeRegexFn"); + + t.testing("regex table", () => { + const concat = { operator: "concat" }; + const star = { operator: "*" }; + const table = [{ + in: "", + expected: { + out: [], + state: ConcatStep.ACCEPTING, + }, + }, { + in: "a", + expected: { + out: ["a"], + state: ConcatStep.ACCEPTING, + }, + }, { + in: "a*", + expected: { + out: ["a", star], + state: ConcatStep.ACCEPTING, + }, + }, { + in: "a*b", + expected: { + out: ["a", star, concat, "b"], + state: ConcatStep.ACCEPTING, + }, + }]; + for (const test of table) { + t.assertEq( + tokenizeRegexFn(explode(test.in)), + test.expected, + ); + } + }); +}; + +const test_tokenizeRegex = t => { + t.start("tokenizeRegex()"); + + t.testing("regex table", () => { + const concat = { operator: "concat" }; + const star = { operator: "*" }; + const table = [{ + in: "", + expected: [] + }, { + in: "a", + expected: [ "a" ], + }, { + in: "ab", + expected: [ "a", concat, "b" ], + }, { + in: "abc", + expected: [ "a", concat, "b", concat, "c" ], + }, { + in: "a*", + expected: [ "a", star], + }, { + in: "ab*", + expected: [ "a", concat, "b", star], + }, { + in: "a*b*", + expected: [ "a", star, concat, "b", star], + }, { + in: "a|b", + expected: [ "a", { operator: "|" }, "b" ], + }, { + in: "a(bc(d))ef", + expected: [ + "a", + concat, + { operator: "(" }, + "b", + concat, + "c", + concat, + { operator: "(" }, + "d", + { operator: ")" }, + { operator: ")" }, + concat, + "e", + concat, + "f", + ], + }, { + in: "ab|(cd)*", + expected: [ + "a", + concat, + "b", + { operator: "|" }, + { operator: "(" }, + "c", + concat, + "d", + { operator: ")" }, + star, + ], + }, { + in: "(a|b)*c", + expected: [ + { operator: "(" }, + "a", + { operator: "|" }, + "b", + { operator: ")" }, + star, + concat, + "c", + ], + }]; + for (const test of table) { + t.assertEq( + tokenizeRegex(explode(test.in)), + test.expected, + ); + } + }); +}; + +const test_shouldPush = t => { + t.start("shouldPush()"); + + t.testing("always push on empty stack", () => { + t.assertEq(shouldPush([], "a"), true); + t.assertEq(shouldPush([], null), true); + }); + + t.testing(`push when operator or stack top is "("`, () => { + t.assertEq(shouldPush([{}], { operator: "(" }), true); + t.assertEq(shouldPush([{ operator: "("}], {}), true); + }); + + t.testing("don't push otherwise", () => { + t.assertEq(shouldPush([{}], {}), false); + }); +}; + +const test_toPostfixStep = t => { + t.start("toPostfixStep()"); + + t.testing("string literals go directly to out", () => { + t.assertEq( + toPostfixStep({ out: ["a"], stack: "stack" }, "t"), + { out: ["a", "t"], stack: "stack" }, + ); + }); + + t.testing("parens put things on the stack", () => { + t.assertEq( + toPostfixStep({ + out: "out", + stack: [{ operator: "(" }], + }, { operator: "X" }), + { out: "out", stack: [ + { operator: "X" }, + { operator: "(" }, + ]}, + ); + }); + + // FIXME: more tests +}; + +const test_toPostfix = t => { + t.start("toPostfix()"); + + t.testing("regex table", () => { + const concat = { operator: "concat" }; + const table = [{ + in: [], + expected: [] + }, { + in: [ "a" ], + expected: [ "a" ], + }, { + in: [ "a", concat, "b", concat, "c" ], + expected: [ "a", "b", concat, "c", concat ], + }, { + in: [ "a", { operator: "*" }], + expected: [ "a", { operator: "*" }], + }, { + in: [ "a", { operator: "|" }, "b" ], + expected: [ "a", "b", { operator: "|" } ], + }, { + in: [ + "a", + concat, + "b", + { operator: "|" }, + { operator: "(" }, + "c", + concat, + "d", + { operator: ")" }, + { operator: "*" }, + ], + expected: [ + "a", + "b", + concat, + "c", + "d", + concat, + { operator: "*" }, + { operator: "|" }, + ], + }, { + in: [ + { operator: "(" }, + "a", + { operator: "|" }, + "b", + { operator: ")" }, + { operator: "*" }, + concat, + "c", + ], + expected: [ + "a", + "b", + { operator: "|" }, + { operator: "*" }, + "c", + concat, + ], + }]; + for (const test of table) { + t.assertEq( + toPostfix(test.in), + test.expected, + ); + } + }); +}; + +const test_emptyNFA = t => { + t.start("emptyNFA()"); + + t.testing("we deterministically build out of the ID", () => { + const expected = { + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [ 2 ], + transitions: {}, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(emptyNFA(), expected); + }); +}; + +const test_baseNFA = t => { + t.start("baseNFA()"); + + t.testing("the given edge connects the nodes", () => { + const expected = { + start: 123, + end: 124, + nextID: 125, + nodes: { + 123: { + direct: [], + transitions: { X: 124 }, + }, + 124: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(baseNFA("X", 123), expected); + }); +}; + +const test_concat = t => { + t.start("concat()"); + + t.testing("on baseNFA() and on more complex NFAs", () => { + const a = baseNFA("a", 1); + const b = baseNFA("b", a.nextID); + const a_ = baseNFA("a", b.nextID); + const expected1 = { + start: 1, + end: 4, + nextID: 5, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 3 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(concat(a, b), expected1); + + const expected2 = { + start: 1, + end: 6, + nextID: 7, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 3 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [ 5 ], + transitions: {}, + }, + 5: { + direct: [], + transitions: { a: 6 }, + }, + 6: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(concat(concat(a, b), a_), expected2); + }); +}; + +const test_union = t => { + t.start("union()"); + + t.testing("on baseNFA() and on more complex NFAs", () => { + const a = baseNFA("a", 1); + const b = baseNFA("b", a.nextID); + const expected1 = { + start: 5, + end: 6, + nextID: 7, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 6 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [ 6 ], + transitions: {}, + }, + 5: { + direct: [ 1, 3 ], + transitions: {}, + }, + 6: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(union(a, b), expected1); + + const c = baseNFA("c", expected1.nextID); + const expected2 = { + start: 9, + end: 10, + nextID: 11, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 6 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [ 6 ], + transitions: {}, + }, + 5: { + direct: [ 1, 3 ], + transitions: {}, + }, + 6: { + direct: [ 10 ], + transitions: {}, + }, + 7: { + direct: [], + transitions: { c: 8 }, + }, + 8: { + direct: [ 10 ], + transitions: {}, + }, + 9: { + direct: [ 5, 7 ], + transitions: {}, + }, + 10: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(union(union(a, b), c), expected2); + }); +}; + +const test_zeroOrMore = t => { + t.start("zeroOrMore()"); + + t.testing("on baseNFA()", () => { + const a = baseNFA("a", 1); + const expected = { + start: 3, + end: 4, + nextID: 5, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 1, 4 ], + transitions: {}, + }, + 3: { + direct: [ 1, 4 ], + transitions: {}, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(zeroOrMore(a), expected); + }); +}; + +const test_oneOrMore = t => { + t.start("oneOrMore()"); + + t.testing("on baseNFA()", () => { + const a = baseNFA("a", 1); + const expected = { + start: 1, + end: 4, + nextID: 5, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 3 ], + transitions: {}, + }, + 3: { + direct: [ 1, 4 ], + transitions: {}, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(oneOrMore(a), expected); + }); +}; + +const test_zeroOrOne = t => { + t.start("zeroOrOne()"); + + t.testing("on baseNFA()", () => { + const a = baseNFA("a", 1); + const expected = { + start: 3, + end: 4, + nextID: 5, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 4 ], + transitions: {}, + }, + 3: { + direct: [ 1, 4 ], + transitions: {}, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(zeroOrOne(a), expected); + }); +}; + +const test_OPERATORS_FNS = t => { + t.start("OPERATORS_FNS"); + + const arr = []; + const fn = ret => (...args) => { + arr.push({ + args, + ret, + }); + return ret; + }; + const input = [1, 2, 3]; + + + t.testing("star", () => { + arr.splice(0); + t.assertEq( + OPERATORS_FNS({ zeroOrMoreFn: fn(111) })["*"](input), + [1, 2, 111], + ); + t.assertEq(arr, [{ args: [3], ret: 111 }]); + }); + + t.testing("plus", () => { + arr.splice(0); + t.assertEq( + OPERATORS_FNS({ oneOrMoreFn: fn(222) })["+"](input), + [1, 2, 222], + ); + t.assertEq(arr, [{ args: [3], ret: 222 }]); + }); + + t.testing("question mark", () => { + arr.splice(0); + t.assertEq( + OPERATORS_FNS({ zeroOrOneFn: fn(333) })["?"](input), + [1, 2, 333], + ); + t.assertEq(arr, [{ args: [3], ret: 333 }]); + }); + + t.testing("pipe", () => { + arr.splice(0); + t.assertEq( + OPERATORS_FNS({ unionFn: fn(444) })["|"](input), + [1, 444], + ); + t.assertEq(arr, [{ args: [2, 3], ret: 444 }]); + }); + + t.testing("concat", () => { + arr.splice(0); + t.assertEq( + OPERATORS_FNS({ concatFn: fn(555) })["concat"](input), + [1, 555], + ); + t.assertEq(arr, [{ args: [2, 3], ret: 555 }]); + }); +}; + +const test_buildNFAStep = t => { + t.start("buildNFAStep()"); + + t.testing("literal char without existing nextID", () => { + const expected = [{ + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [], + transitions: { t: 2 }, + }, + 2: { + direct: [], + transitions: {}, + }, + + }, + }]; + t.assertEq(buildNFAStep([], "t"), expected); + }); + + t.testing("literal char with existing nextID", () => { + const input = baseNFA("a", 1); + const expected = [{ + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }, { + start: 3, + end: 4, + nextID: 5, + nodes: { + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }]; + t.assertEq(buildNFAStep([input], "b"), expected); + t.assertEq( + buildNFAStep([input], "b"), + [ baseNFA("a", 1), baseNFA("b", 3) ] + ); + }); + + t.testing("operator", () => { + const input = [ baseNFA("a", 1), baseNFA("b", 3) ]; + const expected = [{ + start: 1, + end: 4, + nextID: 5, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 3 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [], + transitions: {}, + }, + }, + }]; + t.assertEq( + buildNFAStep(input, { operator: "concat" }), + expected, + ); + }); +}; + +const test_buildNFA = t => { + t.start("buildNFA()"); + + t.testing("empty regex pattern", () => { + const expected = { + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [ 2 ], + transitions: {}, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(buildNFA([]), expected); + }); + + t.testing("sample composed regex", () => { + const regex = "(a|b)*cde"; + const expected = { + start: 7, + end: 14, + nextID: 15, + nodes: { + 1: { + direct: [], + transitions: { a: 2 }, + }, + 2: { + direct: [ 6 ], + transitions: {}, + }, + 3: { + direct: [], + transitions: { b: 4 }, + }, + 4: { + direct: [ 6 ], + transitions: {}, + }, + 5: { + direct: [ 1, 3 ], + transitions: {}, + }, + 6: { + direct: [ 5, 8 ], + transitions: {}, + }, + 7: { + direct: [ 5, 8 ], + transitions: {}, + }, + 8: { + direct: [ 9 ], + transitions: {}, + }, + 9: { + direct: [], + transitions: { c: 10 }, + }, + 10: { + direct: [ 11 ], + transitions: {}, + }, + 11: { + direct: [], + transitions: { d: 12 }, + }, + 12: { + direct: [ 13 ], + transitions: {}, + }, + 13: { + direct: [], + transitions: { e: 14 }, + }, + 14: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq( + buildNFA(toPostfix(tokenizeRegex(explode(regex)))), + expected, + ); + }); +}; + +const test_allDirects = t => { + t.start("allDirects()"); + + t.testing("empty directs", () => { + }); + + t.testing("single level", () => { + }); + + t.testing("deep recursion", () => { + }); + + t.testing("plenty of duplicates", () => { + }); +}; + +const test_searchNFAStep = t => { + t.start("searchNFAStep()"); + + t.testing("FIXME", () => {}); +}; + +const test_searchNFAFn = t => { + t.start("searchNFAFn()"); + + t.testing("FIXME", () => {}); +}; + +const test_searchNFA = t => { + t.start("searchNFA()"); + + t.testing("on composed sample regex", () => { + const regex = "(a|b)*c"; + const nfa = buildNFA(toPostfix(tokenizeRegex(explode(regex)))); + t.assertEq(searchNFA(nfa, "c"), true); + t.assertEq(searchNFA(nfa, "cc"), false); + t.assertEq(searchNFA(nfa, "aaabbbc"), true); + }); +}; + +const test_nodeID = t => { + t.start("nodeID()"); + + t.testing("single number", () => { + t.assertEq(nodeID([1]), "1"); + }); + + t.testing("plenty of duplicates", () => { + t.assertEq(nodeID([1, 1, 1, 1, 1, 1]), "1"); + }); + + t.testing("unsorted input", () => { + t.assertEq( + nodeID([5, 5, 11, 11, 11, 3, 4, 5, 123]), + "3-4-5-11-123", + ); + }); +}; + +const test_unNodeID = t => { + t.start("unNodeID()"); + + t.testing("no hyphes", () => { + t.assertEq(unNodeID("123"), [123]); + }); + + t.testing("with hyphens", () => { + t.assertEq(unNodeID("1-2"), [1, 2]); + t.assertEq(unNodeID("11-3-0"), [11, 3, 0]); + }); +}; + +const test_collectTransitions = t => { + t.start("collectTransitions()"); + + t.testing("FIXME", () => {}); +}; + +const test_computePending = t => { + t.start("computePending()"); + + t.testing("FIXME", () => {}); +}; + +const test_buildDFANodes = t => { + t.start("buildDFANodes()"); + + t.testing("FIXME", () => {}); +}; + +const test_buildDFA = t => { + t.start("buildDFA()"); + + t.testing("FIXME", () => {}); +}; + +const test_searchDFAStep = t => { + t.start("searchDFAStep()"); + + t.testing("FIXME", () => {}); +}; + +const test_searchDFAFn = t => { + t.start("searchDFAFn()"); + + t.testing("FIXME", () => {}); +}; + +const test_searchDFA = t => { + t.start("searchDFA()"); + + t.testing("FIXME", () => {}); +}; + +const test_compileNFA = t => { + t.start("compileNFA()"); + + t.testing("empty regex", () => { + const expected = { + start: 1, + end: 2, + nextID: 3, + nodes: { + 1: { + direct: [ 2 ], + transitions: {}, + }, + 2: { + direct: [], + transitions: {}, + }, + }, + }; + t.assertEq(compileNFA(""), expected); + }); + + t.testing("composed sample regex", () => { + const nfa = compileNFA("(a|b)*c"); + t.assertEq(searchNFA(nfa, "ababac"), true); + t.assertEq(searchNFA(nfa, "babac"), true); + t.assertEq(searchNFA(nfa, "babaca"), false); + }); +}; + +const test_compileDFA = t => { + t.start("compileDFA()"); + + t.testing("FIXME", () => {}); +}; + +const test_compile = t => { + t.start("compile()"); + + t.testing("composed sample regex", () => { + const matcher = compile("(a|b)*c"); + t.assertEq(matcher("ababac"), true); + t.assertEq(matcher("ababaca"), false); + }); +}; + + + +runTests([ + test_shouldConcat, + test_isOperator, + test_tokenizeRegexStep, + test_tokenizeRegexFn, + test_tokenizeRegex, + test_shouldPush, + test_toPostfixStep, + test_toPostfix, + test_emptyNFA, + test_baseNFA, + test_concat, + test_union, + test_zeroOrMore, + test_oneOrMore, + test_zeroOrOne, + test_OPERATORS_FNS, + test_buildNFAStep, + test_buildNFA, + test_allDirects, + test_searchNFAStep, + test_searchNFAFn, + test_searchNFA, + test_nodeID, + test_unNodeID, + test_collectTransitions, + test_computePending, + test_buildDFANodes, + test_buildDFA, + test_searchDFAStep, + test_searchDFAFn, + test_searchDFA, + test_compileNFA, + test_compileDFA, + test_compile, +]); |