summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEuAndreh <eu@euandre.org>2025-07-01 13:45:29 -0300
committerEuAndreh <eu@euandre.org>2025-07-07 06:09:16 -0300
commit6b626f2e0408e52f771e0895648b4b75060ec082 (patch)
treec2641eddadbaaf211208c50a4a9ee0172382f95b
parentInitial empty commit (diff)
downloadpaca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.gz
paca-6b626f2e0408e52f771e0895648b4b75060ec082.tar.xz
Implement v0 version of NFA and DFA; WIP tests
-rw-r--r--.gitignore2
-rw-r--r--Makefile105
-rw-r--r--deps.mk0
-rwxr-xr-xmkdeps.sh4
-rw-r--r--src/paca.mjs436
-rw-r--r--tests/paca.mjs1351
6 files changed, 1898 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bf8600d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/node_modules/
+/src/*.exported.mjs
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..aa02a58
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,105 @@
+.POSIX:
+DATE = 1970-01-01
+VERSION = 0.1.0
+NAME = paca
+NAME_UC = $(NAME)
+## Installation prefix. Defaults to "/usr".
+PREFIX = /usr
+BINDIR = $(PREFIX)/bin
+LIBDIR = $(PREFIX)/lib
+JSLIBDIR = $(LIBDIR)/javascript
+INCLUDEDIR = $(PREFIX)/include
+SRCDIR = $(PREFIX)/src/$(NAME)
+SHAREDIR = $(PREFIX)/share
+LOCALEDIR = $(SHAREDIR)/locale
+MANDIR = $(SHAREDIR)/man
+EXEC = ./
+## Where to store the installation. Empty by default.
+DESTDIR =
+LDLIBS =
+JSIMPL = node
+
+
+
+.SUFFIXES:
+.SUFFIXES: .mjs .mjs-check
+
+
+
+all:
+include deps.mk
+
+sources = \
+ src/$(NAME).mjs \
+
+
+derived-assets = \
+ src/$(NAME).exported.mjs \
+
+side-assets = \
+
+
+
+## Default target. Builds all artifacts required for testing
+## and installation.
+all: $(derived-assets)
+
+
+src/$(NAME).exported.mjs: src/$(NAME).mjs Makefile
+ cp src/$(NAME).mjs $@
+ printf '\n\nexport {\n' >> $@
+ awk '/^const / { printf "\t%s,\n", $$2 }' src/$(NAME).mjs >> $@
+ printf '}\n' >> $@
+
+
+
+tests/$(NAME).mjs-check: src/$(NAME).exported.mjs
+ $(JSIMPL) $*.mjs
+
+check-unit: tests/$(NAME).mjs-check
+
+
+integration-tests = \
+
+.PRECIOUS: $(integration-tests)
+$(integration-tests): ALWAYS
+ sh $@
+
+check-integration: $(integration-tests)
+
+
+## Run all tests. Each test suite is isolated, so that a parallel
+## build can run tests at the same time. The required artifacts
+## are created if missing.
+check: check-unit check-integration
+
+
+
+## Remove *all* derived artifacts produced during the build.
+## A dedicated test asserts that this is always true.
+clean:
+ rm -rf $(derived-assets) $(side-assets)
+
+
+## Installs into $(DESTDIR)$(PREFIX). Its dependency target
+## ensures that all installable artifacts are crafted beforehand.
+install: all
+ mkdir -p \
+ '$(DESTDIR)$(BINDIR)' \
+ '$(DESTDIR)$(JSLIBDIR)' \
+ '$(DESTDIR)$(SRCDIR)' \
+
+ cp src/$(NAME).mjs '$(DESTDIR)$(JSLIBDIR)'
+ cp $(sources) '$(DESTDIR)$(SRCDIR)'
+
+## Uninstalls from $(DESTDIR)$(PREFIX). This is a perfect mirror
+## of the "install" target, and removes *all* that was installed.
+## A dedicated test asserts that this is always true.
+uninstall:
+ rm -rf \
+ '$(DESTDIR)$(JSLIBDIR)'/$(NAME).mjs \
+ '$(DESTDIR)$(SRCDIR)' \
+
+
+
+ALWAYS:
diff --git a/deps.mk b/deps.mk
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/deps.mk
diff --git a/mkdeps.sh b/mkdeps.sh
new file mode 100755
index 0000000..e5606ff
--- /dev/null
+++ b/mkdeps.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+set -eu
+
+export LANG=POSIX.UTF-8
diff --git a/src/paca.mjs b/src/paca.mjs
new file mode 100644
index 0000000..a0e607b
--- /dev/null
+++ b/src/paca.mjs
@@ -0,0 +1,436 @@
+import {
+ butlast, explode, last, mapValues, max, reduce, reduced, repeat,
+} from "sjs";
+
+
+
export class SyntaxError extends Error {}

// Tokenizer states: either accepting characters normally, or the
// previous character was a backslash and the next one is literal.
const ConcatStep = {
	ACCEPTING: "accepting",
	ESCAPING: "escaping",
};

// Operators that must NOT have an implicit concatenation inserted
// before them ("(" is special: it forbids concatenation *after* it).
const nonConcatOperators = new Set(["*", "+", "?", "|", ")"]);

// True when an implicit "concat" operator belongs between char and
// next: a next char must exist, char must not open a group or a
// union, and next must not be a postfix/closing operator.
const shouldConcat = (char, next) =>
	next !== undefined &&
	char !== "(" &&
	char !== "|" &&
	!nonConcatOperators.has(next);

// A character is an operator token when it is a metacharacter,
// including the opening parenthesis.  Fixed: strict equality (===)
// instead of loose ==.
const isOperator = char =>
	nonConcatOperators.has(char) || char === "(";
+
// Single reduce step of the tokenizer.  Walks one character of the
// exploded regex, threading an { out, state } accumulator: `out` is
// the token list built so far, `state` tracks backslash escaping.
// Implicit "concat" operators are inserted between adjacent tokens
// (see shouldConcat()).
const tokenizeRegexStep = chars => ({ out, state }, char, index) => {
	const next = chars[index + 1];
	// Token(s) to append after the current one when an implicit
	// concatenation is required at this position.
	const maybeConcat = shouldConcat(char, next)
		? [{operator: "concat"}]
		: [];

	if (state === ConcatStep.ESCAPING) {
		// Previous char was "\": emit this char literally,
		// even if it is a metacharacter.
		return {
			out: out.concat(char, maybeConcat),
			state: ConcatStep.ACCEPTING,
		};
	}

	if (char === "\\") {
		// A trailing backslash has nothing to escape.
		if (next === undefined) {
			throw new SyntaxError(
				`bad trailing escape character: ${chars}`,
			);
		}

		// Emit nothing now; the next char is taken literally.
		return {
			out,
			state: ConcatStep.ESCAPING,
		};
	}

	// Metacharacters become { operator } tokens; anything else is
	// kept as a plain character token.
	const op = isOperator(char) ? { operator: char } : char;
	return {
		out: out.concat(op, maybeConcat),
		state,
	};
};

// Runs the tokenizer over an exploded regex, returning the final
// { out, state } accumulator (exposed for testing).
const tokenizeRegexFn = chars =>
	chars.reduce(tokenizeRegexStep(chars), {
		out: [],
		state: ConcatStep.ACCEPTING,
	});

// Tokenizes an exploded regex into plain characters and
// { operator } tokens, with implicit "concat" operators inserted.
const tokenizeRegex = chars =>
	tokenizeRegexFn(chars).out;
+
// Operator binding strength for the shunting-yard conversion; higher
// binds tighter.  "(" is listed so precedence lookups never yield
// undefined, but it is special-cased by shouldPush()/toPostfixStep().
const PRECEDENCE = {
	"(": 4,
	"*": 3,
	"+": 3,
	"?": 3,
	"concat": 2,
	"|": 1,
};

// Whether `token` can go straight onto the operator stack without
// popping anything first: the stack is empty, the token opens a
// group, or a "(" sits on top (index 0 is the top of the stack).
const shouldPush = (stack, token) => {
	if (stack.length === 0) {
		return true;
	}
	return token.operator === "(" || stack[0].operator === "(";
};
+
// Single reduce step of the shunting-yard conversion from infix
// tokens to postfix.  `out` is the output queue, `stack` is the
// operator stack with index 0 as its top.
const toPostfixStep = ({ out, stack }, token, _index, tokens) => {
	// Literal characters go straight to the output queue.
	if (typeof token === "string") {
		return {
			out: out.concat(token),
			stack,
		};
	}

	// Empty stack, "(" token, or "(" on top: just push.
	if (shouldPush(stack, token)) {
		return {
			out,
			stack: [token].concat(stack),
		};
	}

	if (token.operator === ")") {
		// Pop every operator down to the matching "(", which
		// is itself discarded along with the ")".
		const openParenIndex = stack.findIndex(o => o.operator === "(");
		if (openParenIndex === -1) {
			throw new SyntaxError(
				`close parens without opening: ${tokens}`,
			);
		}

		const ops = stack.slice(0, openParenIndex);
		const leftover = stack.slice(openParenIndex + 1);
		return {
			out: out.concat(ops),
			stack: leftover,
		};
	}

	// Find the first stack entry that no longer outranks the
	// token (or a "(" boundary); everything above it is popped to
	// the output before the token is pushed.
	const idx = stack.findIndex(el =>
		el.operator === "(" ||
		PRECEDENCE[el.operator] < PRECEDENCE[token.operator],
	);

	if (idx < 0) {
		// The whole stack outranks the token: flush it all.
		return {
			out: out.concat(stack),
			stack: [token],
		};
	} else {
		const ops = stack.slice(0, idx);
		const leftover = stack.slice(idx);
		return {
			out: out.concat(ops),
			stack: [token].concat(leftover),
		};
	}
};

// Converts the infix token list produced by tokenizeRegex() into
// postfix (reverse Polish) order; operators still on the stack when
// the input ends are flushed to the output.
const toPostfix = tokens => {
	const { out, stack } = tokens.reduce(toPostfixStep, {
		out: [],
		stack: [],
	});
	return out.concat(stack);
};
+
// NFA accepting only the empty string: the start node ε-jumps
// ("direct" edge) straight to the accepting node.  Node IDs are
// fixed at 1 and 2, with 3 as the next free ID.
const emptyNFA = () => ({
	start: 1,
	end: 2,
	nextID: 3,
	nodes: {
		1: {
			direct: [ 2 ],
			transitions: {},
		},
		2: {
			direct: [],
			transitions: {},
		},
	}
});

// Two-node NFA that consumes exactly one `edge` character, numbered
// from `id`: start is `id`, the accepting node is `id + 1`, and
// `id + 2` is reported as the next free ID.
const baseNFA = (edge, id) => ({
	start: id,
	end: id + 1,
	nextID: id + 2,
	nodes: {
		[id]: {
			direct: [],
			transitions: {
				[edge]: id + 1,
			},
		},
		[id + 1]: {
			direct: [],
			transitions: {},
		},
	},
});
+
// Joins two NFAs in sequence: lhs's accepting node gains an ε-edge
// into rhs's start node.  The operands must have disjoint node IDs
// (guaranteed by the ID threading in buildNFAStep()).  Uses the
// standard Math.max instead of the sjs helper — identical for two
// numbers, one less external dependency here.
const concat = (lhs, rhs) => ({
	start: lhs.start,
	end: rhs.end,
	nextID: Math.max(lhs.nextID, rhs.nextID),
	nodes: {
		...lhs.nodes,
		...rhs.nodes,
		[lhs.end]: {
			...lhs.nodes[lhs.end],
			direct: lhs.nodes[lhs.end].direct.concat([
				rhs.start,
			]),
		}
	},
});
+
// Alternation: a fresh start node ε-branches into both operands, and
// both operands' accepting nodes ε-jump to a fresh end node.  The
// operands must have disjoint node IDs.
//
// Bug fix: the [rhs.end] entry previously read
// lhs.nodes[lhs.end].direct (copy-paste slip) — latent so far only
// because operand end nodes always have empty `direct` lists.
const union = (lhs, rhs) => {
	const startID = Math.max(lhs.nextID, rhs.nextID);
	const endID = startID + 1;
	const nextID = startID + 2;
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...lhs.nodes,
			...rhs.nodes,
			[lhs.end]: {
				...lhs.nodes[lhs.end],
				direct: lhs.nodes[lhs.end].direct.concat([
					endID,
				]),
			},
			[rhs.end]: {
				...rhs.nodes[rhs.end],
				direct: rhs.nodes[rhs.end].direct.concat([
					endID,
				]),
			},
			[startID]: {
				direct: [lhs.start, rhs.start],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// Kleene star ("*"): wraps nfa with a fresh start/end pair.  The new
// start ε-jumps either into nfa (one or more runs) or straight to
// the new end (zero runs); nfa's old accepting node either loops
// back to nfa's start or exits to the new end.
const zeroOrMore = nfa => {
	const [startID, endID, nextID] =
		[nfa.nextID, nfa.nextID + 1, nfa.nextID + 2];
	const oldEnd = nfa.nodes[nfa.end];
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...oldEnd,
				direct: [...oldEnd.direct, nfa.start, endID],
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// One-or-more ("+"): a copy of nfa followed by its Kleene star,
// mirroring the identity a+ == aa*.  Note nfa's node IDs appear in
// both operands; concat() then rewrites the shared end node's
// ε-edges, producing the looped shape pinned by the test suite.
const oneOrMore = nfa =>
	concat(nfa, zeroOrMore(nfa));
+
// Zero-or-one ("?"): like zeroOrMore() but without the loop edge
// back to nfa's start, so the wrapped NFA runs at most once.
const zeroOrOne = nfa => {
	const [startID, endID, nextID] =
		[nfa.nextID, nfa.nextID + 1, nfa.nextID + 2];
	const oldEnd = nfa.nodes[nfa.end];
	return {
		start: startID,
		end: endID,
		nextID,
		nodes: {
			...nfa.nodes,
			[nfa.end]: {
				...oldEnd,
				direct: [...oldEnd.direct, endID],
			},
			[startID]: {
				direct: [ nfa.start, endID ],
				transitions: {},
			},
			[endID]: {
				direct: [],
				transitions: {},
			},
		},
	};
};
+
// Builds the operator dispatch table for the postfix evaluator.
// Each entry pops its operand NFA(s) off the top of `stack` and
// pushes the combined NFA back.  The combinators are injectable for
// testing; defaults are the real implementations above.
const OPERATORS_FNS = ({
	zeroOrMoreFn = zeroOrMore,
	oneOrMoreFn = oneOrMore,
	zeroOrOneFn = zeroOrOne,
	unionFn = union,
	concatFn = concat,
} = {}) => ({
	"*": stack => butlast(stack).concat(zeroOrMoreFn(last(stack))),
	"+": stack => butlast(stack).concat( oneOrMoreFn(last(stack))),
	"?": stack => butlast(stack).concat(zeroOrOneFn (last(stack))),
	"|": stack => butlast(butlast(stack)).concat(unionFn(
		last(butlast(stack)),
		last(stack),
	)),
	"concat": stack => butlast(butlast(stack)).concat(concatFn(
		last(butlast(stack)),
		last(stack),
	)),
});

// The default dispatch table used by buildNFAStep().
const OPERATORS = OPERATORS_FNS();

// Postfix evaluation step: literal characters push a fresh two-node
// NFA (numbered after the stack top's nextID, or starting at 1),
// operators combine the NFAs already on the stack.
const buildNFAStep = (stack, token) =>
	typeof token === "string"
		? stack.concat(baseNFA(token, last(stack)?.nextID || 1))
		: OPERATORS[token.operator](stack);

// Evaluates a postfix token list into a single NFA; an empty token
// list yields the empty-string-accepting NFA.
const buildNFA = tokens =>
	tokens.length === 0
		? emptyNFA()
		: last(tokens.reduce(buildNFAStep, []));
+
+const allDirects = (stateID, nodes) =>
+ nodes[stateID].direct.length === 0
+ ? [ stateID ]
+ : [...new Set(
+ nodes[stateID].direct.map(
+ id => allDirects(id, nodes),
+ ).flat(),
+ )].toSorted();
+
// One-character step of the NFA simulation: from the current set of
// active states, follow each state's transition labelled `char` (if
// any), then expand the targets into their ε-closures.
// NOTE(review): the `!!x` filter relies on node IDs being truthy;
// IDs start at 1 here, so 0 never occurs.
const searchNFAStep = nodes => (directStates, char) =>
	[...new Set(
		directStates
			.map(id => nodes[id].transitions[char])
			.filter(x => !!x)
			.map(id => allDirects(id, nodes))
			.flat()
	)];

// Simulates the NFA over the whole string, returning the final set
// of active states; the initial set is the start state's ε-closure.
const searchNFAFn = (nfa, string) =>
	explode(string).reduce(
		searchNFAStep(nfa.nodes),
		allDirects(nfa.start, nfa.nodes),
	);

// True when the NFA accepts the string, i.e. the accepting state is
// active after consuming the whole input.
const searchNFA = (nfa, string) =>
	searchNFAFn(nfa, string).includes(nfa.end);
+
// Canonical DFA-state name for a set of NFA state IDs: deduplicated,
// numerically sorted, hyphen-joined (e.g. [5, 3, 5] -> "3-5").
const nodeID = states => {
	const unique = [...new Set(states)];
	unique.sort((a, b) => a - b);
	return unique.map(String).join("-");
};

// Inverse of nodeID(): "3-5" -> [3, 5].
const unNodeID = stateID =>
	stateID.split("-").map(Number);
+
// Gathers the outgoing transitions of a DFA state (a set of NFA
// state IDs): groups them by input character and maps each character
// to the ε-closures of every reachable NFA state.  Relies on the
// invariant that each NFA node carries at most one transition entry
// (baseNFA() creates one; the combinators never add more).
const collectTransitions = (nfaNodes, states) => {
	const transitions = states.map(
		id => Object.entries(nfaNodes[id].transitions),
	).filter(t => t.length !== 0);
	if (!transitions.every(t => t.length === 1)) {
		throw new Error("unexpected transitions with more than 1 key");
	}
	// Each t is a single-entry list; t[0] is a [char, stateID]
	// pair.  Group the pairs by their character.
	const groupedTransitions = Object.groupBy(
		transitions.map(t => t[0]),
		([key, stateID]) => key,
	);
	return mapValues(
		groupedTransitions,
		entry =>
			entry.map(([_, id]) =>
				allDirects(id, nfaNodes)).flat(),
	);
};

// Next work list for buildDFANodes(): drops the state set just
// processed, appends the newly discovered ones, and deduplicates via
// the canonical nodeID() string form before converting back.
const computePending = (pending, transitions) =>
	[...new Set(
		pending.slice(1).concat(transitions)
			.map(nodeID))
	].map(unNodeID);
+
// Subset construction, written recursively: `end` is the NFA's
// accepting node ID, `nfaNodes` its node table, `pending` a work
// list of DFA states still to process (each a set of NFA state IDs)
// and `dfaNodes` the DFA node table accumulated so far.
const buildDFANodes = (end, nfaNodes, pending, dfaNodes) => {
	// Work list exhausted: the DFA is complete.
	if (pending.length === 0) {
		return dfaNodes;
	}

	const states = pending[0];
	const stateID = nodeID(states);
	// Already built this DFA state: move on to the rest.
	if (!!dfaNodes[stateID]) {
		return buildDFANodes(end, nfaNodes, pending.slice(1), dfaNodes);
	}

	const transitions = collectTransitions(nfaNodes, states);
	const newPending = computePending(pending, Object.values(transitions));
	return buildDFANodes(end, nfaNodes, newPending, {
		...dfaNodes,
		[stateID]: {
			// Accepting iff the NFA accepting state belongs
			// to this subset.
			end: states.includes(end),
			transitions: mapValues(transitions, nodeID),
		},
	});
};

// Converts an NFA into a DFA via subset construction, starting from
// the ε-closure of the NFA's start state.
const buildDFA = nfa => {
	const start = allDirects(nfa.start, nfa.nodes);
	return {
		start: nodeID(start),
		nodes: buildDFANodes(nfa.end, nfa.nodes, [start], {}),
	};
};
+
// One-character step of the DFA simulation: follow the transition
// for `char`, or short-circuit the surrounding reduce with `false`
// when the state has none (input rejected).  The `||` fallback is
// safe because nodeID() strings are always non-empty and truthy.
const searchDFAStep = nodes => (state, char) =>
	nodes[state].transitions[char] || reduced(false);

// Runs the DFA over the string, returning the final state ID, or
// false when the input was rejected mid-way.
const searchDFAFn = (dfa, string) =>
	reduce(explode(string), searchDFAStep(dfa.nodes), dfa.start);

// True when the DFA accepts the string: the run must finish on an
// accepting (end) state.  Fixed: terminate the statement explicitly
// instead of relying on automatic semicolon insertion.
const searchDFA = (dfa, string) =>
	!!dfa.nodes[searchDFAFn(dfa, string)]?.end;
+
// Compiles a regex string into an NFA.
const compileNFA = regex =>
	buildNFA(toPostfix(tokenizeRegex(explode(regex))));

// Compiles a regex string into a DFA.
const compileDFA = regex =>
	buildDFA(compileNFA(regex));

// Public entry point: compiles the regex once into a DFA and returns
// a matcher over full strings.  Fixed: the local was misleadingly
// named `nfa` although compileDFA() returns a DFA.
export const compile = regex => {
	const dfa = compileDFA(regex);
	return string => searchDFA(dfa, string);
};
diff --git a/tests/paca.mjs b/tests/paca.mjs
new file mode 100644
index 0000000..6390204
--- /dev/null
+++ b/tests/paca.mjs
@@ -0,0 +1,1351 @@
+import { explode, runTests } from "sjs";
+
+import {
+ SyntaxError,
+ ConcatStep,
+ shouldConcat,
+ isOperator,
+ tokenizeRegexStep,
+ tokenizeRegexFn,
+ tokenizeRegex,
+ shouldPush,
+ toPostfixStep,
+ toPostfix,
+ emptyNFA,
+ baseNFA,
+ concat,
+ union,
+ zeroOrMore,
+ oneOrMore,
+ zeroOrOne,
+ OPERATORS_FNS,
+ buildNFAStep,
+ buildNFA,
+ allDirects,
+ searchNFAStep,
+ searchNFAFn,
+ searchNFA,
+ nodeID,
+ unNodeID,
+ collectTransitions,
+ computePending,
+ buildDFANodes,
+ buildDFA,
+ searchDFAStep,
+ searchDFAFn,
+ searchDFA,
+ compileNFA,
+ compileDFA,
+ compile,
+} from "../src/paca.exported.mjs";
+
+
+
+const test_shouldConcat = t => {
+ t.start("shouldConcat()");
+
+ t.testing("do concat literal chars", () => {
+ t.assertEq(shouldConcat("a", "b"), true);
+ });
+
+ t.testing("but not when they're the last one", () => {
+ t.assertEq(shouldConcat("a", undefined), false);
+ t.assertEq(shouldConcat("b", undefined), false);
+ });
+
+ t.testing("do not concat when starting a parens group", () => {
+ t.assertEq(shouldConcat("(", "a"), false);
+ t.assertEq(shouldConcat("(", "b"), false);
+ });
+
+ t.testing("do not concat when starting a union", () => {
+ t.assertEq(shouldConcat("|", "a"), false);
+ t.assertEq(shouldConcat("|", "b"), false);
+ });
+
+ t.testing("do not concat when next is special", () => {
+ t.assertEq(shouldConcat("a", "*"), false);
+ t.assertEq(shouldConcat("a", "|"), false);
+ t.assertEq(shouldConcat("a", ")"), false);
+ });
+};
+
+const test_isOperator = t => {
+ t.start("isOperator()");
+
+ t.testing("operators and open parens are true", () => {
+ t.assertEq(isOperator("*"), true);
+ t.assertEq(isOperator("|"), true);
+ t.assertEq(isOperator("+"), true);
+ t.assertEq(isOperator("?"), true);
+ t.assertEq(isOperator("("), true);
+ t.assertEq(isOperator(")"), true);
+ });
+
+ t.testing("false for everyday non-meta chars", () => {
+ t.assertEq(isOperator("a"), false);
+ t.assertEq(isOperator("_"), false);
+ });
+};
+
+const test_tokenizeRegexStep = t => {
+ t.start("tokenizeRegexStep()");
+
+ t.testing("when escaping we get whatever the char is", () => {
+ const stepFn = tokenizeRegexStep("ab|(cd)*");
+ const steps = [{
+ char: "a",
+ index: 0,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["a", { operator: "concat" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "b",
+ index: 1,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["b"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "|",
+ index: 2,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["|"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "(",
+ index: 3,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["("],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "c",
+ index: 4,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["c", {operator: "concat"}],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "d",
+ index: 5,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["d"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: ")",
+ index: 6,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: [")"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "*",
+ index: 7,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["*"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }];
+ for (const step of steps) {
+ t.assertEq(
+ stepFn(step.in, step.char, step.index),
+ step.out,
+ );
+ }
+ });
+
+ t.testing("trailing escape errors", () => {
+ try {
+ t.assertEq(tokenizeRegexStep("\\")({
+ out: [],
+ index: 0,
+ state: ConcatStep.ACCEPTING,
+ }, "\\"), null);
+ t.assertEq(true, false);
+ } catch (e) {
+ const message = "bad trailing escape character: ";
+ t.assertEq(
+ e.message.substring(0, message.length),
+ message,
+ );
+ t.assertEq(e instanceof SyntaxError, true);
+ }
+ });
+
+ t.testing("escape makes it enter escaping mode", () => {
+ const stepFn = tokenizeRegexStep("\\a\\*");
+ const steps = [{
+ char: "\\",
+ index: 0,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ }, {
+ char: "a",
+ index: 1,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["a", { operator: "concat" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "\\",
+ index: 2,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ }, {
+ char: "*",
+ index: 3,
+ in: {
+ out: [],
+ state: ConcatStep.ESCAPING,
+ },
+ out: {
+ out: ["*"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }];
+ for (const step of steps) {
+ t.assertEq(
+ stepFn(step.in, step.char, step.index),
+ step.out,
+ );
+ }
+ });
+
+ t.testing("operators get detected as such", () => {
+ const stepFn = tokenizeRegexStep("ab|(cd)*");
+ const steps = [{
+ char: "a",
+ index: 0,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: ["a", { operator: "concat" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "b",
+ index: 1,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: ["b"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "|",
+ index: 2,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [{ operator: "|" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "(",
+ index: 3,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [{ operator: "(" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "c",
+ index: 4,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: ["c", { operator: "concat" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "d",
+ index: 5,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: ["d"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: ")",
+ index: 6,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [{ operator: ")" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ char: "*",
+ index: 7,
+ in: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ out: {
+ out: [{ operator: "*" }],
+ state: ConcatStep.ACCEPTING,
+ },
+ }];
+ for (const step of steps) {
+ t.assertEq(
+ stepFn(step.in, step.char, step.index),
+ step.out,
+ );
+ }
+ });
+};
+
+const test_tokenizeRegexFn = t => {
+ t.start("tokenizeRegexFn");
+
+ t.testing("regex table", () => {
+ const concat = { operator: "concat" };
+ const star = { operator: "*" };
+ const table = [{
+ in: "",
+ expected: {
+ out: [],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ in: "a",
+ expected: {
+ out: ["a"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ in: "a*",
+ expected: {
+ out: ["a", star],
+ state: ConcatStep.ACCEPTING,
+ },
+ }, {
+ in: "a*b",
+ expected: {
+ out: ["a", star, concat, "b"],
+ state: ConcatStep.ACCEPTING,
+ },
+ }];
+ for (const test of table) {
+ t.assertEq(
+ tokenizeRegexFn(explode(test.in)),
+ test.expected,
+ );
+ }
+ });
+};
+
+const test_tokenizeRegex = t => {
+ t.start("tokenizeRegex()");
+
+ t.testing("regex table", () => {
+ const concat = { operator: "concat" };
+ const star = { operator: "*" };
+ const table = [{
+ in: "",
+ expected: []
+ }, {
+ in: "a",
+ expected: [ "a" ],
+ }, {
+ in: "ab",
+ expected: [ "a", concat, "b" ],
+ }, {
+ in: "abc",
+ expected: [ "a", concat, "b", concat, "c" ],
+ }, {
+ in: "a*",
+ expected: [ "a", star],
+ }, {
+ in: "ab*",
+ expected: [ "a", concat, "b", star],
+ }, {
+ in: "a*b*",
+ expected: [ "a", star, concat, "b", star],
+ }, {
+ in: "a|b",
+ expected: [ "a", { operator: "|" }, "b" ],
+ }, {
+ in: "a(bc(d))ef",
+ expected: [
+ "a",
+ concat,
+ { operator: "(" },
+ "b",
+ concat,
+ "c",
+ concat,
+ { operator: "(" },
+ "d",
+ { operator: ")" },
+ { operator: ")" },
+ concat,
+ "e",
+ concat,
+ "f",
+ ],
+ }, {
+ in: "ab|(cd)*",
+ expected: [
+ "a",
+ concat,
+ "b",
+ { operator: "|" },
+ { operator: "(" },
+ "c",
+ concat,
+ "d",
+ { operator: ")" },
+ star,
+ ],
+ }, {
+ in: "(a|b)*c",
+ expected: [
+ { operator: "(" },
+ "a",
+ { operator: "|" },
+ "b",
+ { operator: ")" },
+ star,
+ concat,
+ "c",
+ ],
+ }];
+ for (const test of table) {
+ t.assertEq(
+ tokenizeRegex(explode(test.in)),
+ test.expected,
+ );
+ }
+ });
+};
+
+const test_shouldPush = t => {
+ t.start("shouldPush()");
+
+ t.testing("always push on empty stack", () => {
+ t.assertEq(shouldPush([], "a"), true);
+ t.assertEq(shouldPush([], null), true);
+ });
+
+ t.testing(`push when operator or stack top is "("`, () => {
+ t.assertEq(shouldPush([{}], { operator: "(" }), true);
+ t.assertEq(shouldPush([{ operator: "("}], {}), true);
+ });
+
+ t.testing("don't push otherwise", () => {
+ t.assertEq(shouldPush([{}], {}), false);
+ });
+};
+
+const test_toPostfixStep = t => {
+ t.start("toPostfixStep()");
+
+ t.testing("string literals go directly to out", () => {
+ t.assertEq(
+ toPostfixStep({ out: ["a"], stack: "stack" }, "t"),
+ { out: ["a", "t"], stack: "stack" },
+ );
+ });
+
+ t.testing("parens put things on the stack", () => {
+ t.assertEq(
+ toPostfixStep({
+ out: "out",
+ stack: [{ operator: "(" }],
+ }, { operator: "X" }),
+ { out: "out", stack: [
+ { operator: "X" },
+ { operator: "(" },
+ ]},
+ );
+ });
+
+ // FIXME: more tests
+};
+
+const test_toPostfix = t => {
+ t.start("toPostfix()");
+
+ t.testing("regex table", () => {
+ const concat = { operator: "concat" };
+ const table = [{
+ in: [],
+ expected: []
+ }, {
+ in: [ "a" ],
+ expected: [ "a" ],
+ }, {
+ in: [ "a", concat, "b", concat, "c" ],
+ expected: [ "a", "b", concat, "c", concat ],
+ }, {
+ in: [ "a", { operator: "*" }],
+ expected: [ "a", { operator: "*" }],
+ }, {
+ in: [ "a", { operator: "|" }, "b" ],
+ expected: [ "a", "b", { operator: "|" } ],
+ }, {
+ in: [
+ "a",
+ concat,
+ "b",
+ { operator: "|" },
+ { operator: "(" },
+ "c",
+ concat,
+ "d",
+ { operator: ")" },
+ { operator: "*" },
+ ],
+ expected: [
+ "a",
+ "b",
+ concat,
+ "c",
+ "d",
+ concat,
+ { operator: "*" },
+ { operator: "|" },
+ ],
+ }, {
+ in: [
+ { operator: "(" },
+ "a",
+ { operator: "|" },
+ "b",
+ { operator: ")" },
+ { operator: "*" },
+ concat,
+ "c",
+ ],
+ expected: [
+ "a",
+ "b",
+ { operator: "|" },
+ { operator: "*" },
+ "c",
+ concat,
+ ],
+ }];
+ for (const test of table) {
+ t.assertEq(
+ toPostfix(test.in),
+ test.expected,
+ );
+ }
+ });
+};
+
+const test_emptyNFA = t => {
+ t.start("emptyNFA()");
+
+ t.testing("we deterministically build out of the ID", () => {
+ const expected = {
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [ 2 ],
+ transitions: {},
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(emptyNFA(), expected);
+ });
+};
+
+const test_baseNFA = t => {
+ t.start("baseNFA()");
+
+ t.testing("the given edge connects the nodes", () => {
+ const expected = {
+ start: 123,
+ end: 124,
+ nextID: 125,
+ nodes: {
+ 123: {
+ direct: [],
+ transitions: { X: 124 },
+ },
+ 124: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(baseNFA("X", 123), expected);
+ });
+};
+
+const test_concat = t => {
+ t.start("concat()");
+
+ t.testing("on baseNFA() and on more complex NFAs", () => {
+ const a = baseNFA("a", 1);
+ const b = baseNFA("b", a.nextID);
+ const a_ = baseNFA("a", b.nextID);
+ const expected1 = {
+ start: 1,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 3 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(concat(a, b), expected1);
+
+ const expected2 = {
+ start: 1,
+ end: 6,
+ nextID: 7,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 3 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [ 5 ],
+ transitions: {},
+ },
+ 5: {
+ direct: [],
+ transitions: { a: 6 },
+ },
+ 6: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(concat(concat(a, b), a_), expected2);
+ });
+};
+
+const test_union = t => {
+ t.start("union()");
+
+ t.testing("on baseNFA() and on more complex NFAs", () => {
+ const a = baseNFA("a", 1);
+ const b = baseNFA("b", a.nextID);
+ const expected1 = {
+ start: 5,
+ end: 6,
+ nextID: 7,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 5: {
+ direct: [ 1, 3 ],
+ transitions: {},
+ },
+ 6: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(union(a, b), expected1);
+
+ const c = baseNFA("c", expected1.nextID);
+ const expected2 = {
+ start: 9,
+ end: 10,
+ nextID: 11,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 5: {
+ direct: [ 1, 3 ],
+ transitions: {},
+ },
+ 6: {
+ direct: [ 10 ],
+ transitions: {},
+ },
+ 7: {
+ direct: [],
+ transitions: { c: 8 },
+ },
+ 8: {
+ direct: [ 10 ],
+ transitions: {},
+ },
+ 9: {
+ direct: [ 5, 7 ],
+ transitions: {},
+ },
+ 10: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(union(union(a, b), c), expected2);
+ });
+};
+
+const test_zeroOrMore = t => {
+ t.start("zeroOrMore()");
+
+ t.testing("on baseNFA()", () => {
+ const a = baseNFA("a", 1);
+ const expected = {
+ start: 3,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 1, 4 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [ 1, 4 ],
+ transitions: {},
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(zeroOrMore(a), expected);
+ });
+};
+
+const test_oneOrMore = t => {
+ t.start("oneOrMore()");
+
+ t.testing("on baseNFA()", () => {
+ const a = baseNFA("a", 1);
+ const expected = {
+ start: 1,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 3 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [ 1, 4 ],
+ transitions: {},
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(oneOrMore(a), expected);
+ });
+};
+
+const test_zeroOrOne = t => {
+ t.start("zeroOrOne()");
+
+ t.testing("on baseNFA()", () => {
+ const a = baseNFA("a", 1);
+ const expected = {
+ start: 3,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 4 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [ 1, 4 ],
+ transitions: {},
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(zeroOrOne(a), expected);
+ });
+};
+
+const test_OPERATORS_FNS = t => {
+ t.start("OPERATORS_FNS");
+
+ const arr = [];
+ const fn = ret => (...args) => {
+ arr.push({
+ args,
+ ret,
+ });
+ return ret;
+ };
+ const input = [1, 2, 3];
+
+
+ t.testing("star", () => {
+ arr.splice(0);
+ t.assertEq(
+ OPERATORS_FNS({ zeroOrMoreFn: fn(111) })["*"](input),
+ [1, 2, 111],
+ );
+ t.assertEq(arr, [{ args: [3], ret: 111 }]);
+ });
+
+ t.testing("plus", () => {
+ arr.splice(0);
+ t.assertEq(
+ OPERATORS_FNS({ oneOrMoreFn: fn(222) })["+"](input),
+ [1, 2, 222],
+ );
+ t.assertEq(arr, [{ args: [3], ret: 222 }]);
+ });
+
+ t.testing("question mark", () => {
+ arr.splice(0);
+ t.assertEq(
+ OPERATORS_FNS({ zeroOrOneFn: fn(333) })["?"](input),
+ [1, 2, 333],
+ );
+ t.assertEq(arr, [{ args: [3], ret: 333 }]);
+ });
+
+ t.testing("pipe", () => {
+ arr.splice(0);
+ t.assertEq(
+ OPERATORS_FNS({ unionFn: fn(444) })["|"](input),
+ [1, 444],
+ );
+ t.assertEq(arr, [{ args: [2, 3], ret: 444 }]);
+ });
+
+ t.testing("concat", () => {
+ arr.splice(0);
+ t.assertEq(
+ OPERATORS_FNS({ concatFn: fn(555) })["concat"](input),
+ [1, 555],
+ );
+ t.assertEq(arr, [{ args: [2, 3], ret: 555 }]);
+ });
+};
+
+const test_buildNFAStep = t => {
+ t.start("buildNFAStep()");
+
+ t.testing("literal char without existing nextID", () => {
+ const expected = [{
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { t: 2 },
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+
+ },
+ }];
+ t.assertEq(buildNFAStep([], "t"), expected);
+ });
+
+ t.testing("literal char with existing nextID", () => {
+ const input = baseNFA("a", 1);
+ const expected = [{
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ }, {
+ start: 3,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ }];
+ t.assertEq(buildNFAStep([input], "b"), expected);
+ t.assertEq(
+ buildNFAStep([input], "b"),
+ [ baseNFA("a", 1), baseNFA("b", 3) ]
+ );
+ });
+
+ t.testing("operator", () => {
+ const input = [ baseNFA("a", 1), baseNFA("b", 3) ];
+ const expected = [{
+ start: 1,
+ end: 4,
+ nextID: 5,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 3 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ }];
+ t.assertEq(
+ buildNFAStep(input, { operator: "concat" }),
+ expected,
+ );
+ });
+};
+
+const test_buildNFA = t => {
+ t.start("buildNFA()");
+
+ t.testing("empty regex pattern", () => {
+ const expected = {
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [ 2 ],
+ transitions: {},
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(buildNFA([]), expected);
+ });
+
+ t.testing("sample composed regex", () => {
+ const regex = "(a|b)*cde";
+ const expected = {
+ start: 7,
+ end: 14,
+ nextID: 15,
+ nodes: {
+ 1: {
+ direct: [],
+ transitions: { a: 2 },
+ },
+ 2: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 3: {
+ direct: [],
+ transitions: { b: 4 },
+ },
+ 4: {
+ direct: [ 6 ],
+ transitions: {},
+ },
+ 5: {
+ direct: [ 1, 3 ],
+ transitions: {},
+ },
+ 6: {
+ direct: [ 5, 8 ],
+ transitions: {},
+ },
+ 7: {
+ direct: [ 5, 8 ],
+ transitions: {},
+ },
+ 8: {
+ direct: [ 9 ],
+ transitions: {},
+ },
+ 9: {
+ direct: [],
+ transitions: { c: 10 },
+ },
+ 10: {
+ direct: [ 11 ],
+ transitions: {},
+ },
+ 11: {
+ direct: [],
+ transitions: { d: 12 },
+ },
+ 12: {
+ direct: [ 13 ],
+ transitions: {},
+ },
+ 13: {
+ direct: [],
+ transitions: { e: 14 },
+ },
+ 14: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(
+ buildNFA(toPostfix(tokenizeRegex(explode(regex)))),
+ expected,
+ );
+ });
+};
+
+const test_allDirects = t => {
+ t.start("allDirects()");
+
+ t.testing("empty directs", () => {
+ });
+
+ t.testing("single level", () => {
+ });
+
+ t.testing("deep recursion", () => {
+ });
+
+ t.testing("plenty of duplicates", () => {
+ });
+};
+
+const test_searchNFAStep = t => {
+ t.start("searchNFAStep()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_searchNFAFn = t => {
+ t.start("searchNFAFn()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_searchNFA = t => {
+ t.start("searchNFA()");
+
+ t.testing("on composed sample regex", () => {
+ const regex = "(a|b)*c";
+ const nfa = buildNFA(toPostfix(tokenizeRegex(explode(regex))));
+ t.assertEq(searchNFA(nfa, "c"), true);
+ t.assertEq(searchNFA(nfa, "cc"), false);
+ t.assertEq(searchNFA(nfa, "aaabbbc"), true);
+ });
+};
+
+const test_nodeID = t => {
+ t.start("nodeID()");
+
+ t.testing("single number", () => {
+ t.assertEq(nodeID([1]), "1");
+ });
+
+ t.testing("plenty of duplicates", () => {
+ t.assertEq(nodeID([1, 1, 1, 1, 1, 1]), "1");
+ });
+
+ t.testing("unsorted input", () => {
+ t.assertEq(
+ nodeID([5, 5, 11, 11, 11, 3, 4, 5, 123]),
+ "3-4-5-11-123",
+ );
+ });
+};
+
// Tests for unNodeID(): parsing canonical DFA-state names back into
// numeric ID lists.  Fixed: typo "hyphes" in the first description.
const test_unNodeID = t => {
	t.start("unNodeID()");

	t.testing("no hyphens", () => {
		t.assertEq(unNodeID("123"), [123]);
	});

	t.testing("with hyphens", () => {
		t.assertEq(unNodeID("1-2"), [1, 2]);
		t.assertEq(unNodeID("11-3-0"), [11, 3, 0]);
	});
};
+
+const test_collectTransitions = t => {
+ t.start("collectTransitions()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_computePending = t => {
+ t.start("computePending()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_buildDFANodes = t => {
+ t.start("buildDFANodes()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_buildDFA = t => {
+ t.start("buildDFA()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_searchDFAStep = t => {
+ t.start("searchDFAStep()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_searchDFAFn = t => {
+ t.start("searchDFAFn()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_searchDFA = t => {
+ t.start("searchDFA()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_compileNFA = t => {
+ t.start("compileNFA()");
+
+ t.testing("empty regex", () => {
+ const expected = {
+ start: 1,
+ end: 2,
+ nextID: 3,
+ nodes: {
+ 1: {
+ direct: [ 2 ],
+ transitions: {},
+ },
+ 2: {
+ direct: [],
+ transitions: {},
+ },
+ },
+ };
+ t.assertEq(compileNFA(""), expected);
+ });
+
+ t.testing("composed sample regex", () => {
+ const nfa = compileNFA("(a|b)*c");
+ t.assertEq(searchNFA(nfa, "ababac"), true);
+ t.assertEq(searchNFA(nfa, "babac"), true);
+ t.assertEq(searchNFA(nfa, "babaca"), false);
+ });
+};
+
+const test_compileDFA = t => {
+ t.start("compileDFA()");
+
+ t.testing("FIXME", () => {});
+};
+
+const test_compile = t => {
+ t.start("compile()");
+
+ t.testing("composed sample regex", () => {
+ const matcher = compile("(a|b)*c");
+ t.assertEq(matcher("ababac"), true);
+ t.assertEq(matcher("ababaca"), false);
+ });
+};
+
+
+
+runTests([
+ test_shouldConcat,
+ test_isOperator,
+ test_tokenizeRegexStep,
+ test_tokenizeRegexFn,
+ test_tokenizeRegex,
+ test_shouldPush,
+ test_toPostfixStep,
+ test_toPostfix,
+ test_emptyNFA,
+ test_baseNFA,
+ test_concat,
+ test_union,
+ test_zeroOrMore,
+ test_oneOrMore,
+ test_zeroOrOne,
+ test_OPERATORS_FNS,
+ test_buildNFAStep,
+ test_buildNFA,
+ test_allDirects,
+ test_searchNFAStep,
+ test_searchNFAFn,
+ test_searchNFA,
+ test_nodeID,
+ test_unNodeID,
+ test_collectTransitions,
+ test_computePending,
+ test_buildDFANodes,
+ test_buildDFA,
+ test_searchDFAStep,
+ test_searchDFAFn,
+ test_searchDFA,
+ test_compileNFA,
+ test_compileDFA,
+ test_compile,
+]);