From c4f360baf07107468667832be66e5e9c3f92d4b6 Mon Sep 17 00:00:00 2001 From: EuAndreh Date: Tue, 15 Jul 2025 13:55:36 -0300 Subject: Support tokenizing character class expressions [a-z] * src/paca.mjs (classStateStep): New function equivalent to `rangeStateStep()` for character class expressions. For now it knows how to handle escaping ([abc\-_]), simple ranges ([a-z]), negation ([^abc]) and the hyphen literal as the first char ([-a-z_]). * tests.paca.mjs (test_classStateStep): New test entry has a test case for each scenario described above. --- src/paca.mjs | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 3 deletions(-) (limited to 'src/paca.mjs') diff --git a/src/paca.mjs b/src/paca.mjs index 0b64a87..b92bcdc 100644 --- a/src/paca.mjs +++ b/src/paca.mjs @@ -1,5 +1,6 @@ import { - butlast, explode, isNumeric, last, mapValues, max, reduce, reduced, + butlast, dissoc, explode, isNumeric, last, mapValues, max, reduce, + reduced, } from "sjs"; @@ -11,6 +12,7 @@ const ConcatStep = { ACCEPTING: "accepting", ESCAPING: "escaping", RANGE: "range", + CLASS: "class", }; const numFromDigits = digits => @@ -107,9 +109,130 @@ const rangeStateStep = ({ out, state, context }, char, _index, _next) => { }; }; +const classStateStep = ({ out, state, context }, char, _index, _next) => { + if (context.escaping) { + return { + out, + state, + context: dissoc({ + ...context, + set: context.set.concat(char), + }, "escaping"), + }; + } + + if (char === "]") { + if (context.range.where === "to") { + return reduced({ + out, + state, + context, + error: new SyntaxError( + "unfinished character class range", + ), + }); + } + + if (context.set.length === 0) { + return reduced({ + out, + state, + context, + error: new ValueError("empty character class"), + }); + } + + return { + out: out.concat({ + operator: "class", + set: context.set, + }), + state: ConcatStep.ACCEPTING, + context: null, + }; + } + + if (char === "\\") { + return { + out, + 
state, + context: { + ...context, + escaping: true, + }, + }; + } + + if (context.range.where === "to") { + const from = context.range.from; + const to = char; + + if (from.charCodeAt(0) > to.charCodeAt(0)) { + return reduced({ + out, + state, + context, + error: new ValueError( + "bad class range values: " + + `[${from}-${to}]`, + ), + }); + } + + return { + out, + state, + context: { + ...context, + set: context.set.concat({ from, to }), + range: { + from: null, + where: "from", + }, + }, + }; + } + + if (char === "-" && context.set.length !== 0) { + return { + out, + state, + context: { + ...context, + set: butlast(context.set), + range: { + from: last(context.set), + where: "to", + }, + }, + }; + } + + if (char === "^" && context.set.length === 0) { + return { + out, + state, + context: { + ...context, + caret: true, + }, + }; + } + + return { + out, + state, + context: { + ...context, + set: context.set.concat(char), + }, + }; +}; + const STATE_FNS = { [ConcatStep.ESCAPING]: escapingStateStep, [ConcatStep.RANGE ]: rangeStateStep, + [ConcatStep.CLASS ]: classStateStep, }; const TRANSITION_FNS = { @@ -133,8 +256,7 @@ const TRANSITION_FNS = { context: { set: [], range: { - from: [], - to: [], + from: null, where: "from", }, }, -- cgit v1.2.3