diff options
Diffstat (limited to 'compiler/parser.go')
-rw-r--r-- | compiler/parser.go | 608 |
1 files changed, 17 insertions, 591 deletions
diff --git a/compiler/parser.go b/compiler/parser.go index 89c8301..ce481e3 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -10,6 +10,7 @@ import ( "github.com/nihei9/maleeni/spec" "github.com/nihei9/maleeni/ucd" + "github.com/nihei9/maleeni/utf8" ) type ParseErrors struct { @@ -725,124 +726,30 @@ func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode { return genAltNode(concats...) } -// Refelences: -// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404 -// * Table 3-6. UTF-8 Bit Distribution -// * Table 3-7. Well-Formed UTF-8 Byte Sequences func genAnyCharAST() astNode { - return genAltNode( - // 1 byte character <00..7F> - newRangeSymbolNode(0x00, 0x7f), - // 2 bytes character <C2..DF 80..BF> - genConcatNode( - newRangeSymbolNode(0xc2, 0xdf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 3 bytes character <E0 A0..BF 80..BF> - genConcatNode( - newSymbolNode(0xe0), - newRangeSymbolNode(0xa0, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 3 bytes character <E1..EC 80..BF 80..BF> - genConcatNode( - newRangeSymbolNode(0xe1, 0xec), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 3 bytes character <ED 80..9F 80..BF> - genConcatNode( - newSymbolNode(0xed), - newRangeSymbolNode(0x80, 0x9f), - newRangeSymbolNode(0x80, 0xbf), - ), - // 3 bytes character <EE..EF 80..BF 80..BF> - genConcatNode( - newRangeSymbolNode(0xee, 0xef), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 4 bytes character <F0 90..BF 80..BF 80..BF> - genConcatNode( - newSymbolNode(0xf0), - newRangeSymbolNode(0x90, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 4 bytes character <F1..F3 80..BF 80..BF 80..BF> - genConcatNode( - newRangeSymbolNode(0xf1, 0xf3), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - // 4 bytes character <F4 80..8F 80..BF 80..BF> - genConcatNode( - newSymbolNode(0xf4), - newRangeSymbolNode(0x80, 0x8f), - newRangeSymbolNode(0x80, 0xbf), - newRangeSymbolNode(0x80, 0xbf), - ), - ) + return convertCharBlocksToAST(utf8.AllCharBlocks()) } func genRangeAST(fromNode, toNode astNode) astNode { from := genByteSeq(fromNode) to := genByteSeq(toNode) - switch len(from) { - case 1: - switch len(to) { - case 1: - return gen1ByteCharRangeAST(from, to) - case 2: - return genAltNode( - gen1ByteCharRangeAST(from, []byte{0x7f}), - gen2ByteCharRangeAST([]byte{0xc2, 0x80}, to), - ) - case 3: - return genAltNode( - gen1ByteCharRangeAST(from, []byte{0x7f}), - gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}), - gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, to), - ) - case 4: - return genAltNode( - gen1ByteCharRangeAST(from, []byte{0x7f}), - gen2ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xdf, 0xbf}), - gen3ByteCharRangeAST([]byte{0xe0, 0xa0, 0x80}, []byte{0xef, 0xbf, 0xbf}), - gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), - ) - } - case 2: - switch len(to) { - case 2: - return gen2ByteCharRangeAST(from, to) - case 3: - return genAltNode( - gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}), - gen3ByteCharRangeAST([]byte{0xc2, 0x80}, to), - ) - case 4: - return genAltNode( - gen2ByteCharRangeAST(from, []byte{0xdf, 0xbf}), - gen3ByteCharRangeAST([]byte{0xc2, 0x80}, []byte{0xef, 0xbf, 0xbf}), - gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), - ) - } - case 3: - switch len(to) { - case 3: - return gen3ByteCharRangeAST(from, to) - case 4: - return genAltNode( - gen3ByteCharRangeAST(from, []byte{0xef, 0xbf, 0xbf}), - gen4ByteCharRangeAST([]byte{0xf0, 0x90, 0x80}, to), - ) + blks, err := utf8.GenCharBlocks(from, to) + if err != nil { + panic(err) + } + return convertCharBlocksToAST(blks) +} + +func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode { + var alt astNode + for _, blk := range blks { + r := make([]astNode, len(blk.From)) + for i := 0; i < len(blk.From); i++ { + r[i] = newRangeSymbolNode(blk.From[i], blk.To[i]) } - case 4: - return gen4ByteCharRangeAST(from, to) + alt = genAltNode(alt, genConcatNode(r...)) } - panic(fmt.Errorf("invalid range; from: %v, to: %v", from, to)) + return alt } func genByteSeq(node astNode) []byte { @@ -888,487 +795,6 @@ func isValidOrder(from, to []byte) bool { return true } -type byteBoundsEntry struct { - min byte - max byte -} - -var ( - bounds1 = [][]byteBoundsEntry{ - nil, - {{min: 0x00, max: 0x7f}}, - } - - bounds2 = [][]byteBoundsEntry{ - nil, - {{min: 0xc2, max: 0xdf}, {min: 0x80, max: 0xbf}}, - } - - bounds3 = [][]byteBoundsEntry{ - nil, - {{min: 0xe0, max: 0xe0}, {min: 0xa0, max: 0xbf}, {min: 0x80, max: 0xbf}}, - {{min: 0xe1, max: 0xec}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, - {{min: 0xed, max: 0xed}, {min: 0x80, max: 0x9f}, {min: 0x80, max: 0xbf}}, - {{min: 0xee, max: 0xef}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, - } - - bounds4 = [][]byteBoundsEntry{ - nil, - {{min: 0xf0, max: 0xf0}, {min: 0x90, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, - {{min: 0xf1, max: 0xf3}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, - {{min: 0xf4, max: 0xf4}, {min: 0x80, max: 0x8f}, {min: 0x80, max: 0xbf}, {min: 0x80, max: 0xbf}}, - } -) - -func gen1ByteCharRangeAST(from, to []byte) astNode { - return newRangeSymbolNode(from[0], to[0]) -} - -func gen2ByteCharRangeAST(from, to []byte) astNode { - from0 := from[0] - from1 := from[1] - to0 := to[0] - to1 := to[1] - switch { - case from0 == to0 && from1 == to1: - return genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - ) - case from0 == to0: - return genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1, to1), - ) - default: - alt1 := genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1, 0xbf), - ) - alt2 := genConcatNode( - newRangeSymbolNode(from0+1, to0), - newRangeSymbolNode(0x80, to1), - ) - return newAltNode(alt1, alt2) - } -} - -func get3ByteCharRangeNum(seq []byte) int { - head := seq[0] - switch { - case head == 0xe0: - return 1 - case head >= 0xe1 && head <= 0xec: - return 2 - case head == 0xed: - return 3 - case head >= 0xee && head <= 0xef: - return 4 - } - return 0 -} - -func get4ByteCharRangeNum(seq []byte) int { - head := seq[0] - switch { - case head == 0xf0: - return 1 - case head >= 0xf1 && head <= 0xf3: - return 2 - case head == 0xf4: - return 3 - } - return 0 -} - -func gen3ByteCharRangeAST(from, to []byte) astNode { - from0 := from[0] - from1 := from[1] - from2 := from[2] - to0 := to[0] - to1 := to[1] - to2 := to[2] - switch { - case from0 == to0 && from1 == to1 && from2 == to2: - return genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - ) - case from0 == to0 && from1 == to1: - return genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2, to2), - ) - case from0 == to0: - rangeNum := get3ByteCharRangeNum(from) - bounds := bounds3[rangeNum] - var alt astNode - alt = genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2, bounds[2].max), - ) - if from1+1 < to1 { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1+1, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2), - ), - ) - return alt - default: - fromRangeNum := get3ByteCharRangeNum(from) - toRangeNum := get3ByteCharRangeNum(to) - bounds := bounds3[fromRangeNum] - var alt astNode - alt = genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2, bounds[2].max), - ) - if from1 < bounds[1].max { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1+1, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - if fromRangeNum == toRangeNum { - if from0+1 < to0 { - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(from0+1, to0-1), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - if to1 > bounds[1].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newRangeSymbolNode(bounds[1].min, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2), - ), - ) - return alt - } - for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ { - bounds := bounds3[rangeNum] - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(bounds[0].min, bounds[0].max), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - bounds = bounds3[toRangeNum] - if to0 > bounds[0].min { - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(bounds[0].min, to0-1), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - if to1 > bounds[1].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newRangeSymbolNode(bounds[1].min, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2), - ), - ) - return alt - } -} - -func gen4ByteCharRangeAST(from, to []byte) astNode { - from0 := from[0] - from1 := from[1] - from2 := from[2] - from3 := from[3] - to0 := to[0] - to1 := to[1] - to2 := to[2] - to3 := to[3] - switch { - case from0 == to0 && from1 == to1 && from2 == to2 && from3 == to3: - return genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - newSymbolNode(from3), - ) - case from0 == to0 && from1 == to1 && from2 == to2: - return genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - newRangeSymbolNode(from3, to3), - ) - case from0 == to0 && from1 == to1: - rangeNum := get4ByteCharRangeNum(from) - bounds := bounds4[rangeNum] - var alt astNode - alt = genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - newRangeSymbolNode(from3, bounds[3].max), - ) - if from2+1 < to2 { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2+1, to2-1), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(to2), - newRangeSymbolNode(bounds[3].min, to3), - ), - ) - return alt - case from0 == to0: - rangeNum := get4ByteCharRangeNum(from) - bounds := bounds4[rangeNum] - var alt astNode - alt = genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - newRangeSymbolNode(from3, bounds[3].max), - ) - if from2 < bounds[2].max { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2+1, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if from1+1 < to1 { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1+1, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if to2 > bounds[2].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2-1), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(to1), - newSymbolNode(to2), - newRangeSymbolNode(bounds[3].min, to3), - ), - ) - return alt - default: - fromRangeNum := get4ByteCharRangeNum(from) - toRangeNum := get4ByteCharRangeNum(to) - bounds := bounds4[fromRangeNum] - var alt astNode - alt = genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newSymbolNode(from2), - newRangeSymbolNode(from3, bounds[3].max), - ) - if from2 < bounds[2].max { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newSymbolNode(from1), - newRangeSymbolNode(from2+1, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if from1 < bounds[1].max { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(from0), - newRangeSymbolNode(from1+1, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if fromRangeNum == toRangeNum { - if from0+1 < to0 { - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(from0+1, to0-1), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if to1 > bounds[1].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newRangeSymbolNode(bounds[1].min, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if to2 > bounds[2].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2-1), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newSymbolNode(to2), - newRangeSymbolNode(bounds[3].min, to3), - ), - ) - return alt - } - for rangeNum := fromRangeNum + 1; rangeNum < toRangeNum; rangeNum++ { - bounds := bounds4[rangeNum] - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(bounds[0].min, bounds[0].max), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - bounds = bounds4[toRangeNum] - if to0 > bounds[0].min { - alt = genAltNode( - alt, - genConcatNode( - newRangeSymbolNode(bounds[0].min, to0-1), - newRangeSymbolNode(bounds[1].min, bounds[1].max), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if to1 > bounds[1].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newRangeSymbolNode(bounds[1].min, to1-1), - newRangeSymbolNode(bounds[2].min, bounds[2].max), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - if to2 > bounds[2].min { - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newRangeSymbolNode(bounds[2].min, to2-1), - newRangeSymbolNode(bounds[3].min, bounds[3].max), - ), - ) - } - alt = genAltNode( - alt, - genConcatNode( - newSymbolNode(to0), - newSymbolNode(to1), - newSymbolNode(to2), - newRangeSymbolNode(bounds[3].min, to3), - ), - ) - return alt - } -} - func genConcatNode(cs ...astNode) astNode { if len(cs) <= 0 { return nil |