package tre

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"ucd"
)

type CharBlock struct {
	From []byte
	To   []byte
}

type cpRange struct {
	from rune
	to   rune
}

// "github.com/nihei9/maleeni/spec"

func (b *CharBlock) String() string {
	var s strings.Builder
	fmt.Fprint(&s, "<")
	fmt.Fprintf(&s, "%X", b.From[0])
	for i := 1; i < len(b.From); i++ {
		fmt.Fprintf(&s, " %X", b.From[i])
	}
	fmt.Fprint(&s, "..")
	fmt.Fprintf(&s, "%X", b.To[0])
	for i := 1; i < len(b.To); i++ {
		fmt.Fprintf(&s, " %X", b.To[i])
	}
	fmt.Fprint(&s, ">")
	return s.String()
}

func GenCharBlocks(from, to rune) ([]*CharBlock, error) {
	rs, err := splitCodePoint(from, to)
	if err != nil {
		return nil, err
	}
	blks := make([]*CharBlock, len(rs))
	for i, r := range rs {
		blks[i] = &CharBlock{
			From: []byte(string(r.from)),
			To:   []byte(string(r.to)),
		}
	}
	return blks, nil
}

// splitCodePoint splits a code point range represented by <from..to> into
// some blocks. The code points that each block contains form a continuous
// byte sequence when encoded into UTF-8. For instance, this function splits
// <U+0000..U+07FF> into <U+0000..U+007F> and <U+0080..U+07FF> because
// <U+0000..U+07FF> is continuous on the code points but non-continuous in the
// UTF-8 byte sequence (in UTF-8, <U+0000..U+007F> is encoded <00..7F>, and
// <U+0080..U+07FF> is encoded <C2 80..DF BF>).
//
// The blocks don't contain the surrogate code points <U+D800..U+DFFF> because
// byte sequences encoding them are ill-formed in UTF-8. For instance,
// <U+D000..U+FFFF> is split into <U+D000..U+D7FF> and <U+E000..U+FFFF>.
// However, when `from` or `to` itself is a surrogate code point, this
// function returns an error.
func splitCodePoint(from, to rune) ([]*cpRange, error) {
	if from > to {
		return nil, fmt.Errorf(
			"code point range must be from <= to: U+%X..U+%X",
			from, to,
		)
	}
	if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff {
		return nil, fmt.Errorf(
			"code point must be >=U+0000 and <=U+10FFFF: U+%X..U+%X",
			from, to,
		)
	}
	// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
	// > 3.9 Unicode Encoding Forms
	// > UTF-8 D92
	// > Because surrogate code points are not Unicode scalar values,
	// > any UTF-8 byte sequence that would otherwise
	// > map to code points U+D800..U+DFFF is ill-formed.
	if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff {
		return nil, fmt.Errorf(
			"surrogate code points U+D800..U+DFFF "+
				"are not allowed in UTF-8: U+%X..U+%X",
			from, to,
		)
	}
	in := &cpRange{
		from: from,
		to:   to,
	}
	var rs []*cpRange
	for in.from <= in.to {
		r := &cpRange{
			from: in.from,
			to:   in.to,
		}
		// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf
		// > 3.9 Unicode Encoding Forms
		// > UTF-8 Table 3-7.
		// > Well-Formed UTF-8 Byte Sequences
		switch {
		case in.from <= 0x007f && in.to > 0x007f:
			r.to = 0x007f
		case in.from <= 0x07ff && in.to > 0x07ff:
			r.to = 0x07ff
		case in.from <= 0x0fff && in.to > 0x0fff:
			r.to = 0x0fff
		case in.from <= 0xcfff && in.to > 0xcfff:
			r.to = 0xcfff
		case in.from <= 0xd7ff && in.to > 0xd7ff:
			r.to = 0xd7ff
		case in.from <= 0xffff && in.to > 0xffff:
			r.to = 0xffff
		case in.from <= 0x3ffff && in.to > 0x3ffff:
			r.to = 0x3ffff
		case in.from <= 0xfffff && in.to > 0xfffff:
			r.to = 0xfffff
		}
		rs = append(rs, r)
		in.from = r.to + 1
		// Skip surrogate code points U+D800..U+DFFF.
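		// If the low end of the remaining range now falls inside the
		// surrogate block, jump it to U+E000 so the next emitted block
		// starts at the first scalar value after the gap.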
if in.from >= 0xd800 && in.from <= 0xdfff { in.from = 0xe000 } } return rs, nil } type OriginalTable struct { entries []int rowCount int colCount int } func NewOriginalTable(entries []int, colCount int) (*OriginalTable, error) { if len(entries) == 0 { return nil, fmt.Errorf("enries is empty") } if colCount <= 0 { return nil, fmt.Errorf("colCount must be >=1") } if len(entries)%colCount != 0 { return nil, fmt.Errorf("entries length or column count are incorrect; entries length: %v, column count: %v", len(entries), colCount) } return &OriginalTable{ entries: entries, rowCount: len(entries) / colCount, colCount: colCount, }, nil } type Compressor interface { Compress(orig *OriginalTable) error Lookup(row, col int) (int, error) OriginalTableSize() (int, int) } var ( _ Compressor = &CompressorUniqueEntriesTable{} _ Compressor = &CompressorRowDisplacementTable{} ) type CompressorUniqueEntriesTable struct { UniqueEntries []int RowNums []int OriginalRowCount int OriginalColCount int } func NewCompressorUniqueEntriesTable() *CompressorUniqueEntriesTable { return &CompressorUniqueEntriesTable{} } func (tab *CompressorUniqueEntriesTable) Lookup(row, col int) (int, error) { if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount { return 0, fmt.Errorf("indexes are out of range: [%v, %v]", row, col) } return tab.UniqueEntries[tab.RowNums[row]*tab.OriginalColCount+col], nil } func (tab *CompressorUniqueEntriesTable) OriginalTableSize() (int, int) { return tab.OriginalRowCount, tab.OriginalColCount } func (tab *CompressorUniqueEntriesTable) Compress(orig *OriginalTable) error { var uniqueEntries []int rowNums := make([]int, orig.rowCount) hash2RowNum := map[string]int{} nextRowNum := 0 for row := 0; row < orig.rowCount; row++ { var rowHash string { buf := make([]byte, 0, orig.colCount*8) for col := 0; col < orig.colCount; col++ { b := make([]byte, 8) binary.PutUvarint(b, uint64(orig.entries[row*orig.colCount+col])) buf = append(buf, b...) } rowHash = string(buf) } rowNum, ok := hash2RowNum[rowHash] if !ok { rowNum = nextRowNum nextRowNum++ hash2RowNum[rowHash] = rowNum start := row * orig.colCount entry := append([]int{}, orig.entries[start:start+orig.colCount]...) uniqueEntries = append(uniqueEntries, entry...) 
} rowNums[row] = rowNum } tab.UniqueEntries = uniqueEntries tab.RowNums = rowNums tab.OriginalRowCount = orig.rowCount tab.OriginalColCount = orig.colCount return nil } const ForbiddenValue = -1 type CompressorRowDisplacementTable struct { OriginalRowCount int OriginalColCount int EmptyValue int Entries []int Bounds []int RowDisplacement []int } func NewCompressorRowDisplacementTable(emptyValue int) *CompressorRowDisplacementTable { return &CompressorRowDisplacementTable{ EmptyValue: emptyValue, } } func (tab *CompressorRowDisplacementTable) Lookup(row int, col int) (int, error) { if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount { return tab.EmptyValue, fmt.Errorf("indexes are out of range: [%v, %v]", row, col) } d := tab.RowDisplacement[row] if tab.Bounds[d+col] != row { return tab.EmptyValue, nil } return tab.Entries[d+col], nil } func (tab *CompressorRowDisplacementTable) OriginalTableSize() (int, int) { return tab.OriginalRowCount, tab.OriginalColCount } type rowInfo struct { rowNum int nonEmptyCount int nonEmptyCol []int } func (tab *CompressorRowDisplacementTable) Compress(orig *OriginalTable) error { rowInfo := make([]rowInfo, orig.rowCount) { row := 0 col := 0 rowInfo[0].rowNum = 0 for _, v := range orig.entries { if col == orig.colCount { row++ col = 0 rowInfo[row].rowNum = row } if v != tab.EmptyValue { rowInfo[row].nonEmptyCount++ rowInfo[row].nonEmptyCol = append(rowInfo[row].nonEmptyCol, col) } col++ } sort.SliceStable(rowInfo, func(i int, j int) bool { return rowInfo[i].nonEmptyCount > rowInfo[j].nonEmptyCount }) } origEntriesLen := len(orig.entries) entries := make([]int, origEntriesLen) bounds := make([]int, origEntriesLen) resultBottom := orig.colCount rowDisplacement := make([]int, orig.rowCount) { for i := 0; i < origEntriesLen; i++ { entries[i] = tab.EmptyValue bounds[i] = ForbiddenValue } nextRowDisplacement := 0 for _, rInfo := range rowInfo { if rInfo.nonEmptyCount <= 0 { continue } for { isOverlapped := false for _, col := range rInfo.nonEmptyCol { if entries[nextRowDisplacement+col] == tab.EmptyValue { continue } nextRowDisplacement++ isOverlapped = true break } if isOverlapped { continue } rowDisplacement[rInfo.rowNum] = nextRowDisplacement for _, col := range rInfo.nonEmptyCol { entries[nextRowDisplacement+col] = orig.entries[(rInfo.rowNum*orig.colCount)+col] bounds[nextRowDisplacement+col] = rInfo.rowNum } resultBottom = nextRowDisplacement + orig.colCount nextRowDisplacement++ break } } } tab.OriginalRowCount = orig.rowCount tab.OriginalColCount = orig.colCount tab.Entries = entries[:resultBottom] tab.Bounds = bounds[:resultBottom] tab.RowDisplacement = rowDisplacement return nil } func Main() { } var rep = strings.NewReplacer( `.`, `\.`, `*`, `\*`, `+`, `\+`, `?`, `\?`, `|`, `\|`, `(`, `\(`, `)`, `\)`, `[`, `\[`, `\`, `\\`, ) // EscapePattern escapes the special characters. // For example, EscapePattern(`+`) returns `\+`. func EscapePattern(s string) string { return rep.Replace(s) } // LexKindID represents an ID of a lexical kind and is unique across all modes. type LexKindID int const ( LexKindIDNil = LexKindID(0) LexKindIDMin = LexKindID(1) ) func (id LexKindID) Int() int { return int(id) } // LexModeKindID represents an ID of a lexical kind and is unique within a mode. // Use LexKindID to identify a kind across all modes uniquely. 
type LexModeKindID int const ( LexModeKindIDNil = LexModeKindID(0) LexModeKindIDMin = LexModeKindID(1) ) func (id LexModeKindID) Int() int { return int(id) } // LexKindName represents a name of a lexical kind. type LexKindName string const LexKindNameNil = LexKindName("") func (k LexKindName) String() string { return string(k) } func (k LexKindName) validate() error { err := validateIdentifier(k.String()) if err != nil { return fmt.Errorf("invalid kind name: %v", err) } return nil } // LexPattern represents a pattern of a lexeme. // The pattern is written in regular expression. type LexPattern string func (p LexPattern) validate() error { if p == "" { return fmt.Errorf("pattern doesn't allow to be the empty string") } return nil } // LexModeID represents an ID of a lex mode. type LexModeID int const ( LexModeIDNil = LexModeID(0) LexModeIDDefault = LexModeID(1) ) func (n LexModeID) String() string { return strconv.Itoa(int(n)) } func (n LexModeID) Int() int { return int(n) } func (n LexModeID) IsNil() bool { return n == LexModeIDNil } // LexModeName represents a name of a lex mode. type LexModeName string const ( LexModeNameNil = LexModeName("") LexModeNameDefault = LexModeName("default") ) func (m LexModeName) String() string { return string(m) } func (m LexModeName) validate() error { err := validateIdentifier(m.String()) if err != nil { return fmt.Errorf("invalid mode name: %v", err) } return nil } const idPattern = `^[a-z](_?[0-9a-z]+)*$` var idRE = regexp.MustCompile(idPattern) func validateIdentifier(id string) error { if id == "" { return fmt.Errorf("identifier doesn't allow to be the empty string") } if !idRE.MatchString(id) { return fmt.Errorf("identifier must be %v", idPattern) } return nil } func SnakeCaseToUpperCamelCase(snake string) string { elems := strings.Split(snake, "_") for i, e := range elems { if len(e) == 0 { continue } elems[i] = strings.ToUpper(string(e[0])) + e[1:] } return strings.Join(elems, "") } type LexEntry struct { Kind LexKindName `json:"kind"` Pattern LexPattern `json:"pattern"` Modes []LexModeName `json:"modes"` Push LexModeName `json:"push"` Pop bool `json:"pop"` Fragment bool `json:"fragment"` } func (e *LexEntry) validate() error { err := e.Kind.validate() if err != nil { return err } err = e.Pattern.validate() if err != nil { return err } if len(e.Modes) > 0 { for _, mode := range e.Modes { err = mode.validate() if err != nil { return err } } } return nil } type LexSpec struct { Name string `json:"name"` Entries []*LexEntry `json:"entries"` } func (s *LexSpec) Validate() error { err := validateIdentifier(s.Name) if err != nil { return fmt.Errorf("invalid specification name: %v", err) } if len(s.Entries) <= 0 { return fmt.Errorf("the lexical specification must have at least one entry") } { var errs []error for i, e := range s.Entries { err := e.validate() if err != nil { errs = append(errs, fmt.Errorf("entry #%v: %w", i+1, err)) } } if len(errs) > 0 { var b strings.Builder fmt.Fprintf(&b, "%v", errs[0]) for _, err := range errs[1:] { fmt.Fprintf(&b, "\n%v", err) } return fmt.Errorf(b.String()) } } { ks := map[string]struct{}{} fks := map[string]struct{}{} for _, e := range s.Entries { // Allow duplicate names between fragments and non-fragments. 
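// A fragment kind and a non-fragment kind may therefore share a name; duplication is rejected only within each group (fks tracks fragment kinds, ks tracks ordinary kinds).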
if e.Fragment { if _, exist := fks[e.Kind.String()]; exist { return fmt.Errorf("kinds `%v` are duplicates", e.Kind) } fks[e.Kind.String()] = struct{}{} } else { if _, exist := ks[e.Kind.String()]; exist { return fmt.Errorf("kinds `%v` are duplicates", e.Kind) } ks[e.Kind.String()] = struct{}{} } } } { kinds := []string{} modes := []string{ LexModeNameDefault.String(), // This is a predefined mode. } for _, e := range s.Entries { if e.Fragment { continue } kinds = append(kinds, e.Kind.String()) for _, m := range e.Modes { modes = append(modes, m.String()) } } kindErrs := findSpellingInconsistenciesErrors(kinds, nil) modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(LexModeNameDefault.String()) { var b strings.Builder fmt.Fprintf(&b, "%+v", ids[0]) for _, id := range ids[1:] { fmt.Fprintf(&b, ", %+v", id) } return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", LexModeNameDefault, b.String()) } return nil }) errs := append(kindErrs, modeErrs...) if len(errs) > 0 { var b strings.Builder fmt.Fprintf(&b, "%v", errs[0]) for _, err := range errs[1:] { fmt.Fprintf(&b, "\n%v", err) } return fmt.Errorf(b.String()) } } return nil } func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { duplicated := FindSpellingInconsistencies(ids) if len(duplicated) == 0 { return nil } var errs []error for _, dup := range duplicated { if hook != nil { err := hook(dup) if err != nil { errs = append(errs, err) continue } } var b strings.Builder fmt.Fprintf(&b, "%+v", dup[0]) for _, id := range dup[1:] { fmt.Fprintf(&b, ", %+v", id) } err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) errs = append(errs, err) } return errs } // FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same // if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same // in UpperCamelCase. Thus they are considere to be spelling inconsistency. func FindSpellingInconsistencies(ids []string) [][]string { m := map[string][]string{} for _, id := range removeDuplicates(ids) { c := SnakeCaseToUpperCamelCase(id) m[c] = append(m[c], id) } var duplicated [][]string for _, camels := range m { if len(camels) == 1 { continue } duplicated = append(duplicated, camels) } for _, dup := range duplicated { sort.Slice(dup, func(i, j int) bool { return dup[i] < dup[j] }) } sort.Slice(duplicated, func(i, j int) bool { return duplicated[i][0] < duplicated[j][0] }) return duplicated } func removeDuplicates(s []string) []string { m := map[string]struct{}{} for _, v := range s { m[v] = struct{}{} } var unique []string for v := range m { unique = append(unique, v) } return unique } // StateID represents an ID of a state of a transition table. type StateID int const ( // StateIDNil represents an empty entry of a transition table. // When the driver reads this value, it raises an error meaning lexical analysis failed. StateIDNil = StateID(0) // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as // sequential numbers starting from this value. 
StateIDMin = StateID(1) ) func (id StateID) Int() int { return int(id) } type SpecRowDisplacementTable struct { OriginalRowCount int `json:"original_row_count"` OriginalColCount int `json:"original_col_count"` EmptyValue StateID `json:"empty_value"` Entries []StateID `json:"entries"` Bounds []int `json:"bounds"` RowDisplacement []int `json:"row_displacement"` } type SpecUniqueEntriesTable struct { UniqueEntries *SpecRowDisplacementTable `json:"unique_entries,omitempty"` UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"` RowNums []int `json:"row_nums"` OriginalRowCount int `json:"original_row_count"` OriginalColCount int `json:"original_col_count"` EmptyValue int `json:"empty_value"` } type TransitionTable struct { InitialStateID StateID `json:"initial_state_id"` AcceptingStates []LexModeKindID `json:"accepting_states"` RowCount int `json:"row_count"` ColCount int `json:"col_count"` Transition *SpecUniqueEntriesTable `json:"transition,omitempty"` UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"` } type CompiledLexModeSpec struct { KindNames []LexKindName `json:"kind_names"` Push []LexModeID `json:"push"` Pop []int `json:"pop"` DFA *TransitionTable `json:"dfa"` } type CompiledLexSpec struct { Name string `json:"name"` InitialModeID LexModeID `json:"initial_mode_id"` ModeNames []LexModeName `json:"mode_names"` KindNames []LexKindName `json:"kind_names"` KindIDs [][]LexKindID `json:"kind_ids"` CompressionLevel int `json:"compression_level"` Specs []*CompiledLexModeSpec `json:"specs"` } var ( ParseErr = fmt.Errorf("parse error") // lexical errors synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") // syntax errors synErrUnexpectedToken = fmt.Errorf("unexpected token") synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") synErrRangeInvalidForm = fmt.Errorf("invalid range expression") synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") synErrCharPropUnsupported = fmt.Errorf("unsupported character property") synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") ) type 
incompleteFragment struct { kind LexKindName root *rootNode } func CompleteFragments(fragments map[LexKindName]CPTree) error { if len(fragments) == 0 { return nil } completeFragments := map[LexKindName]CPTree{} incompleteFragments := []*incompleteFragment{} for kind, tree := range fragments { root, ok := tree.(*rootNode) if !ok { return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) } if root.incomplete() { incompleteFragments = append(incompleteFragments, &incompleteFragment{ kind: kind, root: root, }) } else { completeFragments[kind] = root } } for len(incompleteFragments) > 0 { lastIncompCount := len(incompleteFragments) remainingFragments := []*incompleteFragment{} for _, e := range incompleteFragments { complete, err := ApplyFragments(e.root, completeFragments) if err != nil { return err } if !complete { remainingFragments = append(remainingFragments, e) } else { completeFragments[e.kind] = e.root } } incompleteFragments = remainingFragments if len(incompleteFragments) == lastIncompCount { return ParseErr } } return nil } func ApplyFragments(t CPTree, fragments map[LexKindName]CPTree) (bool, error) { root, ok := t.(*rootNode) if !ok { return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) } for name, frag := range fragments { err := root.applyFragment(name, frag) if err != nil { return false, err } } return !root.incomplete(), nil } type tokenKind string const ( tokenKindChar tokenKind = "char" tokenKindAnyChar tokenKind = "." tokenKindRepeat tokenKind = "*" tokenKindRepeatOneOrMore tokenKind = "+" tokenKindOption tokenKind = "?" tokenKindAlt tokenKind = "|" tokenKindGroupOpen tokenKind = "(" tokenKindGroupClose tokenKind = ")" tokenKindBExpOpen tokenKind = "[" tokenKindInverseBExpOpen tokenKind = "[^" tokenKindBExpClose tokenKind = "]" tokenKindCharRange tokenKind = "-" tokenKindCodePointLeader tokenKind = "\\u" tokenKindCharPropLeader tokenKind = "\\p" tokenKindFragmentLeader tokenKind = "\\f" tokenKindLBrace tokenKind = "{" tokenKindRBrace tokenKind = "}" tokenKindEqual tokenKind = "=" tokenKindCodePoint tokenKind = "code point" tokenKindCharPropSymbol tokenKind = "character property symbol" tokenKindFragmentSymbol tokenKind = "fragment symbol" tokenKindEOF tokenKind = "eof" ) type token struct { kind tokenKind char rune propSymbol string codePoint string fragmentSymbol string } const nullChar = '\u0000' func newToken(kind tokenKind, char rune) *token { return &token{ kind: kind, char: char, } } func newCodePointToken(codePoint string) *token { return &token{ kind: tokenKindCodePoint, codePoint: codePoint, } } func newCharPropSymbolToken(propSymbol string) *token { return &token{ kind: tokenKindCharPropSymbol, propSymbol: propSymbol, } } func newFragmentSymbolToken(fragmentSymbol string) *token { return &token{ kind: tokenKindFragmentSymbol, fragmentSymbol: fragmentSymbol, } } type lexerMode string const ( lexerModeDefault lexerMode = "default" lexerModeBExp lexerMode = "bracket expression" lexerModeCPExp lexerMode = "code point expression" lexerModeCharPropExp lexerMode = "character property expression" lexerModeFragmentExp lexerMode = "fragment expression" ) type lexerModeStack struct { stack []lexerMode } func newLexerModeStack() *lexerModeStack { return &lexerModeStack{ stack: []lexerMode{ lexerModeDefault, }, } } func (s *lexerModeStack) top() lexerMode { return s.stack[len(s.stack)-1] } func (s *lexerModeStack) push(m lexerMode) { s.stack = append(s.stack, m) } func (s *lexerModeStack) pop() { s.stack = s.stack[:len(s.stack)-1] } 
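// The function below is a small usage sketch, not part of the original code;
// its name is illustrative only. It shows how the lexer drives lexerModeStack
// while scanning a pattern like `[\u{3042}]`: each opener of a sub-expression
// pushes a mode and its terminator pops it, so top() always reflects the
// innermost construct currently being scanned.
//nolint:unused
func exampleLexerModeStack() {
	s := newLexerModeStack()
	s.push(lexerModeBExp)  // '[' opens a bracket expression
	s.push(lexerModeCPExp) // '\u' opens a code point expression
	_ = s.top()            // lexerModeCPExp
	s.pop()                // '}' closes the code point expression
	_ = s.top()            // lexerModeBExp
	s.pop()                // ']' closes the bracket expression
	_ = s.top()            // back to lexerModeDefault
}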
type rangeState string // [a-z] // ^^^^ // |||`-- ready // ||`-- expect range terminator // |`-- read range initiator // `-- ready const ( rangeStateReady rangeState = "ready" rangeStateReadRangeInitiator rangeState = "read range initiator" rangeStateExpectRangeTerminator rangeState = "expect range terminator" ) type lexer struct { src *bufio.Reader peekChar2 rune peekEOF2 bool peekChar1 rune peekEOF1 bool lastChar rune reachedEOF bool prevChar1 rune prevEOF1 bool prevChar2 rune pervEOF2 bool modeStack *lexerModeStack rangeState rangeState errCause error errDetail string } func newLexer(src io.Reader) *lexer { return &lexer{ src: bufio.NewReader(src), peekChar2: nullChar, peekEOF2: false, peekChar1: nullChar, peekEOF1: false, lastChar: nullChar, reachedEOF: false, prevChar1: nullChar, prevEOF1: false, prevChar2: nullChar, pervEOF2: false, modeStack: newLexerModeStack(), rangeState: rangeStateReady, } } func (l *lexer) error() (string, error) { return l.errDetail, l.errCause } func (l *lexer) next() (*token, error) { c, eof, err := l.read() if err != nil { return nil, err } if eof { return newToken(tokenKindEOF, nullChar), nil } switch l.modeStack.top() { case lexerModeBExp: tok, err := l.nextInBExp(c) if err != nil { return nil, err } if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { switch l.rangeState { case rangeStateReady: l.rangeState = rangeStateReadRangeInitiator case rangeStateExpectRangeTerminator: l.rangeState = rangeStateReady } } switch tok.kind { case tokenKindBExpClose: l.modeStack.pop() case tokenKindCharRange: l.rangeState = rangeStateExpectRangeTerminator case tokenKindCodePointLeader: l.modeStack.push(lexerModeCPExp) case tokenKindCharPropLeader: l.modeStack.push(lexerModeCharPropExp) } return tok, nil case lexerModeCPExp: tok, err := l.nextInCodePoint(c) if err != nil { return nil, err } switch tok.kind { case tokenKindRBrace: l.modeStack.pop() } return tok, nil case lexerModeCharPropExp: tok, err := l.nextInCharProp(c) if err != nil { return nil, err } switch tok.kind { case tokenKindRBrace: l.modeStack.pop() } return tok, nil case lexerModeFragmentExp: tok, err := l.nextInFragment(c) if err != nil { return nil, err } switch tok.kind { case tokenKindRBrace: l.modeStack.pop() } return tok, nil default: tok, err := l.nextInDefault(c) if err != nil { return nil, err } switch tok.kind { case tokenKindBExpOpen: l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady case tokenKindInverseBExpOpen: l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady case tokenKindCodePointLeader: l.modeStack.push(lexerModeCPExp) case tokenKindCharPropLeader: l.modeStack.push(lexerModeCharPropExp) case tokenKindFragmentLeader: l.modeStack.push(lexerModeFragmentExp) } return tok, nil } } func (l *lexer) nextInDefault(c rune) (*token, error) { switch c { case '*': return newToken(tokenKindRepeat, nullChar), nil case '+': return newToken(tokenKindRepeatOneOrMore, nullChar), nil case '?': return newToken(tokenKindOption, nullChar), nil case '.': return newToken(tokenKindAnyChar, nullChar), nil case '|': return newToken(tokenKindAlt, nullChar), nil case '(': return newToken(tokenKindGroupOpen, nullChar), nil case ')': return newToken(tokenKindGroupClose, nullChar), nil case '[': c1, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } return newToken(tokenKindBExpOpen, nullChar), nil } if c1 != '^' { err := l.restore() if err != nil { return nil, err } return 
newToken(tokenKindBExpOpen, nullChar), nil } c2, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } return newToken(tokenKindInverseBExpOpen, nullChar), nil } if c2 != ']' { err := l.restore() if err != nil { return nil, err } return newToken(tokenKindInverseBExpOpen, nullChar), nil } err = l.restore() if err != nil { return nil, err } err = l.restore() if err != nil { return nil, err } return newToken(tokenKindBExpOpen, nullChar), nil case '\\': c, eof, err := l.read() if err != nil { return nil, err } if eof { l.errCause = synErrIncompletedEscSeq return nil, ParseErr } if c == 'u' { return newToken(tokenKindCodePointLeader, nullChar), nil } if c == 'p' { return newToken(tokenKindCharPropLeader, nullChar), nil } if c == 'f' { return newToken(tokenKindFragmentLeader, nullChar), nil } if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil } l.errCause = synErrInvalidEscSeq l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) return nil, ParseErr default: return newToken(tokenKindChar, c), nil } } func (l *lexer) nextInBExp(c rune) (*token, error) { switch c { case '-': if l.rangeState != rangeStateReadRangeInitiator { return newToken(tokenKindChar, c), nil } c1, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } return newToken(tokenKindChar, c), nil } if c1 != ']' { err := l.restore() if err != nil { return nil, err } return newToken(tokenKindCharRange, nullChar), nil } err = l.restore() if err != nil { return nil, err } return newToken(tokenKindChar, c), nil case ']': return newToken(tokenKindBExpClose, nullChar), nil case '\\': c, eof, err := l.read() if err != nil { return nil, err } if eof { l.errCause = synErrIncompletedEscSeq return nil, ParseErr } if c == 'u' { return newToken(tokenKindCodePointLeader, nullChar), nil } if c == 'p' { return newToken(tokenKindCharPropLeader, nullChar), nil } if c == '\\' || c == '^' || c == '-' || c == ']' { return newToken(tokenKindChar, c), nil } l.errCause = synErrInvalidEscSeq l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) return nil, ParseErr default: return newToken(tokenKindChar, c), nil } } func (l *lexer) nextInCodePoint(c rune) (*token, error) { switch c { case '{': return newToken(tokenKindLBrace, nullChar), nil case '}': return newToken(tokenKindRBrace, nullChar), nil default: if !isHexDigit(c) { l.errCause = synErrInvalidCodePoint return nil, ParseErr } var b strings.Builder fmt.Fprint(&b, string(c)) n := 1 for { c, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } break } if c == '}' { err := l.restore() if err != nil { return nil, err } break } if !isHexDigit(c) || n >= 6 { l.errCause = synErrInvalidCodePoint return nil, ParseErr } fmt.Fprint(&b, string(c)) n++ } cp := b.String() cpLen := len(cp) if !(cpLen == 4 || cpLen == 6) { l.errCause = synErrInvalidCodePoint return nil, ParseErr } return newCodePointToken(b.String()), nil } } func isHexDigit(c rune) bool { if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { return true } return false } func (l *lexer) nextInCharProp(c rune) (*token, error) { switch c { case '{': return newToken(tokenKindLBrace, nullChar), nil case '}': return newToken(tokenKindRBrace, nullChar), nil case '=': return newToken(tokenKindEqual, 
nullChar), nil default: var b strings.Builder fmt.Fprint(&b, string(c)) n := 1 for { c, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } break } if c == '}' || c == '=' { err := l.restore() if err != nil { return nil, err } break } fmt.Fprint(&b, string(c)) n++ } sym := strings.TrimSpace(b.String()) if len(sym) == 0 { l.errCause = synErrCharPropInvalidSymbol return nil, ParseErr } return newCharPropSymbolToken(sym), nil } } func (l *lexer) nextInFragment(c rune) (*token, error) { switch c { case '{': return newToken(tokenKindLBrace, nullChar), nil case '}': return newToken(tokenKindRBrace, nullChar), nil default: var b strings.Builder fmt.Fprint(&b, string(c)) n := 1 for { c, eof, err := l.read() if err != nil { return nil, err } if eof { err := l.restore() if err != nil { return nil, err } break } if c == '}' { err := l.restore() if err != nil { return nil, err } break } fmt.Fprint(&b, string(c)) n++ } sym := strings.TrimSpace(b.String()) if len(sym) == 0 { l.errCause = SynErrFragmentInvalidSymbol return nil, ParseErr } return newFragmentSymbolToken(sym), nil } } func (l *lexer) read() (rune, bool, error) { if l.reachedEOF { return l.lastChar, l.reachedEOF, nil } if l.peekChar1 != nullChar || l.peekEOF1 { l.prevChar2 = l.prevChar1 l.pervEOF2 = l.prevEOF1 l.prevChar1 = l.lastChar l.prevEOF1 = l.reachedEOF l.lastChar = l.peekChar1 l.reachedEOF = l.peekEOF1 l.peekChar1 = l.peekChar2 l.peekEOF1 = l.peekEOF2 l.peekChar2 = nullChar l.peekEOF2 = false return l.lastChar, l.reachedEOF, nil } c, _, err := l.src.ReadRune() if err != nil { if err == io.EOF { l.prevChar2 = l.prevChar1 l.pervEOF2 = l.prevEOF1 l.prevChar1 = l.lastChar l.prevEOF1 = l.reachedEOF l.lastChar = nullChar l.reachedEOF = true return l.lastChar, l.reachedEOF, nil } return nullChar, false, err } l.prevChar2 = l.prevChar1 l.pervEOF2 = l.prevEOF1 l.prevChar1 = l.lastChar l.prevEOF1 = l.reachedEOF l.lastChar = c l.reachedEOF = false return l.lastChar, l.reachedEOF, nil } func (l *lexer) restore() error { if l.lastChar == nullChar && !l.reachedEOF { return fmt.Errorf("failed to call restore() because the last character is null") } l.peekChar2 = l.peekChar1 l.peekEOF2 = l.peekEOF1 l.peekChar1 = l.lastChar l.peekEOF1 = l.reachedEOF l.lastChar = l.prevChar1 l.reachedEOF = l.prevEOF1 l.prevChar1 = l.prevChar2 l.prevEOF1 = l.pervEOF2 l.prevChar2 = nullChar l.pervEOF2 = false return nil } type PatternEntry struct { ID LexModeKindID Pattern []byte } type parser struct { kind LexKindName lex *lexer peekedTok *token lastTok *token // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that // appear in property expressions. // // The contributory properties are not exposed, and users cannot use those properties because the parser // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. // // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to // interpret derived properties internally because the derived properties consist of other properties that // may contain the contributory properties. // // [UAX #44 5.13 Property APIs] says: // > The following subtypes of Unicode character properties should generally not be exposed in APIs, // > except in limited circumstances. They may not be useful, particularly in public API collections, // > and may instead prove misleading to the users of such API collections. 
// > * Contributory properties are not recommended for public APIs. // > ... // https://unicode.org/reports/tr44/#Property_APIs isContributoryPropertyExposed bool errCause error errDetail string } func NewParser(kind LexKindName, src io.Reader) *parser { return &parser{ kind: kind, lex: newLexer(src), isContributoryPropertyExposed: false, } } func (p *parser) exposeContributoryProperty() { p.isContributoryPropertyExposed = true } func (p *parser) Error() (string, error) { return p.errDetail, p.errCause } func (p *parser) Parse() (root CPTree, retErr error) { defer func() { err := recover() if err != nil { var ok bool retErr, ok = err.(error) if !ok { panic(err) } return } }() return newRootNode(p.kind, p.parseRegexp()), nil } func (p *parser) parseRegexp() CPTree { alt := p.parseAlt() if alt == nil { if p.consume(tokenKindGroupClose) { p.raiseParseError(synErrGroupNoInitiator, "") } p.raiseParseError(synErrNullPattern, "") } if p.consume(tokenKindGroupClose) { p.raiseParseError(synErrGroupNoInitiator, "") } p.expect(tokenKindEOF) return alt } func (p *parser) parseAlt() CPTree { left := p.parseConcat() if left == nil { if p.consume(tokenKindAlt) { p.raiseParseError(synErrAltLackOfOperand, "") } return nil } for { if !p.consume(tokenKindAlt) { break } right := p.parseConcat() if right == nil { p.raiseParseError(synErrAltLackOfOperand, "") } left = newAltNode(left, right) } return left } func (p *parser) parseConcat() CPTree { left := p.parseRepeat() for { right := p.parseRepeat() if right == nil { break } left = newConcatNode(left, right) } return left } func (p *parser) parseRepeat() CPTree { group := p.parseGroup() if group == nil { if p.consume(tokenKindRepeat) { p.raiseParseError(synErrRepNoTarget, "* needs an operand") } if p.consume(tokenKindRepeatOneOrMore) { p.raiseParseError(synErrRepNoTarget, "+ needs an operand") } if p.consume(tokenKindOption) { p.raiseParseError(synErrRepNoTarget, "? 
needs an operand") } return nil } if p.consume(tokenKindRepeat) { return newRepeatNode(group) } if p.consume(tokenKindRepeatOneOrMore) { return newRepeatOneOrMoreNode(group) } if p.consume(tokenKindOption) { return newOptionNode(group) } return group } func (p *parser) parseGroup() CPTree { if p.consume(tokenKindGroupOpen) { alt := p.parseAlt() if alt == nil { if p.consume(tokenKindEOF) { p.raiseParseError(synErrGroupUnclosed, "") } p.raiseParseError(synErrGroupNoElem, "") } if p.consume(tokenKindEOF) { p.raiseParseError(synErrGroupUnclosed, "") } if !p.consume(tokenKindGroupClose) { p.raiseParseError(synErrGroupInvalidForm, "") } return alt } return p.parseSingleChar() } func (p *parser) parseSingleChar() CPTree { if p.consume(tokenKindAnyChar) { return genAnyCharAST() } if p.consume(tokenKindBExpOpen) { left := p.parseBExpElem() if left == nil { if p.consume(tokenKindEOF) { p.raiseParseError(synErrBExpUnclosed, "") } p.raiseParseError(synErrBExpNoElem, "") } for { right := p.parseBExpElem() if right == nil { break } left = newAltNode(left, right) } if p.consume(tokenKindEOF) { p.raiseParseError(synErrBExpUnclosed, "") } p.expect(tokenKindBExpClose) return left } if p.consume(tokenKindInverseBExpOpen) { elem := p.parseBExpElem() if elem == nil { if p.consume(tokenKindEOF) { p.raiseParseError(synErrBExpUnclosed, "") } p.raiseParseError(synErrBExpNoElem, "") } inverse := exclude(elem, genAnyCharAST()) if inverse == nil { p.raiseParseError(synErrUnmatchablePattern, "") } for { elem := p.parseBExpElem() if elem == nil { break } inverse = exclude(elem, inverse) if inverse == nil { p.raiseParseError(synErrUnmatchablePattern, "") } } if p.consume(tokenKindEOF) { p.raiseParseError(synErrBExpUnclosed, "") } p.expect(tokenKindBExpClose) return inverse } if p.consume(tokenKindCodePointLeader) { return p.parseCodePoint() } if p.consume(tokenKindCharPropLeader) { return p.parseCharProp() } if p.consume(tokenKindFragmentLeader) { return p.parseFragment() } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { p.raiseParseError(synErrBExpInvalidForm, "") } return nil } return c } func (p *parser) parseBExpElem() CPTree { var left CPTree switch { case p.consume(tokenKindCodePointLeader): left = p.parseCodePoint() case p.consume(tokenKindCharPropLeader): left = p.parseCharProp() if p.consume(tokenKindCharRange) { p.raiseParseError(synErrRangePropIsUnavailable, "") } default: left = p.parseNormalChar() } if left == nil { return nil } if !p.consume(tokenKindCharRange) { return left } var right CPTree switch { case p.consume(tokenKindCodePointLeader): right = p.parseCodePoint() case p.consume(tokenKindCharPropLeader): p.raiseParseError(synErrRangePropIsUnavailable, "") default: right = p.parseNormalChar() } if right == nil { p.raiseParseError(synErrRangeInvalidForm, "") } from, _, _ := left.Range() _, to, _ := right.Range() if !isValidOrder(from, to) { p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) } return newRangeSymbolNode(from, to) } func (p *parser) parseCodePoint() CPTree { if !p.consume(tokenKindLBrace) { p.raiseParseError(synErrCPExpInvalidForm, "") } if !p.consume(tokenKindCodePoint) { p.raiseParseError(synErrCPExpInvalidForm, "") } n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) if err != nil { panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) } if n < 0x0000 || n > 0x10FFFF { p.raiseParseError(synErrCPExpOutOfRange, "") } sym := newSymbolNode(rune(n)) if !p.consume(tokenKindRBrace) { 
p.raiseParseError(synErrCPExpInvalidForm, "") } return sym } func (p *parser) parseCharProp() CPTree { if !p.consume(tokenKindLBrace) { p.raiseParseError(synErrCharPropExpInvalidForm, "") } var sym1, sym2 string if !p.consume(tokenKindCharPropSymbol) { p.raiseParseError(synErrCharPropExpInvalidForm, "") } sym1 = p.lastTok.propSymbol if p.consume(tokenKindEqual) { if !p.consume(tokenKindCharPropSymbol) { p.raiseParseError(synErrCharPropExpInvalidForm, "") } sym2 = p.lastTok.propSymbol } var alt CPTree var propName, propVal string if sym2 != "" { propName = sym1 propVal = sym2 } else { propName = "" propVal = sym1 } if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { p.raiseParseError(synErrCharPropUnsupported, propName) } pat, err := ucd.NormalizeCharacterProperty(propName, propVal) if err != nil { p.raiseParseError(synErrCharPropUnsupported, err.Error()) } if pat != "" { p := NewParser(p.kind, bytes.NewReader([]byte(pat))) p.exposeContributoryProperty() ast, err := p.Parse() if err != nil { panic(err) } alt = ast } else { cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) if err != nil { p.raiseParseError(synErrCharPropUnsupported, err.Error()) } if inverse { r := cpRanges[0] alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) if alt == nil { p.raiseParseError(synErrUnmatchablePattern, "") } for _, r := range cpRanges[1:] { alt = exclude(newRangeSymbolNode(r.From, r.To), alt) if alt == nil { p.raiseParseError(synErrUnmatchablePattern, "") } } } else { for _, r := range cpRanges { alt = genAltNode( alt, newRangeSymbolNode(r.From, r.To), ) } } } if !p.consume(tokenKindRBrace) { p.raiseParseError(synErrCharPropExpInvalidForm, "") } return alt } func (p *parser) parseFragment() CPTree { if !p.consume(tokenKindLBrace) { p.raiseParseError(synErrFragmentExpInvalidForm, "") } if !p.consume(tokenKindFragmentSymbol) { p.raiseParseError(synErrFragmentExpInvalidForm, "") } sym := p.lastTok.fragmentSymbol if !p.consume(tokenKindRBrace) { p.raiseParseError(synErrFragmentExpInvalidForm, "") } return newFragmentNode(LexKindName(sym), nil) } func (p *parser) parseNormalChar() CPTree { if !p.consume(tokenKindChar) { return nil } return newSymbolNode(p.lastTok.char) } func exclude(symbol, base CPTree) CPTree { if left, right, ok := symbol.Alternatives(); ok { return exclude(right, exclude(left, base)) } if left, right, ok := base.Alternatives(); ok { return genAltNode( exclude(symbol, left), exclude(symbol, right), ) } if bFrom, bTo, ok := base.Range(); ok { sFrom, sTo, ok := symbol.Range() if !ok { panic(fmt.Errorf("invalid symbol tree: %T", symbol)) } switch { case sFrom > bFrom && sTo < bTo: return genAltNode( newRangeSymbolNode(bFrom, sFrom-1), newRangeSymbolNode(sTo+1, bTo), ) case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: return newRangeSymbolNode(sTo+1, bTo) case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: return newRangeSymbolNode(bFrom, sFrom-1) case sFrom <= bFrom && sTo >= bTo: return nil default: return base } } panic(fmt.Errorf("invalid base tree: %T", base)) } func genAnyCharAST() CPTree { return newRangeSymbolNode(0x0, 0x10FFFF) } func isValidOrder(from, to rune) bool { return from <= to } func genConcatNode(cs ...CPTree) CPTree { nonNilNodes := []CPTree{} for _, c := range cs { if c == nil { continue } nonNilNodes = append(nonNilNodes, c) } if len(nonNilNodes) <= 0 { return nil } if len(nonNilNodes) == 1 { return nonNilNodes[0] } concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) for _, c := range nonNilNodes[2:] { concat = 
newConcatNode(concat, c) } return concat } func genAltNode(cs ...CPTree) CPTree { nonNilNodes := []CPTree{} for _, c := range cs { if c == nil { continue } nonNilNodes = append(nonNilNodes, c) } if len(nonNilNodes) <= 0 { return nil } if len(nonNilNodes) == 1 { return nonNilNodes[0] } alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) for _, c := range nonNilNodes[2:] { alt = newAltNode(alt, c) } return alt } func (p *parser) expect(expected tokenKind) { if !p.consume(expected) { tok := p.peekedTok p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) } } func (p *parser) consume(expected tokenKind) bool { var tok *token var err error if p.peekedTok != nil { tok = p.peekedTok p.peekedTok = nil } else { tok, err = p.lex.next() if err != nil { if err == ParseErr { detail, cause := p.lex.error() p.raiseParseError(cause, detail) } panic(err) } } p.lastTok = tok if tok.kind == expected { return true } p.peekedTok = tok p.lastTok = nil return false } func (p *parser) raiseParseError(err error, detail string) { p.errCause = err p.errDetail = detail panic(ParseErr) } type CPRange struct { From rune To rune } type CPTree interface { fmt.Stringer Range() (rune, rune, bool) Optional() (CPTree, bool) Repeatable() (CPTree, bool) Concatenation() (CPTree, CPTree, bool) Alternatives() (CPTree, CPTree, bool) Describe() (LexKindName, []LexKindName, error) children() (CPTree, CPTree) clone() CPTree } var ( _ CPTree = &rootNode{} _ CPTree = &symbolNode{} _ CPTree = &concatNode{} _ CPTree = &altNode{} _ CPTree = &quantifierNode{} _ CPTree = &fragmentNode{} ) type rootNode struct { kind LexKindName tree CPTree fragments map[LexKindName][]*fragmentNode } func newRootNode(kind LexKindName, t CPTree) *rootNode { fragments := map[LexKindName][]*fragmentNode{} collectFragments(t, fragments) return &rootNode{ kind: kind, tree: t, fragments: fragments, } } func collectFragments(n CPTree, fragments map[LexKindName][]*fragmentNode) { if n == nil { return } if f, ok := n.(*fragmentNode); ok { fragments[f.kind] = append(fragments[f.kind], f) return } l, r := n.children() collectFragments(l, fragments) collectFragments(r, fragments) } func (n *rootNode) String() string { return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) } func (n *rootNode) Range() (rune, rune, bool) { return n.tree.Range() } func (n *rootNode) Optional() (CPTree, bool) { return n.tree.Optional() } func (n *rootNode) Repeatable() (CPTree, bool) { return n.tree.Repeatable() } func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { return n.tree.Concatenation() } func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { return n.tree.Alternatives() } func (n *rootNode) Describe() (LexKindName, []LexKindName, error) { var frags []LexKindName for f := range n.fragments { frags = append(frags, LexKindName(f)) } sort.Slice(frags, func(i, j int) bool { return frags[i] < frags[j] }) return n.kind, frags, nil } func (n *rootNode) children() (CPTree, CPTree) { return n.tree.children() } func (n *rootNode) clone() CPTree { return n.tree.clone() } func (n *rootNode) incomplete() bool { return len(n.fragments) > 0 } func (n *rootNode) applyFragment(kind LexKindName, fragment CPTree) error { root, ok := fragment.(*rootNode) if !ok { return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) } if root.incomplete() { return fmt.Errorf("fragment is incomplete") } fs, ok := n.fragments[kind] if !ok { return nil } for _, f := range fs { f.tree = root.clone() } delete(n.fragments, kind) 
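	// At this point every reference to the fragment kind has been expanded and the kind removed from the map, so incomplete() reports only the fragments that are still unresolved.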
return nil } type symbolNode struct { CPRange } func newSymbolNode(cp rune) *symbolNode { return &symbolNode{ CPRange: CPRange{ From: cp, To: cp, }, } } func newRangeSymbolNode(from, to rune) *symbolNode { return &symbolNode{ CPRange: CPRange{ From: from, To: to, }, } } func (n *symbolNode) String() string { return fmt.Sprintf("symbol: %X..%X", n.From, n.To) } func (n *symbolNode) Range() (rune, rune, bool) { return n.From, n.To, true } func (n *symbolNode) Optional() (CPTree, bool) { return nil, false } func (n *symbolNode) Repeatable() (CPTree, bool) { return nil, false } func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { return nil, nil, false } func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { return nil, nil, false } func (n *symbolNode) Describe() (LexKindName, []LexKindName, error) { return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) } func (n *symbolNode) children() (CPTree, CPTree) { return nil, nil } func (n *symbolNode) clone() CPTree { return newRangeSymbolNode(n.From, n.To) } type concatNode struct { left CPTree right CPTree } func newConcatNode(left, right CPTree) *concatNode { return &concatNode{ left: left, right: right, } } func (n *concatNode) String() string { return "concat" } func (n *concatNode) Range() (rune, rune, bool) { return 0, 0, false } func (n *concatNode) Optional() (CPTree, bool) { return nil, false } func (n *concatNode) Repeatable() (CPTree, bool) { return nil, false } func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { return n.left, n.right, true } func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { return nil, nil, false } func (n *concatNode) Describe() (LexKindName, []LexKindName, error) { return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) } func (n *concatNode) children() (CPTree, CPTree) { return n.left, n.right } func (n *concatNode) clone() CPTree { if n == nil { return nil } return newConcatNode(n.left.clone(), n.right.clone()) } type altNode struct { left CPTree right CPTree } func newAltNode(left, right CPTree) *altNode { return &altNode{ left: left, right: right, } } func (n *altNode) String() string { return "alt" } func (n *altNode) Range() (rune, rune, bool) { return 0, 0, false } func (n *altNode) Optional() (CPTree, bool) { return nil, false } func (n *altNode) Repeatable() (CPTree, bool) { return nil, false } func (n *altNode) Concatenation() (CPTree, CPTree, bool) { return nil, nil, false } func (n *altNode) Alternatives() (CPTree, CPTree, bool) { return n.left, n.right, true } func (n *altNode) Describe() (LexKindName, []LexKindName, error) { return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) } func (n *altNode) children() (CPTree, CPTree) { return n.left, n.right } func (n *altNode) clone() CPTree { return newAltNode(n.left.clone(), n.right.clone()) } type quantifierNode struct { optional bool repeatable bool tree CPTree } func (n *quantifierNode) String() string { switch { case n.repeatable: return "repeatable (>= 0 times)" case n.optional: return "optional (0 or 1 times)" default: return "invalid quantifier" } } func newRepeatNode(t CPTree) *quantifierNode { return &quantifierNode{ repeatable: true, tree: t, } } func newRepeatOneOrMoreNode(t CPTree) *concatNode { return newConcatNode( t, &quantifierNode{ repeatable: true, tree: t.clone(), }) } func newOptionNode(t CPTree) *quantifierNode { return &quantifierNode{ optional: true, tree: t, } } func (n *quantifierNode) Range() (rune, rune, bool) { return 0, 0, false } func (n *quantifierNode) Optional() 
(CPTree, bool) { return n.tree, n.optional } func (n *quantifierNode) Repeatable() (CPTree, bool) { return n.tree, n.repeatable } func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { return nil, nil, false } func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { return nil, nil, false } func (n *quantifierNode) Describe() (LexKindName, []LexKindName, error) { return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) } func (n *quantifierNode) children() (CPTree, CPTree) { return n.tree, nil } func (n *quantifierNode) clone() CPTree { if n.repeatable { return newRepeatNode(n.tree.clone()) } return newOptionNode(n.tree.clone()) } type fragmentNode struct { kind LexKindName tree CPTree } func newFragmentNode(kind LexKindName, t CPTree) *fragmentNode { return &fragmentNode{ kind: kind, tree: t, } } func (n *fragmentNode) String() string { return fmt.Sprintf("fragment: %v", n.kind) } func (n *fragmentNode) Range() (rune, rune, bool) { return n.tree.Range() } func (n *fragmentNode) Optional() (CPTree, bool) { return n.tree.Optional() } func (n *fragmentNode) Repeatable() (CPTree, bool) { return n.tree.Repeatable() } func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { return n.tree.Concatenation() } func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { return n.tree.Alternatives() } func (n *fragmentNode) Describe() (LexKindName, []LexKindName, error) { return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) } func (n *fragmentNode) children() (CPTree, CPTree) { return n.tree.children() } func (n *fragmentNode) clone() CPTree { if n.tree == nil { return newFragmentNode(n.kind, nil) } return newFragmentNode(n.kind, n.tree.clone()) } //nolint:unused func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { if t == nil { return } fmt.Fprintf(w, "%v%v\n", ruledLine, t) children := []CPTree{} switch n := t.(type) { case *rootNode: children = append(children, n.tree) case *fragmentNode: children = append(children, n.tree) default: left, right := t.children() if left != nil { children = append(children, left) } if right != nil { children = append(children, right) } } num := len(children) for i, child := range children { line := "└─ " if num > 1 { if i == 0 { line = "├─ " } else if i < num-1 { line = "│ " } } prefix := "│ " if i >= num-1 { prefix = " " } printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) } }
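// The two functions below are usage sketches, not part of the original API;
// their names are illustrative only. The first shows GenCharBlocks producing
// the UTF-8 byte-sequence blocks described in the splitCodePoint comment; the
// second shows how a parsed fragment is substituted into a pattern that
// references it via \f{...}.

//nolint:unused
func exampleGenCharBlocks() ([]string, error) {
	// <U+0000..U+07FF> is split into two blocks: <0..7F> and <C2 80..DF BF>.
	blks, err := GenCharBlocks(0x0000, 0x07ff)
	if err != nil {
		return nil, err
	}
	var reprs []string
	for _, b := range blks {
		reprs = append(reprs, b.String())
	}
	return reprs, nil
}

//nolint:unused
func exampleApplyFragment(w io.Writer) error {
	// Parse the fragment `digit` and a pattern `number` that references it.
	fragTree, err := NewParser(LexKindName("digit"), strings.NewReader(`[0-9]`)).Parse()
	if err != nil {
		return err
	}
	numTree, err := NewParser(LexKindName("number"), strings.NewReader(`\f{digit}+`)).Parse()
	if err != nil {
		return err
	}
	// Substitute the fragment's tree into every \f{digit} occurrence.
	complete, err := ApplyFragments(numTree, map[LexKindName]CPTree{
		LexKindName("digit"): fragTree,
	})
	if err != nil {
		return err
	}
	if !complete {
		return fmt.Errorf("pattern still contains unresolved fragments")
	}
	printCPTree(w, numTree, "", "")
	return nil
}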