diff options
Diffstat (limited to 'src/urubu/spec/grammar')
-rw-r--r-- | src/urubu/spec/grammar/clexspec.json (renamed from src/urubu/spec/grammar/parser/clexspec.json) | 0 | ||||
-rw-r--r-- | src/urubu/spec/grammar/description.go | 71 | ||||
-rw-r--r-- | src/urubu/spec/grammar/grammar.go | 160 | ||||
-rw-r--r-- | src/urubu/spec/grammar/lexspec.json (renamed from src/urubu/spec/grammar/parser/lexspec.json) | 0 | ||||
-rw-r--r-- | src/urubu/spec/grammar/parser.go (renamed from src/urubu/spec/grammar/parser/vartan_lexer.go) | 911 | ||||
-rw-r--r-- | src/urubu/spec/grammar/parser/lexer.go | 297 | ||||
-rw-r--r-- | src/urubu/spec/grammar/parser/parser.go | 582 | ||||
-rw-r--r-- | src/urubu/spec/grammar/parser/syntax_error.go | 45 | ||||
-rw-r--r-- | src/urubu/spec/grammar/util.go | 21 |
9 files changed, 910 insertions, 1177 deletions
diff --git a/src/urubu/spec/grammar/parser/clexspec.json b/src/urubu/spec/grammar/clexspec.json index d0ed3d3..d0ed3d3 100644 --- a/src/urubu/spec/grammar/parser/clexspec.json +++ b/src/urubu/spec/grammar/clexspec.json diff --git a/src/urubu/spec/grammar/description.go b/src/urubu/spec/grammar/description.go deleted file mode 100644 index 0d2a0b7..0000000 --- a/src/urubu/spec/grammar/description.go +++ /dev/null @@ -1,71 +0,0 @@ -package grammar - -type Terminal struct { - Number int `json:"number"` - Name string `json:"name"` - Pattern string `json:"pattern"` - Precedence int `json:"prec"` - Associativity string `json:"assoc"` -} - -type NonTerminal struct { - Number int `json:"number"` - Name string `json:"name"` -} - -type Production struct { - Number int `json:"number"` - LHS int `json:"lhs"` - RHS []int `json:"rhs"` - Precedence int `json:"prec"` - Associativity string `json:"assoc"` -} - -type Item struct { - Production int `json:"production"` - Dot int `json:"dot"` -} - -type Transition struct { - Symbol int `json:"symbol"` - State int `json:"state"` -} - -type Reduce struct { - LookAhead []int `json:"look_ahead"` - Production int `json:"production"` -} - -type SRConflict struct { - Symbol int `json:"symbol"` - State int `json:"state"` - Production int `json:"production"` - AdoptedState *int `json:"adopted_state"` - AdoptedProduction *int `json:"adopted_production"` - ResolvedBy int `json:"resolved_by"` -} - -type RRConflict struct { - Symbol int `json:"symbol"` - Production1 int `json:"production_1"` - Production2 int `json:"production_2"` - AdoptedProduction int `json:"adopted_production"` - ResolvedBy int `json:"resolved_by"` -} - -type State struct { - Number int `json:"number"` - Kernel []*Item `json:"kernel"` - Shift []*Transition `json:"shift"` - Reduce []*Reduce `json:"reduce"` - GoTo []*Transition `json:"goto"` - SRConflict []*SRConflict `json:"sr_conflict"` - RRConflict []*RRConflict `json:"rr_conflict"` -} - -type Report struct { - Terminals []*Terminal `json:"terminals"` - NonTerminals []*NonTerminal `json:"non_terminals"` - Productions []*Production `json:"productions"` - States []*State `json:"states"` -} diff --git a/src/urubu/spec/grammar/grammar.go b/src/urubu/spec/grammar/grammar.go deleted file mode 100644 index bf1ea89..0000000 --- a/src/urubu/spec/grammar/grammar.go +++ /dev/null @@ -1,160 +0,0 @@ -package grammar - -import "strconv" - -type CompiledGrammar struct { - Name string `json:"name"` - Lexical *LexicalSpec `json:"lexical"` - Syntactic *SyntacticSpec `json:"syntactic"` - ASTAction *ASTAction `json:"ast_action"` -} - -// StateID represents an ID of a state of a transition table. -type StateID int - -const ( - // StateIDNil represents an empty entry of a transition table. - // When the driver reads this value, it raises an error meaning lexical analysis failed. - StateIDNil = StateID(0) - - // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as - // sequential numbers starting from this value. - StateIDMin = StateID(1) -) - -func (id StateID) Int() int { - return int(id) -} - -// LexModeID represents an ID of a lex mode. -type LexModeID int - -const ( - LexModeIDNil = LexModeID(0) - LexModeIDDefault = LexModeID(1) -) - -func (n LexModeID) String() string { - return strconv.Itoa(int(n)) -} - -func (n LexModeID) Int() int { - return int(n) -} - -func (n LexModeID) IsNil() bool { - return n == LexModeIDNil -} - -// LexModeName represents a name of a lex mode. -type LexModeName string - -const ( - LexModeNameNil = LexModeName("") - LexModeNameDefault = LexModeName("default") -) - -func (m LexModeName) String() string { - return string(m) -} - -// LexKindID represents an ID of a lexical kind and is unique across all modes. -type LexKindID int - -const ( - LexKindIDNil = LexKindID(0) - LexKindIDMin = LexKindID(1) -) - -func (id LexKindID) Int() int { - return int(id) -} - -// LexModeKindID represents an ID of a lexical kind and is unique within a mode. -// Use LexKindID to identify a kind across all modes uniquely. -type LexModeKindID int - -const ( - LexModeKindIDNil = LexModeKindID(0) - LexModeKindIDMin = LexModeKindID(1) -) - -func (id LexModeKindID) Int() int { - return int(id) -} - -// LexKindName represents a name of a lexical kind. -type LexKindName string - -const LexKindNameNil = LexKindName("") - -func (k LexKindName) String() string { - return string(k) -} - -type RowDisplacementTable struct { - OriginalRowCount int `json:"original_row_count"` - OriginalColCount int `json:"original_col_count"` - EmptyValue StateID `json:"empty_value"` - Entries []StateID `json:"entries"` - Bounds []int `json:"bounds"` - RowDisplacement []int `json:"row_displacement"` -} - -type UniqueEntriesTable struct { - UniqueEntries *RowDisplacementTable `json:"unique_entries,omitempty"` - UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"` - RowNums []int `json:"row_nums"` - OriginalRowCount int `json:"original_row_count"` - OriginalColCount int `json:"original_col_count"` - EmptyValue int `json:"empty_value"` -} - -type TransitionTable struct { - InitialStateID StateID `json:"initial_state_id"` - AcceptingStates []LexModeKindID `json:"accepting_states"` - RowCount int `json:"row_count"` - ColCount int `json:"col_count"` - Transition *UniqueEntriesTable `json:"transition,omitempty"` - UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"` -} - -type CompiledLexModeSpec struct { - KindNames []LexKindName `json:"kind_names"` - Push []LexModeID `json:"push"` - Pop []int `json:"pop"` - DFA *TransitionTable `json:"dfa"` -} - -type LexicalSpec struct { - InitialModeID LexModeID `json:"initial_mode_id"` - ModeNames []LexModeName `json:"mode_names"` - KindNames []LexKindName `json:"kind_names"` - KindIDs [][]LexKindID `json:"kind_ids"` - CompressionLevel int `json:"compression_level"` - Specs []*CompiledLexModeSpec `json:"specs"` -} - -type SyntacticSpec struct { - Action []int `json:"action"` - GoTo []int `json:"goto"` - StateCount int `json:"state_count"` - InitialState int `json:"initial_state"` - StartProduction int `json:"start_production"` - LHSSymbols []int `json:"lhs_symbols"` - AlternativeSymbolCounts []int `json:"alternative_symbol_counts"` - Terminals []string `json:"terminals"` - TerminalCount int `json:"terminal_count"` - TerminalSkip []int `json:"terminal_skip"` - KindToTerminal []int `json:"kind_to_terminal"` - NonTerminals []string `json:"non_terminals"` - NonTerminalCount int `json:"non_terminal_count"` - EOFSymbol int `json:"eof_symbol"` - ErrorSymbol int `json:"error_symbol"` - ErrorTrapperStates []int `json:"error_trapper_states"` - RecoverProductions []int `json:"recover_productions"` -} - -type ASTAction struct { - Entries [][]int `json:"entries"` -} diff --git a/src/urubu/spec/grammar/parser/lexspec.json b/src/urubu/spec/grammar/lexspec.json index caf1f0e..caf1f0e 100644 --- a/src/urubu/spec/grammar/parser/lexspec.json +++ b/src/urubu/spec/grammar/lexspec.json diff --git a/src/urubu/spec/grammar/parser/vartan_lexer.go b/src/urubu/spec/grammar/parser.go index 76ddfde..0e5a16b 100644 --- a/src/urubu/spec/grammar/parser/vartan_lexer.go +++ b/src/urubu/spec/grammar/parser.go @@ -1,11 +1,920 @@ -// Code generated by maleeni-go. DO NOT EDIT. +//go:generate maleeni compile lexspec.json -o clexspec.json +//go:generate maleeni-go clexspec.json --package parser + package parser import ( + _ "embed" "fmt" "io" "io/ioutil" + "regexp" + "strings" + + verr "urubu/error" + spec "urubu/spec/grammar" +) + +type tokenKind string + +const ( + tokenKindKWFragment = tokenKind("fragment") + tokenKindID = tokenKind("id") + tokenKindTerminalPattern = tokenKind("terminal pattern") + tokenKindStringLiteral = tokenKind("string") + tokenKindColon = tokenKind(":") + tokenKindOr = tokenKind("|") + tokenKindSemicolon = tokenKind(";") + tokenKindLabelMarker = tokenKind("@") + tokenKindDirectiveMarker = tokenKind("#") + tokenKindExpantion = tokenKind("...") + tokenKindOrderedSymbolMarker = tokenKind("$") + tokenKindLParen = tokenKind("(") + tokenKindRParen = tokenKind(")") + tokenKindNewline = tokenKind("newline") + tokenKindEOF = tokenKind("eof") + tokenKindInvalid = tokenKind("invalid") +) + +var ( + reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`) + reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`) +) + +type Position struct { + Row int + Col int +} + +func newPosition(row, col int) Position { + return Position{ + Row: row, + Col: col, + } +} + +type token struct { + kind tokenKind + text string + pos Position +} + +func newSymbolToken(kind tokenKind, pos Position) *token { + return &token{ + kind: kind, + pos: pos, + } +} + +func newIDToken(text string, pos Position) *token { + return &token{ + kind: tokenKindID, + text: text, + pos: pos, + } +} + +func newTerminalPatternToken(text string, pos Position) *token { + return &token{ + kind: tokenKindTerminalPattern, + text: text, + pos: pos, + } +} + +func newStringLiteralToken(text string, pos Position) *token { + return &token{ + kind: tokenKindStringLiteral, + text: text, + pos: pos, + } +} + +func newEOFToken() *token { + return &token{ + kind: tokenKindEOF, + } +} + +func newInvalidToken(text string, pos Position) *token { + return &token{ + kind: tokenKindInvalid, + text: text, + pos: pos, + } +} + +type lexer struct { + d *Lexer + buf *token +} + +func newLexer(src io.Reader) (*lexer, error) { + d, err := NewLexer(NewLexSpec(), src) + if err != nil { + return nil, err + } + return &lexer{ + d: d, + }, nil +} + +func (l *lexer) next() (*token, error) { + if l.buf != nil { + tok := l.buf + l.buf = nil + return tok, nil + } + + var newline *token + for { + tok, err := l.lexAndSkipWSs() + if err != nil { + return nil, err + } + if tok.kind == tokenKindNewline { + newline = tok + continue + } + + if newline != nil { + l.buf = tok + return newline, nil + } + return tok, nil + } +} + +func (l *lexer) lexAndSkipWSs() (*token, error) { + var tok *Token + for { + var err error + tok, err = l.d.Next() + if err != nil { + return nil, err + } + if tok.Invalid { + return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + } + if tok.EOF { + return newEOFToken(), nil + } + switch tok.KindID { + case KindIDWhiteSpace: + continue + case KindIDLineComment: + continue + } + + break + } + + switch tok.KindID { + case KindIDNewline: + return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDKwFragment: + return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDIdentifier: + if !reIDChar.Match(tok.Lexeme) { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidChar, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidUnderscorePos, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if strings.Contains(string(tok.Lexeme), "__") { + return nil, &verr.SpecError{ + Cause: synErrIDConsecutiveUnderscores, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + if reIDInvalidDigitsPos.Match(tok.Lexeme) { + return nil, &verr.SpecError{ + Cause: synErrIDInvalidDigitsPos, + Detail: string(tok.Lexeme), + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDTerminalOpen: + var b strings.Builder + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.EOF { + return nil, &verr.SpecError{ + Cause: synErrUnclosedTerminal, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + switch tok.KindID { + case KindIDPattern: + // The escape sequences in a pattern string are interpreted by the lexer, except for the \". + // We must interpret the \" before passing them to the lexer because they are delimiters for + // the pattern strings. + fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`)) + case KindIDEscapeSymbol: + return nil, &verr.SpecError{ + Cause: synErrIncompletedEscSeq, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + case KindIDTerminalClose: + pat := b.String() + if pat == "" { + return nil, &verr.SpecError{ + Cause: synErrEmptyPattern, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil + } + } + case KindIDStringLiteralOpen: + var b strings.Builder + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.EOF { + return nil, &verr.SpecError{ + Cause: synErrUnclosedString, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + switch tok.KindID { + case KindIDCharSeq: + fmt.Fprint(&b, string(tok.Lexeme)) + case KindIDStringLiteralClose: + str := b.String() + if str == "" { + return nil, &verr.SpecError{ + Cause: synErrEmptyString, + Row: tok.Row + 1, + Col: tok.Col + 1, + } + } + return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil + } + } + case KindIDColon: + return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDOr: + return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDSemicolon: + return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDLabelMarker: + return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDDirectiveMarker: + return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDExpansion: + return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDOrderedSymbolMarker: + return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDLParen: + return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil + case KindIDRParen: + return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil + default: + return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil + } +} + +type RootNode struct { + Directives []*DirectiveNode + Productions []*ProductionNode + LexProductions []*ProductionNode + Fragments []*FragmentNode +} + +type ProductionNode struct { + Directives []*DirectiveNode + LHS string + RHS []*AlternativeNode + Pos Position +} + +func (n *ProductionNode) isLexical() bool { + if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" { + return true + } + return false +} + +type AlternativeNode struct { + Elements []*ElementNode + Directives []*DirectiveNode + Pos Position +} + +type ElementNode struct { + ID string + Pattern string + Label *LabelNode + Literally bool + Pos Position +} + +type LabelNode struct { + Name string + Pos Position +} + +type DirectiveNode struct { + Name string + Parameters []*ParameterNode + Pos Position +} + +type ParameterNode struct { + ID string + Pattern string + String string + OrderedSymbol string + Group []*DirectiveNode + Expansion bool + Pos Position +} + +type FragmentNode struct { + LHS string + RHS string + Pos Position +} + +func raiseSyntaxError(row int, synErr *SyntaxError) { + panic(&verr.SpecError{ + Cause: synErr, + Row: row, + }) +} + +func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) { + panic(&verr.SpecError{ + Cause: synErr, + Detail: detail, + Row: row, + }) +} + +func Parse(src io.Reader) (*RootNode, error) { + p, err := newParser(src) + if err != nil { + return nil, err + } + + return p.parse() +} + +type parser struct { + lex *lexer + peekedTok *token + lastTok *token + errs verr.SpecErrors + + // A token position that the parser read at last. + // It is used as additional information in error messages. + pos Position +} + +func newParser(src io.Reader) (*parser, error) { + lex, err := newLexer(src) + if err != nil { + return nil, err + } + return &parser{ + lex: lex, + }, nil +} + +func (p *parser) parse() (root *RootNode, retErr error) { + root = p.parseRoot() + if len(p.errs) > 0 { + return nil, p.errs + } + + return root, nil +} + +func (p *parser) parseRoot() *RootNode { + defer func() { + err := recover() + if err != nil { + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(fmt.Errorf("an unexpected error occurred: %v", err)) + } + p.errs = append(p.errs, specErr) + } + }() + + var dirs []*DirectiveNode + var prods []*ProductionNode + var lexProds []*ProductionNode + var fragments []*FragmentNode + for { + dir := p.parseTopLevelDirective() + if dir != nil { + dirs = append(dirs, dir) + continue + } + + fragment := p.parseFragment() + if fragment != nil { + fragments = append(fragments, fragment) + continue + } + + prod := p.parseProduction() + if prod != nil { + if prod.isLexical() { + lexProds = append(lexProds, prod) + } else { + prods = append(prods, prod) + } + continue + } + + if p.consume(tokenKindEOF) { + break + } + } + + return &RootNode{ + Directives: dirs, + Productions: prods, + LexProductions: lexProds, + Fragments: fragments, + } +} + +func (p *parser) parseTopLevelDirective() *DirectiveNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) + p.skipOverTo(tokenKindSemicolon) + }() + + dir := p.parseDirective() + if dir == nil { + return nil + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon) + } + + return dir +} + +func (p *parser) parseFragment() *FragmentNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) + p.skipOverTo(tokenKindSemicolon) + }() + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindKWFragment) { + return nil + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoProductionName) + } + lhs := p.lastTok.text + lhsPos := p.lastTok.pos + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindColon) { + raiseSyntaxError(p.pos.Row, synErrNoColon) + } + + var rhs string + switch { + case p.consume(tokenKindTerminalPattern): + rhs = p.lastTok.text + case p.consume(tokenKindStringLiteral): + rhs = spec.EscapePattern(p.lastTok.text) + default: + raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern) + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrNoSemicolon) + } + + if !p.consume(tokenKindNewline) { + if !p.consume(tokenKindEOF) { + raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) + } + } + + return &FragmentNode{ + LHS: lhs, + RHS: rhs, + Pos: lhsPos, + } +} + +func (p *parser) parseProduction() *ProductionNode { + defer func() { + err := recover() + if err == nil { + return + } + + specErr, ok := err.(*verr.SpecError) + if !ok { + panic(err) + } + + p.errs = append(p.errs, specErr) + p.skipOverTo(tokenKindSemicolon) + }() + + p.consume(tokenKindNewline) + + if p.consume(tokenKindEOF) { + return nil + } + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoProductionName) + } + lhs := p.lastTok.text + lhsPos := p.lastTok.pos + + var dirs []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + dirs = append(dirs, dir) + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindColon) { + raiseSyntaxError(p.pos.Row, synErrNoColon) + } + + alt := p.parseAlternative() + rhs := []*AlternativeNode{alt} + for { + p.consume(tokenKindNewline) + + if !p.consume(tokenKindOr) { + break + } + alt := p.parseAlternative() + rhs = append(rhs, alt) + } + + p.consume(tokenKindNewline) + + if !p.consume(tokenKindSemicolon) { + raiseSyntaxError(p.pos.Row, synErrNoSemicolon) + } + + if !p.consume(tokenKindNewline) { + if !p.consume(tokenKindEOF) { + raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) + } + } + + prod := &ProductionNode{ + Directives: dirs, + LHS: lhs, + RHS: rhs, + Pos: lhsPos, + } + + // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs. + // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate + // name to the pattern. Therefore, this code prohibits alternatives from containing patterns. + if !prod.isLexical() { + for _, alt := range prod.RHS { + for _, elem := range alt.Elements { + if elem.Pattern != "" { + raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt) + } + } + } + } + + return prod +} + +func (p *parser) parseAlternative() *AlternativeNode { + elems := []*ElementNode{} + for { + elem := p.parseElement() + if elem == nil { + break + } + elems = append(elems, elem) + } + + // When a length of an alternative is zero, we cannot set a position. + var firstElemPos Position + if len(elems) > 0 { + firstElemPos = elems[0].Pos + } + + var dirs []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + dirs = append(dirs, dir) + } + + return &AlternativeNode{ + Elements: elems, + Directives: dirs, + Pos: firstElemPos, + } +} + +func (p *parser) parseElement() *ElementNode { + var elem *ElementNode + switch { + case p.consume(tokenKindID): + elem = &ElementNode{ + ID: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindTerminalPattern): + elem = &ElementNode{ + Pattern: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindStringLiteral): + elem = &ElementNode{ + Pattern: p.lastTok.text, + Literally: true, + Pos: p.lastTok.pos, + } + default: + if p.consume(tokenKindLabelMarker) { + raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol) + } + return nil + } + if p.consume(tokenKindLabelMarker) { + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoLabel) + } + elem.Label = &LabelNode{ + Name: p.lastTok.text, + Pos: p.lastTok.pos, + } + } + return elem +} + +func (p *parser) parseDirective() *DirectiveNode { + p.consume(tokenKindNewline) + + if !p.consume(tokenKindDirectiveMarker) { + return nil + } + dirPos := p.lastTok.pos + + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoDirectiveName) + } + name := p.lastTok.text + + var params []*ParameterNode + for { + param := p.parseParameter() + if param == nil { + break + } + params = append(params, param) + } + + return &DirectiveNode{ + Name: name, + Parameters: params, + Pos: dirPos, + } +} + +func (p *parser) parseParameter() *ParameterNode { + var param *ParameterNode + switch { + case p.consume(tokenKindID): + param = &ParameterNode{ + ID: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindTerminalPattern): + param = &ParameterNode{ + Pattern: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindStringLiteral): + param = &ParameterNode{ + String: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindOrderedSymbolMarker): + if !p.consume(tokenKindID) { + raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName) + } + param = &ParameterNode{ + OrderedSymbol: p.lastTok.text, + Pos: p.lastTok.pos, + } + case p.consume(tokenKindLParen): + pos := p.lastTok.pos + var g []*DirectiveNode + for { + dir := p.parseDirective() + if dir == nil { + break + } + g = append(g, dir) + } + if !p.consume(tokenKindRParen) { + raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup) + } + if len(g) == 0 { + // Set an empty slice representing an empty directive group to distinguish between the following two cases. + // + // - #prec (); // vartan allows this case. + // - #prec; // This case will raise an error. + g = []*DirectiveNode{} + } + param = &ParameterNode{ + Group: g, + Pos: pos, + } + } + if p.consume(tokenKindExpantion) { + switch { + case param == nil: + raiseSyntaxError(p.pos.Row, synErrStrayExpOp) + case param.ID == "": + raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand) + } + param.Expansion = true + } + return param +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + panic(err) + } + } + p.pos = tok.pos + if tok.kind == tokenKindInvalid { + raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text) + } + if tok.kind == expected { + p.lastTok = tok + return true + } + p.peekedTok = tok + + return false +} + +func (p *parser) skip() { + var tok *token + var err error + for { + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + p.errs = append(p.errs, &verr.SpecError{ + Cause: err, + Row: p.pos.Row, + }) + continue + } + } + + break + } + + p.lastTok = tok + p.pos = tok.pos +} + +func (p *parser) skipOverTo(kind tokenKind) { + for { + if p.consume(kind) || p.consume(tokenKindEOF) { + return + } + p.skip() + } +} + +type SyntaxError struct { + message string +} + +func newSyntaxError(message string) *SyntaxError { + return &SyntaxError{ + message: message, + } +} + +func (e *SyntaxError) Error() string { + return e.message +} + +var ( + // lexical errors + synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore") + synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier") + synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively") + synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier") + synErrUnclosedTerminal = newSyntaxError("unclosed terminal") + synErrUnclosedString = newSyntaxError("unclosed string") + synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash") + synErrEmptyPattern = newSyntaxError("a pattern must include at least one character") + synErrEmptyString = newSyntaxError("a string must include at least one character") + + // syntax errors + synErrInvalidToken = newSyntaxError("invalid token") + synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;") + synErrNoProductionName = newSyntaxError("a production name is missing") + synErrNoColon = newSyntaxError("the colon must precede alternatives") + synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative") + synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol") + synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after the label marker @") + synErrNoDirectiveName = newSyntaxError("a directive needs a name") + synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing") + synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )") + synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal") + synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier") + synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier") + synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline") + synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element") ) +// Code generated by maleeni-go. DO NOT EDIT. type ModeID int diff --git a/src/urubu/spec/grammar/parser/lexer.go b/src/urubu/spec/grammar/parser/lexer.go deleted file mode 100644 index bd8a24f..0000000 --- a/src/urubu/spec/grammar/parser/lexer.go +++ /dev/null @@ -1,297 +0,0 @@ -//go:generate maleeni compile lexspec.json -o clexspec.json -//go:generate maleeni-go clexspec.json --package parser - -package parser - -import ( - _ "embed" - "fmt" - "io" - "regexp" - "strings" - - verr "urubu/error" -) - -type tokenKind string - -const ( - tokenKindKWFragment = tokenKind("fragment") - tokenKindID = tokenKind("id") - tokenKindTerminalPattern = tokenKind("terminal pattern") - tokenKindStringLiteral = tokenKind("string") - tokenKindColon = tokenKind(":") - tokenKindOr = tokenKind("|") - tokenKindSemicolon = tokenKind(";") - tokenKindLabelMarker = tokenKind("@") - tokenKindDirectiveMarker = tokenKind("#") - tokenKindExpantion = tokenKind("...") - tokenKindOrderedSymbolMarker = tokenKind("$") - tokenKindLParen = tokenKind("(") - tokenKindRParen = tokenKind(")") - tokenKindNewline = tokenKind("newline") - tokenKindEOF = tokenKind("eof") - tokenKindInvalid = tokenKind("invalid") -) - -var ( - reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`) - reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`) -) - -type Position struct { - Row int - Col int -} - -func newPosition(row, col int) Position { - return Position{ - Row: row, - Col: col, - } -} - -type token struct { - kind tokenKind - text string - pos Position -} - -func newSymbolToken(kind tokenKind, pos Position) *token { - return &token{ - kind: kind, - pos: pos, - } -} - -func newIDToken(text string, pos Position) *token { - return &token{ - kind: tokenKindID, - text: text, - pos: pos, - } -} - -func newTerminalPatternToken(text string, pos Position) *token { - return &token{ - kind: tokenKindTerminalPattern, - text: text, - pos: pos, - } -} - -func newStringLiteralToken(text string, pos Position) *token { - return &token{ - kind: tokenKindStringLiteral, - text: text, - pos: pos, - } -} - -func newEOFToken() *token { - return &token{ - kind: tokenKindEOF, - } -} - -func newInvalidToken(text string, pos Position) *token { - return &token{ - kind: tokenKindInvalid, - text: text, - pos: pos, - } -} - -type lexer struct { - d *Lexer - buf *token -} - -func newLexer(src io.Reader) (*lexer, error) { - d, err := NewLexer(NewLexSpec(), src) - if err != nil { - return nil, err - } - return &lexer{ - d: d, - }, nil -} - -func (l *lexer) next() (*token, error) { - if l.buf != nil { - tok := l.buf - l.buf = nil - return tok, nil - } - - var newline *token - for { - tok, err := l.lexAndSkipWSs() - if err != nil { - return nil, err - } - if tok.kind == tokenKindNewline { - newline = tok - continue - } - - if newline != nil { - l.buf = tok - return newline, nil - } - return tok, nil - } -} - -func (l *lexer) lexAndSkipWSs() (*token, error) { - var tok *Token - for { - var err error - tok, err = l.d.Next() - if err != nil { - return nil, err - } - if tok.Invalid { - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } - if tok.EOF { - return newEOFToken(), nil - } - switch tok.KindID { - case KindIDWhiteSpace: - continue - case KindIDLineComment: - continue - } - - break - } - - switch tok.KindID { - case KindIDNewline: - return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDKwFragment: - return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDIdentifier: - if !reIDChar.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidChar, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidUnderscorePos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.Contains(string(tok.Lexeme), "__") { - return nil, &verr.SpecError{ - Cause: synErrIDConsecutiveUnderscores, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if reIDInvalidDigitsPos.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidDigitsPos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDTerminalOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedTerminal, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDPattern: - // The escape sequences in a pattern string are interpreted by the lexer, except for the \". - // We must interpret the \" before passing them to the lexer because they are delimiters for - // the pattern strings. - fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`)) - case KindIDEscapeSymbol: - return nil, &verr.SpecError{ - Cause: synErrIncompletedEscSeq, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - case KindIDTerminalClose: - pat := b.String() - if pat == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyPattern, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDStringLiteralOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDCharSeq: - fmt.Fprint(&b, string(tok.Lexeme)) - case KindIDStringLiteralClose: - str := b.String() - if str == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDColon: - return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOr: - return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDSemicolon: - return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLabelMarker: - return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDDirectiveMarker: - return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDExpansion: - return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOrderedSymbolMarker: - return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLParen: - return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDRParen: - return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil - default: - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } -} diff --git a/src/urubu/spec/grammar/parser/parser.go b/src/urubu/spec/grammar/parser/parser.go deleted file mode 100644 index b604074..0000000 --- a/src/urubu/spec/grammar/parser/parser.go +++ /dev/null @@ -1,582 +0,0 @@ -package parser - -import ( - "fmt" - "io" - - verr "urubu/error" - spec "urubu/spec/grammar" -) - -type RootNode struct { - Directives []*DirectiveNode - Productions []*ProductionNode - LexProductions []*ProductionNode - Fragments []*FragmentNode -} - -type ProductionNode struct { - Directives []*DirectiveNode - LHS string - RHS []*AlternativeNode - Pos Position -} - -func (n *ProductionNode) isLexical() bool { - if len(n.RHS) == 1 && len(n.RHS[0].Elements) == 1 && n.RHS[0].Elements[0].Pattern != "" { - return true - } - return false -} - -type AlternativeNode struct { - Elements []*ElementNode - Directives []*DirectiveNode - Pos Position -} - -type ElementNode struct { - ID string - Pattern string - Label *LabelNode - Literally bool - Pos Position -} - -type LabelNode struct { - Name string - Pos Position -} - -type DirectiveNode struct { - Name string - Parameters []*ParameterNode - Pos Position -} - -type ParameterNode struct { - ID string - Pattern string - String string - OrderedSymbol string - Group []*DirectiveNode - Expansion bool - Pos Position -} - -type FragmentNode struct { - LHS string - RHS string - Pos Position -} - -func raiseSyntaxError(row int, synErr *SyntaxError) { - panic(&verr.SpecError{ - Cause: synErr, - Row: row, - }) -} - -func raiseSyntaxErrorWithDetail(row int, synErr *SyntaxError, detail string) { - panic(&verr.SpecError{ - Cause: synErr, - Detail: detail, - Row: row, - }) -} - -func Parse(src io.Reader) (*RootNode, error) { - p, err := newParser(src) - if err != nil { - return nil, err - } - - return p.parse() -} - -type parser struct { - lex *lexer - peekedTok *token - lastTok *token - errs verr.SpecErrors - - // A token position that the parser read at last. - // It is used as additional information in error messages. - pos Position -} - -func newParser(src io.Reader) (*parser, error) { - lex, err := newLexer(src) - if err != nil { - return nil, err - } - return &parser{ - lex: lex, - }, nil -} - -func (p *parser) parse() (root *RootNode, retErr error) { - root = p.parseRoot() - if len(p.errs) > 0 { - return nil, p.errs - } - - return root, nil -} - -func (p *parser) parseRoot() *RootNode { - defer func() { - err := recover() - if err != nil { - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(fmt.Errorf("an unexpected error occurred: %v", err)) - } - p.errs = append(p.errs, specErr) - } - }() - - var dirs []*DirectiveNode - var prods []*ProductionNode - var lexProds []*ProductionNode - var fragments []*FragmentNode - for { - dir := p.parseTopLevelDirective() - if dir != nil { - dirs = append(dirs, dir) - continue - } - - fragment := p.parseFragment() - if fragment != nil { - fragments = append(fragments, fragment) - continue - } - - prod := p.parseProduction() - if prod != nil { - if prod.isLexical() { - lexProds = append(lexProds, prod) - } else { - prods = append(prods, prod) - } - continue - } - - if p.consume(tokenKindEOF) { - break - } - } - - return &RootNode{ - Directives: dirs, - Productions: prods, - LexProductions: lexProds, - Fragments: fragments, - } -} - -func (p *parser) parseTopLevelDirective() *DirectiveNode { - defer func() { - err := recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - dir := p.parseDirective() - if dir == nil { - return nil - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrTopLevelDirNoSemicolon) - } - - return dir -} - -func (p *parser) parseFragment() *FragmentNode { - defer func() { - err := recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindKWFragment) { - return nil - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoProductionName) - } - lhs := p.lastTok.text - lhsPos := p.lastTok.pos - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindColon) { - raiseSyntaxError(p.pos.Row, synErrNoColon) - } - - var rhs string - switch { - case p.consume(tokenKindTerminalPattern): - rhs = p.lastTok.text - case p.consume(tokenKindStringLiteral): - rhs = spec.EscapePattern(p.lastTok.text) - default: - raiseSyntaxError(p.pos.Row, synErrFragmentNoPattern) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrNoSemicolon) - } - - if !p.consume(tokenKindNewline) { - if !p.consume(tokenKindEOF) { - raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) - } - } - - return &FragmentNode{ - LHS: lhs, - RHS: rhs, - Pos: lhsPos, - } -} - -func (p *parser) parseProduction() *ProductionNode { - defer func() { - err := recover() - if err == nil { - return - } - - specErr, ok := err.(*verr.SpecError) - if !ok { - panic(err) - } - - p.errs = append(p.errs, specErr) - p.skipOverTo(tokenKindSemicolon) - }() - - p.consume(tokenKindNewline) - - if p.consume(tokenKindEOF) { - return nil - } - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoProductionName) - } - lhs := p.lastTok.text - lhsPos := p.lastTok.pos - - var dirs []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - dirs = append(dirs, dir) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindColon) { - raiseSyntaxError(p.pos.Row, synErrNoColon) - } - - alt := p.parseAlternative() - rhs := []*AlternativeNode{alt} - for { - p.consume(tokenKindNewline) - - if !p.consume(tokenKindOr) { - break - } - alt := p.parseAlternative() - rhs = append(rhs, alt) - } - - p.consume(tokenKindNewline) - - if !p.consume(tokenKindSemicolon) { - raiseSyntaxError(p.pos.Row, synErrNoSemicolon) - } - - if !p.consume(tokenKindNewline) { - if !p.consume(tokenKindEOF) { - raiseSyntaxError(p.pos.Row, synErrSemicolonNoNewline) - } - } - - prod := &ProductionNode{ - Directives: dirs, - LHS: lhs, - RHS: rhs, - Pos: lhsPos, - } - - // Vartan's driver must provide a user with the names of expected tokens when a syntax error occurs. - // However, if a pattern appears directly in an alternative, Vartan's compiler cannot assign an appropriate - // name to the pattern. Therefore, this code prohibits alternatives from containing patterns. - if !prod.isLexical() { - for _, alt := range prod.RHS { - for _, elem := range alt.Elements { - if elem.Pattern != "" { - raiseSyntaxError(elem.Pos.Row, synErrPatternInAlt) - } - } - } - } - - return prod -} - -func (p *parser) parseAlternative() *AlternativeNode { - elems := []*ElementNode{} - for { - elem := p.parseElement() - if elem == nil { - break - } - elems = append(elems, elem) - } - - // When a length of an alternative is zero, we cannot set a position. - var firstElemPos Position - if len(elems) > 0 { - firstElemPos = elems[0].Pos - } - - var dirs []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - dirs = append(dirs, dir) - } - - return &AlternativeNode{ - Elements: elems, - Directives: dirs, - Pos: firstElemPos, - } -} - -func (p *parser) parseElement() *ElementNode { - var elem *ElementNode - switch { - case p.consume(tokenKindID): - elem = &ElementNode{ - ID: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindTerminalPattern): - elem = &ElementNode{ - Pattern: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindStringLiteral): - elem = &ElementNode{ - Pattern: p.lastTok.text, - Literally: true, - Pos: p.lastTok.pos, - } - default: - if p.consume(tokenKindLabelMarker) { - raiseSyntaxError(p.pos.Row, synErrLabelWithNoSymbol) - } - return nil - } - if p.consume(tokenKindLabelMarker) { - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoLabel) - } - elem.Label = &LabelNode{ - Name: p.lastTok.text, - Pos: p.lastTok.pos, - } - } - return elem -} - -func (p *parser) parseDirective() *DirectiveNode { - p.consume(tokenKindNewline) - - if !p.consume(tokenKindDirectiveMarker) { - return nil - } - dirPos := p.lastTok.pos - - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoDirectiveName) - } - name := p.lastTok.text - - var params []*ParameterNode - for { - param := p.parseParameter() - if param == nil { - break - } - params = append(params, param) - } - - return &DirectiveNode{ - Name: name, - Parameters: params, - Pos: dirPos, - } -} - -func (p *parser) parseParameter() *ParameterNode { - var param *ParameterNode - switch { - case p.consume(tokenKindID): - param = &ParameterNode{ - ID: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindTerminalPattern): - param = &ParameterNode{ - Pattern: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindStringLiteral): - param = &ParameterNode{ - String: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindOrderedSymbolMarker): - if !p.consume(tokenKindID) { - raiseSyntaxError(p.pos.Row, synErrNoOrderedSymbolName) - } - param = &ParameterNode{ - OrderedSymbol: p.lastTok.text, - Pos: p.lastTok.pos, - } - case p.consume(tokenKindLParen): - pos := p.lastTok.pos - var g []*DirectiveNode - for { - dir := p.parseDirective() - if dir == nil { - break - } - g = append(g, dir) - } - if !p.consume(tokenKindRParen) { - raiseSyntaxError(p.pos.Row, synErrUnclosedDirGroup) - } - if len(g) == 0 { - // Set an empty slice representing an empty directive group to distinguish between the following two cases. - // - // - #prec (); // vartan allows this case. - // - #prec; // This case will raise an error. - g = []*DirectiveNode{} - } - param = &ParameterNode{ - Group: g, - Pos: pos, - } - } - if p.consume(tokenKindExpantion) { - switch { - case param == nil: - raiseSyntaxError(p.pos.Row, synErrStrayExpOp) - case param.ID == "": - raiseSyntaxError(p.pos.Row, synErrInvalidExpOperand) - } - param.Expansion = true - } - return param -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - panic(err) - } - } - p.pos = tok.pos - if tok.kind == tokenKindInvalid { - raiseSyntaxErrorWithDetail(p.pos.Row, synErrInvalidToken, tok.text) - } - if tok.kind == expected { - p.lastTok = tok - return true - } - p.peekedTok = tok - - return false -} - -func (p *parser) skip() { - var tok *token - var err error - for { - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - p.errs = append(p.errs, &verr.SpecError{ - Cause: err, - Row: p.pos.Row, - }) - continue - } - } - - break - } - - p.lastTok = tok - p.pos = tok.pos -} - -func (p *parser) skipOverTo(kind tokenKind) { - for { - if p.consume(kind) || p.consume(tokenKindEOF) { - return - } - p.skip() - } -} diff --git a/src/urubu/spec/grammar/parser/syntax_error.go b/src/urubu/spec/grammar/parser/syntax_error.go deleted file mode 100644 index 719fb94..0000000 --- a/src/urubu/spec/grammar/parser/syntax_error.go +++ /dev/null @@ -1,45 +0,0 @@ -package parser - -type SyntaxError struct { - message string -} - -func newSyntaxError(message string) *SyntaxError { - return &SyntaxError{ - message: message, - } -} - -func (e *SyntaxError) Error() string { - return e.message -} - -var ( - // lexical errors - synErrIDInvalidChar = newSyntaxError("an identifier can contain only the lower-case letter, the digits, and the underscore") - synErrIDInvalidUnderscorePos = newSyntaxError("the underscore cannot be placed at the beginning or end of an identifier") - synErrIDConsecutiveUnderscores = newSyntaxError("the underscore cannot be placed consecutively") - synErrIDInvalidDigitsPos = newSyntaxError("the digits cannot be placed at the biginning of an identifier") - synErrUnclosedTerminal = newSyntaxError("unclosed terminal") - synErrUnclosedString = newSyntaxError("unclosed string") - synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following a backslash") - synErrEmptyPattern = newSyntaxError("a pattern must include at least one character") - synErrEmptyString = newSyntaxError("a string must include at least one character") - - // syntax errors - synErrInvalidToken = newSyntaxError("invalid token") - synErrTopLevelDirNoSemicolon = newSyntaxError("a top-level directive must be followed by ;") - synErrNoProductionName = newSyntaxError("a production name is missing") - synErrNoColon = newSyntaxError("the colon must precede alternatives") - synErrNoSemicolon = newSyntaxError("the semicolon is missing at the last of an alternative") - synErrLabelWithNoSymbol = newSyntaxError("a label must follow a symbol") - synErrNoLabel = newSyntaxError("an identifier that represents a label is missing after the label marker @") - synErrNoDirectiveName = newSyntaxError("a directive needs a name") - synErrNoOrderedSymbolName = newSyntaxError("an ordered symbol name is missing") - synErrUnclosedDirGroup = newSyntaxError("a directive group must be closed by )") - synErrPatternInAlt = newSyntaxError("a pattern literal cannot appear directly in an alternative. instead, please define a terminal symbol with the pattern literal") - synErrStrayExpOp = newSyntaxError("an expansion operator ... must be preceded by an identifier") - synErrInvalidExpOperand = newSyntaxError("an expansion operator ... can be applied to only an identifier") - synErrSemicolonNoNewline = newSyntaxError("a semicolon must be followed by a newline") - synErrFragmentNoPattern = newSyntaxError("a fragment needs one pattern element") -) diff --git a/src/urubu/spec/grammar/util.go b/src/urubu/spec/grammar/util.go deleted file mode 100644 index bf3f233..0000000 --- a/src/urubu/spec/grammar/util.go +++ /dev/null @@ -1,21 +0,0 @@ -package grammar - -import "strings" - -var rep = strings.NewReplacer( - `.`, `\.`, - `*`, `\*`, - `+`, `\+`, - `?`, `\?`, - `|`, `\|`, - `(`, `\(`, - `)`, `\)`, - `[`, `\[`, - `\`, `\\`, -) - -// EscapePattern escapes the special characters. -// For example, EscapePattern(`+`) returns `\+`. -func EscapePattern(s string) string { - return rep.Replace(s) -} |