diff options
Diffstat (limited to 'src/urubu/spec/grammar/parser/lexer.go')
-rw-r--r-- | src/urubu/spec/grammar/parser/lexer.go | 297 |
1 files changed, 0 insertions, 297 deletions
diff --git a/src/urubu/spec/grammar/parser/lexer.go b/src/urubu/spec/grammar/parser/lexer.go deleted file mode 100644 index bd8a24f..0000000 --- a/src/urubu/spec/grammar/parser/lexer.go +++ /dev/null @@ -1,297 +0,0 @@ -//go:generate maleeni compile lexspec.json -o clexspec.json -//go:generate maleeni-go clexspec.json --package parser - -package parser - -import ( - _ "embed" - "fmt" - "io" - "regexp" - "strings" - - verr "urubu/error" -) - -type tokenKind string - -const ( - tokenKindKWFragment = tokenKind("fragment") - tokenKindID = tokenKind("id") - tokenKindTerminalPattern = tokenKind("terminal pattern") - tokenKindStringLiteral = tokenKind("string") - tokenKindColon = tokenKind(":") - tokenKindOr = tokenKind("|") - tokenKindSemicolon = tokenKind(";") - tokenKindLabelMarker = tokenKind("@") - tokenKindDirectiveMarker = tokenKind("#") - tokenKindExpantion = tokenKind("...") - tokenKindOrderedSymbolMarker = tokenKind("$") - tokenKindLParen = tokenKind("(") - tokenKindRParen = tokenKind(")") - tokenKindNewline = tokenKind("newline") - tokenKindEOF = tokenKind("eof") - tokenKindInvalid = tokenKind("invalid") -) - -var ( - reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`) - reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`) -) - -type Position struct { - Row int - Col int -} - -func newPosition(row, col int) Position { - return Position{ - Row: row, - Col: col, - } -} - -type token struct { - kind tokenKind - text string - pos Position -} - -func newSymbolToken(kind tokenKind, pos Position) *token { - return &token{ - kind: kind, - pos: pos, - } -} - -func newIDToken(text string, pos Position) *token { - return &token{ - kind: tokenKindID, - text: text, - pos: pos, - } -} - -func newTerminalPatternToken(text string, pos Position) *token { - return &token{ - kind: tokenKindTerminalPattern, - text: text, - pos: pos, - } -} - -func newStringLiteralToken(text string, pos Position) *token { - return &token{ - kind: tokenKindStringLiteral, - text: text, - pos: pos, - } -} - -func newEOFToken() *token { - return &token{ - kind: tokenKindEOF, - } -} - -func newInvalidToken(text string, pos Position) *token { - return &token{ - kind: tokenKindInvalid, - text: text, - pos: pos, - } -} - -type lexer struct { - d *Lexer - buf *token -} - -func newLexer(src io.Reader) (*lexer, error) { - d, err := NewLexer(NewLexSpec(), src) - if err != nil { - return nil, err - } - return &lexer{ - d: d, - }, nil -} - -func (l *lexer) next() (*token, error) { - if l.buf != nil { - tok := l.buf - l.buf = nil - return tok, nil - } - - var newline *token - for { - tok, err := l.lexAndSkipWSs() - if err != nil { - return nil, err - } - if tok.kind == tokenKindNewline { - newline = tok - continue - } - - if newline != nil { - l.buf = tok - return newline, nil - } - return tok, nil - } -} - -func (l *lexer) lexAndSkipWSs() (*token, error) { - var tok *Token - for { - var err error - tok, err = l.d.Next() - if err != nil { - return nil, err - } - if tok.Invalid { - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } - if tok.EOF { - return newEOFToken(), nil - } - switch tok.KindID { - case KindIDWhiteSpace: - continue - case KindIDLineComment: - continue - } - - break - } - - switch tok.KindID { - case KindIDNewline: - return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDKwFragment: - return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDIdentifier: - if !reIDChar.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidChar, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidUnderscorePos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if strings.Contains(string(tok.Lexeme), "__") { - return nil, &verr.SpecError{ - Cause: synErrIDConsecutiveUnderscores, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - if reIDInvalidDigitsPos.Match(tok.Lexeme) { - return nil, &verr.SpecError{ - Cause: synErrIDInvalidDigitsPos, - Detail: string(tok.Lexeme), - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDTerminalOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedTerminal, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDPattern: - // The escape sequences in a pattern string are interpreted by the lexer, except for the \". - // We must interpret the \" before passing them to the lexer because they are delimiters for - // the pattern strings. - fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`)) - case KindIDEscapeSymbol: - return nil, &verr.SpecError{ - Cause: synErrIncompletedEscSeq, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - case KindIDTerminalClose: - pat := b.String() - if pat == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyPattern, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDStringLiteralOpen: - var b strings.Builder - for { - tok, err := l.d.Next() - if err != nil { - return nil, err - } - if tok.EOF { - return nil, &verr.SpecError{ - Cause: synErrUnclosedString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - switch tok.KindID { - case KindIDCharSeq: - fmt.Fprint(&b, string(tok.Lexeme)) - case KindIDStringLiteralClose: - str := b.String() - if str == "" { - return nil, &verr.SpecError{ - Cause: synErrEmptyString, - Row: tok.Row + 1, - Col: tok.Col + 1, - } - } - return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil - } - } - case KindIDColon: - return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOr: - return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDSemicolon: - return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLabelMarker: - return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDDirectiveMarker: - return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDExpansion: - return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDOrderedSymbolMarker: - return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDLParen: - return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil - case KindIDRParen: - return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil - default: - return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil - } -} |