diff options
Diffstat (limited to 'spec/grammar/parser/lexer.go')
-rw-r--r-- | spec/grammar/parser/lexer.go | 297 |
1 file changed, 297 insertions, 0 deletions
//go:generate maleeni compile lexspec.json -o clexspec.json
//go:generate maleeni-go clexspec.json --package parser

// This file wraps the maleeni-generated lexer (regenerated by the
// go:generate directives above) and turns its raw tokens into the
// tokens the grammar-spec parser consumes: it skips whitespace and
// comments, collapses newline runs, validates identifiers, and
// assembles terminal patterns and string literals.

package parser

import (
	// NOTE(review): no //go:embed directive appears in this file;
	// confirm this blank import is still required.
	_ "embed"
	"fmt"
	"io"
	"regexp"
	"strings"

	verr "github.com/nihei9/vartan/error"
)

// tokenKind identifies the category of a token produced by this lexer.
type tokenKind string

const (
	tokenKindKWFragment      = tokenKind("fragment")
	tokenKindID              = tokenKind("id")
	tokenKindTerminalPattern = tokenKind("terminal pattern")
	tokenKindStringLiteral   = tokenKind("string")
	tokenKindColon           = tokenKind(":")
	tokenKindOr              = tokenKind("|")
	tokenKindSemicolon       = tokenKind(";")
	tokenKindLabelMarker     = tokenKind("@")
	tokenKindDirectiveMarker = tokenKind("#")
	// NOTE(review): identifier misspells "Expansion"; renaming would
	// touch other files in this package, so it is left as is.
	tokenKindExpantion           = tokenKind("...")
	tokenKindOrderedSymbolMarker = tokenKind("$")
	tokenKindLParen              = tokenKind("(")
	tokenKindRParen              = tokenKind(")")
	tokenKindNewline             = tokenKind("newline")
	tokenKindEOF                 = tokenKind("eof")
	tokenKindInvalid             = tokenKind("invalid")
)

var (
	// reIDChar is the full character set an identifier may consist of.
	reIDChar = regexp.MustCompile(`^[0-9a-z_]+$`)
	// reIDInvalidDigitsPos matches identifiers that start with a digit,
	// which is rejected.
	reIDInvalidDigitsPos = regexp.MustCompile(`^[0-9]`)
)

// Position is a row/column location in the source text. Every
// constructor call in this file passes the underlying lexer's
// coordinates plus one, so positions are 1-based here.
type Position struct {
	Row int
	Col int
}

// newPosition returns a Position for the given row and column.
func newPosition(row, col int) Position {
	return Position{
		Row: row,
		Col: col,
	}
}

// token is a single lexical token handed to the parser.
type token struct {
	kind tokenKind // category of the token
	text string    // lexeme text; empty for pure symbol tokens
	pos  Position  // location of the token in the source
}

// newSymbolToken returns a token of the given kind with no lexeme text
// (used for punctuation, keywords, newline, and similar fixed symbols).
func newSymbolToken(kind tokenKind, pos Position) *token {
	return &token{
		kind: kind,
		pos:  pos,
	}
}

// newIDToken returns an identifier token carrying its lexeme text.
func newIDToken(text string, pos Position) *token {
	return &token{
		kind: tokenKindID,
		text: text,
		pos:  pos,
	}
}

// newTerminalPatternToken returns a terminal-pattern token whose text
// is the assembled pattern body (delimiters removed).
func newTerminalPatternToken(text string, pos Position) *token {
	return &token{
		kind: tokenKindTerminalPattern,
		text: text,
		pos:  pos,
	}
}

// newStringLiteralToken returns a string-literal token whose text is
// the assembled literal body (delimiters removed).
func newStringLiteralToken(text string, pos Position) *token {
	return &token{
		kind: tokenKindStringLiteral,
		text: text,
		pos:  pos,
	}
}

// newEOFToken returns the end-of-input token. It carries no position.
func newEOFToken() *token {
	return &token{
		kind: tokenKindEOF,
	}
}

// newInvalidToken returns a token for input the underlying lexer could
// not recognize; the offending lexeme is preserved for error reporting.
func newInvalidToken(text string, pos Position) *token {
	return &token{
		kind: tokenKindInvalid,
		text: text,
		pos:  pos,
	}
}

// lexer adapts the generated lexer to the parser's needs.
type lexer struct {
	d   *Lexer
	buf *token // one-token pushback used by next() for newline collapsing
}

// newLexer builds a lexer reading from src, backed by the generated
// lexer and its compiled spec.
func newLexer(src io.Reader) (*lexer, error) {
	d, err := NewLexer(NewLexSpec(), src)
	if err != nil {
		return nil, err
	}
	return &lexer{
		d: d,
	}, nil
}

// next returns the next significant token. A run of consecutive
// newline tokens is collapsed into a single newline token (carrying
// the position of the last newline in the run); the token that follows
// the run is stashed in l.buf and returned on the subsequent call.
func (l *lexer) next() (*token, error) {
	if l.buf != nil {
		tok := l.buf
		l.buf = nil
		return tok, nil
	}

	var newline *token
	for {
		tok, err := l.lexAndSkipWSs()
		if err != nil {
			return nil, err
		}
		if tok.kind == tokenKindNewline {
			newline = tok
			continue
		}

		if newline != nil {
			l.buf = tok
			return newline, nil
		}
		return tok, nil
	}
}

// lexAndSkipWSs pulls tokens from the underlying lexer, discarding
// whitespace and line comments, and maps the first remaining token to
// a parser token. Identifiers are validated (charset, underscore
// placement, no leading digit); terminal patterns and string literals
// are assembled from multiple underlying tokens until their closing
// delimiter. Validation failures are reported as *verr.SpecError with
// 1-based row/column. Unrecognized kinds yield an invalid token.
func (l *lexer) lexAndSkipWSs() (*token, error) {
	var tok *Token
	for {
		var err error
		tok, err = l.d.Next()
		if err != nil {
			return nil, err
		}
		if tok.Invalid {
			return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
		}
		if tok.EOF {
			return newEOFToken(), nil
		}
		switch tok.KindID {
		case KindIDWhiteSpace:
			continue
		case KindIDLineComment:
			continue
		}

		break
	}

	switch tok.KindID {
	case KindIDNewline:
		return newSymbolToken(tokenKindNewline, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDKwFragment:
		return newSymbolToken(tokenKindKWFragment, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDIdentifier:
		// Identifiers may contain only [0-9a-z_].
		if !reIDChar.Match(tok.Lexeme) {
			return nil, &verr.SpecError{
				Cause:  synErrIDInvalidChar,
				Detail: string(tok.Lexeme),
				Row:    tok.Row + 1,
				Col:    tok.Col + 1,
			}
		}
		// Underscores may not appear at either end of an identifier.
		if strings.HasPrefix(string(tok.Lexeme), "_") || strings.HasSuffix(string(tok.Lexeme), "_") {
			return nil, &verr.SpecError{
				Cause:  synErrIDInvalidUnderscorePos,
				Detail: string(tok.Lexeme),
				Row:    tok.Row + 1,
				Col:    tok.Col + 1,
			}
		}
		// Consecutive underscores are not allowed.
		if strings.Contains(string(tok.Lexeme), "__") {
			return nil, &verr.SpecError{
				Cause:  synErrIDConsecutiveUnderscores,
				Detail: string(tok.Lexeme),
				Row:    tok.Row + 1,
				Col:    tok.Col + 1,
			}
		}
		// Identifiers may not begin with a digit.
		if reIDInvalidDigitsPos.Match(tok.Lexeme) {
			return nil, &verr.SpecError{
				Cause:  synErrIDInvalidDigitsPos,
				Detail: string(tok.Lexeme),
				Row:    tok.Row + 1,
				Col:    tok.Col + 1,
			}
		}
		return newIDToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDTerminalOpen:
		// Concatenate pattern fragments until the closing delimiter.
		// The resulting token's position is that of the CLOSING
		// delimiter, not the opening one.
		var b strings.Builder
		for {
			tok, err := l.d.Next()
			if err != nil {
				return nil, err
			}
			if tok.EOF {
				return nil, &verr.SpecError{
					Cause: synErrUnclosedTerminal,
					Row:   tok.Row + 1,
					Col:   tok.Col + 1,
				}
			}
			switch tok.KindID {
			case KindIDPattern:
				// The escape sequences in a pattern string are interpreted by the lexer, except for the \".
				// We must interpret the \" before passing them to the lexer because they are delimiters for
				// the pattern strings.
				fmt.Fprint(&b, strings.ReplaceAll(string(tok.Lexeme), `\"`, `"`))
			case KindIDEscapeSymbol:
				// A trailing backslash with nothing to escape.
				return nil, &verr.SpecError{
					Cause: synErrIncompletedEscSeq,
					Row:   tok.Row + 1,
					Col:   tok.Col + 1,
				}
			case KindIDTerminalClose:
				pat := b.String()
				if pat == "" {
					return nil, &verr.SpecError{
						Cause: synErrEmptyPattern,
						Row:   tok.Row + 1,
						Col:   tok.Col + 1,
					}
				}
				return newTerminalPatternToken(pat, newPosition(tok.Row+1, tok.Col+1)), nil
			}
		}
	case KindIDStringLiteralOpen:
		// Concatenate character sequences until the closing delimiter.
		// As above, the token's position is that of the closing
		// delimiter.
		var b strings.Builder
		for {
			tok, err := l.d.Next()
			if err != nil {
				return nil, err
			}
			if tok.EOF {
				return nil, &verr.SpecError{
					Cause: synErrUnclosedString,
					Row:   tok.Row + 1,
					Col:   tok.Col + 1,
				}
			}
			switch tok.KindID {
			case KindIDCharSeq:
				fmt.Fprint(&b, string(tok.Lexeme))
			case KindIDStringLiteralClose:
				str := b.String()
				if str == "" {
					return nil, &verr.SpecError{
						Cause: synErrEmptyString,
						Row:   tok.Row + 1,
						Col:   tok.Col + 1,
					}
				}
				return newStringLiteralToken(str, newPosition(tok.Row+1, tok.Col+1)), nil
			}
		}
	case KindIDColon:
		return newSymbolToken(tokenKindColon, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDOr:
		return newSymbolToken(tokenKindOr, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDSemicolon:
		return newSymbolToken(tokenKindSemicolon, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDLabelMarker:
		return newSymbolToken(tokenKindLabelMarker, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDDirectiveMarker:
		return newSymbolToken(tokenKindDirectiveMarker, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDExpansion:
		return newSymbolToken(tokenKindExpantion, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDOrderedSymbolMarker:
		return newSymbolToken(tokenKindOrderedSymbolMarker, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDLParen:
		return newSymbolToken(tokenKindLParen, newPosition(tok.Row+1, tok.Col+1)), nil
	case KindIDRParen:
		return newSymbolToken(tokenKindRParen, newPosition(tok.Row+1, tok.Col+1)), nil
	default:
		return newInvalidToken(string(tok.Lexeme), newPosition(tok.Row+1, tok.Col+1)), nil
	}
}