diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-06-14 23:22:02 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-06-15 19:16:58 +0900 |
commit | f16811613aeb79444a3555115e4031f68cd183b9 (patch) | |
tree | 3e4207ddd7f331fdcb11c86e2273bcbe262e9d0f /spec/lexer.go | |
parent | Update README (diff) | |
download | cotia-f16811613aeb79444a3555115e4031f68cd183b9.tar.gz cotia-f16811613aeb79444a3555115e4031f68cd183b9.tar.xz |
Add spec parser
Currently, the parser supports only definitions of a lexical specification.
Diffstat (limited to 'spec/lexer.go')
-rw-r--r-- | spec/lexer.go | 137 |
1 files changed, 137 insertions, 0 deletions
diff --git a/spec/lexer.go b/spec/lexer.go new file mode 100644 index 0000000..54eb13a --- /dev/null +++ b/spec/lexer.go @@ -0,0 +1,137 @@ +//go:generate maleeni compile -l lexspec.json -o clexspec.json + +package spec + +import ( + _ "embed" + "encoding/json" + "fmt" + "io" + "strings" + + mldriver "github.com/nihei9/maleeni/driver" + mlspec "github.com/nihei9/maleeni/spec" +) + +type tokenKind string + +const ( + tokenKindID = tokenKind("id") + tokenKindTerminalPattern = tokenKind("terminal pattern") + tokenKindColon = tokenKind(":") + tokenKindSemicolon = tokenKind(";") + tokenKindEOF = tokenKind("eof") + tokenKindInvalid = tokenKind("invalid") +) + +type token struct { + kind tokenKind + text string +} + +func newSymbolToken(kind tokenKind) *token { + return &token{ + kind: kind, + } +} + +func newIDToken(text string) *token { + return &token{ + kind: tokenKindID, + text: text, + } +} + +func newTerminalPatternToken(text string) *token { + return &token{ + kind: tokenKindTerminalPattern, + text: text, + } +} + +func newEOFToken() *token { + return &token{ + kind: tokenKindEOF, + } +} + +func newInvalidToken(text string) *token { + return &token{ + kind: tokenKindInvalid, + text: text, + } +} + +type lexer struct { + s *mlspec.CompiledLexSpec + d *mldriver.Lexer + dufTok *token +} + +//go:embed clexspec.json +var lexspec []byte + +func newLexer(src io.Reader) (*lexer, error) { + s := &mlspec.CompiledLexSpec{} + err := json.Unmarshal(lexspec, s) + if err != nil { + return nil, err + } + d, err := mldriver.NewLexer(s, src) + if err != nil { + return nil, err + } + return &lexer{ + s: s, + d: d, + }, nil +} + +func (l *lexer) next() (*token, error) { + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.Invalid { + newInvalidToken(tok.Text()) + } + if tok.EOF { + return newEOFToken(), nil + } + switch tok.KindName { + case "white_space": + continue + case "newline": + continue + case "identifier": + return newIDToken(tok.Text()), nil + 
case "terminal_open": + var b strings.Builder + for { + tok, err := l.d.Next() + if err != nil { + return nil, err + } + if tok.EOF { + return nil, synErrUnclosedTerminal + } + switch tok.KindName { + case "pattern": + // Remove '\' character. + fmt.Fprintf(&b, strings.ReplaceAll(tok.Text(), `\"`, `"`)) + case "escape_symbol": + return nil, synErrIncompletedEscSeq + case "terminal_close": + return newTerminalPatternToken(b.String()), nil + } + } + case "colon": + return newSymbolToken(tokenKindColon), nil + case "semicolon": + return newSymbolToken(tokenKindSemicolon), nil + default: + return newInvalidToken(tok.Text()), nil + } + } +} |