diff options
-rw-r--r-- | cmd/maleeni/lex.go | 5 | ||||
-rw-r--r-- | driver/lexer.go | 48 | ||||
-rw-r--r-- | driver/lexer_test.go | 112 |
3 files changed, 140 insertions, 25 deletions
diff --git a/cmd/maleeni/lex.go b/cmd/maleeni/lex.go index e6eab18..142f996 100644 --- a/cmd/maleeni/lex.go +++ b/cmd/maleeni/lex.go @@ -24,7 +24,10 @@ func init() { Use: "lex clexspec", Short: "Tokenize a text stream", Long: `lex takes a text stream and tokenizes it according to a compiled lexical specification. -As use ` + "`maleeni compile`" + `, you can generate the specification.`, +As use ` + "`maleeni compile`" + `, you can generate the specification. + +Note that passive mode transitions are not performed. Thus, if there is a mode in +your lexical specification that is set passively, lexemes in that mode will not be recognized.`, Example: ` cat src | maleeni lex clexspec.json`, Args: cobra.ExactArgs(1), RunE: runLex, diff --git a/driver/lexer.go b/driver/lexer.go index 8d4a10f..1e54fa6 100644 --- a/driver/lexer.go +++ b/driver/lexer.go @@ -151,6 +151,13 @@ func (t *Token) MarshalJSON() ([]byte, error) { type LexerOption func(l *Lexer) error +func DisableModeTransition() LexerOption { + return func(l *Lexer) error { + l.passiveModeTran = true + return nil + } +} + func EnableLogging(w io.Writer) LexerOption { return func(l *Lexer) error { logger, err := log.NewLogger(w) @@ -163,12 +170,13 @@ func EnableLogging(w io.Writer) LexerOption { } type Lexer struct { - clspec *spec.CompiledLexSpec - src []byte - srcPtr int - tokBuf []*Token - modeStack []spec.LexModeNum - logger log.Logger + clspec *spec.CompiledLexSpec + src []byte + srcPtr int + tokBuf []*Token + modeStack []spec.LexModeNum + passiveModeTran bool + logger log.Logger } func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) { @@ -183,7 +191,8 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) modeStack: []spec.LexModeNum{ clspec.InitialMode, }, - logger: log.NewNopLogger(), + passiveModeTran: false, + logger: log.NewNopLogger(), } for _, opt := range opts { err := opt(l) @@ -201,7 +210,7 @@ func (l *Lexer) Next() (*Token, error) { State: mode: #%v %v pointer: %v - token buffer: %v`, l.mode(), l.clspec.Modes[l.mode()], l.srcPtr, l.tokBuf) + token buffer: %v`, l.Mode(), l.clspec.Modes[l.Mode()], l.srcPtr, l.tokBuf) if len(l.tokBuf) > 0 { tok := l.tokBuf[0] @@ -212,7 +221,7 @@ func (l *Lexer) Next() (*Token, error) { return tok, nil } - tok, err := l.nextAndTranMode() + tok, err := l.nextAndTransition() if err != nil { l.logger.Log(" Detectes an error: %v", err) return nil, err @@ -226,7 +235,7 @@ func (l *Lexer) Next() (*Token, error) { } errTok := tok for { - tok, err = l.nextAndTranMode() + tok, err = l.nextAndTransition() if err != nil { l.logger.Log(" Detectes an error: %v", err) return nil, err @@ -246,7 +255,7 @@ func (l *Lexer) Next() (*Token, error) { return errTok, nil } -func (l *Lexer) nextAndTranMode() (*Token, error) { +func (l *Lexer) nextAndTransition() (*Token, error) { tok, err := l.next() if err != nil { return nil, err @@ -254,16 +263,19 @@ func (l *Lexer) nextAndTranMode() (*Token, error) { if tok.EOF || tok.Invalid { return tok, nil } - spec := l.clspec.Specs[l.mode()] + if l.passiveModeTran { + return tok, nil + } + spec := l.clspec.Specs[l.Mode()] if spec.Pop[tok.Kind] == 1 { - err := l.popMode() + err := l.PopMode() if err != nil { return nil, err } } mode := spec.Push[tok.Kind] if !mode.IsNil() { - l.pushMode(mode) + l.PushMode(mode) } // The checking length of the mode stack must be at after pop and push operations // because those operations can be performed at the same time. @@ -277,7 +289,7 @@ func (l *Lexer) nextAndTranMode() (*Token, error) { } func (l *Lexer) next() (*Token, error) { - mode := l.mode() + mode := l.Mode() modeName := l.clspec.Modes[mode] spec := l.clspec.Specs[mode] state := spec.DFA.InitialState @@ -343,15 +355,15 @@ func (l *Lexer) lookupNextState(mode spec.LexModeNum, state int, v int) (int, bo return next, true } -func (l *Lexer) mode() spec.LexModeNum { +func (l *Lexer) Mode() spec.LexModeNum { return l.modeStack[len(l.modeStack)-1] } -func (l *Lexer) pushMode(mode spec.LexModeNum) { +func (l *Lexer) PushMode(mode spec.LexModeNum) { l.modeStack = append(l.modeStack, mode) } -func (l *Lexer) popMode() error { +func (l *Lexer) PopMode() error { sLen := len(l.modeStack) if sLen == 0 { return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more") diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 87a381c..33edbc0 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -52,9 +52,11 @@ func newEOFTokenDefault() *Token { func TestLexer_Next(t *testing.T) { test := []struct { - lspec *spec.LexSpec - src string - tokens []*Token + lspec *spec.LexSpec + src string + tokens []*Token + passiveModeTran bool + tran func(l *Lexer, tok *Token) error }{ { lspec: &spec.LexSpec{ @@ -576,17 +578,108 @@ func TestLexer_Next(t *testing.T) { newEOFTokenDefault(), }, }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char", `.`, "", false), + newLexEntry([]string{"default"}, "push_1", `-> 1`, "", false), + newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false), + newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false), + newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", false), + }, + }, + src: `-> 1 -> 2 <- <- a`, + tokens: []*Token{ + newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))), + newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))), + newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))), + newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))), + newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))), + newEOFTokenDefault(), + }, + passiveModeTran: true, + tran: func(l *Lexer, tok *Token) error { + switch l.clspec.Modes[l.Mode().Int()] { + case "default": + switch tok.KindName { + case "push_1": + l.PushMode(2) + } + case "mode_1": + switch tok.KindName { + case "push_2": + l.PushMode(3) + case "pop_1": + return l.PopMode() + } + case "mode_2": + switch tok.KindName { + case "pop_2": + return l.PopMode() + } + } + return nil + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char", `.`, "", false), + newLexEntry([]string{"default"}, "push_1", `-> 1`, "mode_1", false), + newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false), + newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false), + newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", true), + }, + }, + src: `-> 1 -> 2 <- <- a`, + tokens: []*Token{ + newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))), + newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))), + newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))), + newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))), + newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))), + newEOFTokenDefault(), + }, + // Active mode transition and an external transition function can be used together. + passiveModeTran: false, + tran: func(l *Lexer, tok *Token) error { + switch l.clspec.Modes[l.Mode().Int()] { + case "mode_1": + switch tok.KindName { + case "push_2": + l.PushMode(3) + case "pop_1": + return l.PopMode() + } + } + return nil + }, + }, } for i, tt := range test { for compLv := compiler.CompressionLevelMin; compLv <= compiler.CompressionLevelMax; compLv++ { t.Run(fmt.Sprintf("#%v-%v", i, compLv), func(t *testing.T) { clspec, err := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv)) if err != nil { - t.Fatalf("unexpected error occurred: %v", err) + t.Fatalf("unexpected error: %v", err) } - lexer, err := NewLexer(clspec, strings.NewReader(tt.src)) + opts := []LexerOption{} + if tt.passiveModeTran { + opts = append(opts, DisableModeTransition()) + } + lexer, err := NewLexer(clspec, strings.NewReader(tt.src), opts...) if err != nil { - t.Fatalf("unexpecated error occurred; %v", err) + t.Fatalf("unexpected error: %v", err) } for _, eTok := range tt.tokens { tok, err := lexer.Next() @@ -599,6 +692,13 @@ func TestLexer_Next(t *testing.T) { if tok.EOF { break } + + if tt.tran != nil { + err := tt.tran(lexer, tok) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } } }) } |