-rw-r--r--  compiler/compiler.go  |  84
-rw-r--r--  driver/lexer.go       | 135
-rw-r--r--  driver/lexer_test.go  | 421
-rw-r--r--  spec/spec.go          |  75
4 files changed, 504 insertions(+), 211 deletions(-)
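
This commit adds lex modes to maleeni. Each lex entry may now declare the modes it is active in (`modes`, defaulting to the `default` mode), a mode to push onto the lexer's mode stack when the entry matches (`push`), and whether matching pops the current mode (`pop`). The compiler builds a separate DFA per mode, and the driver dispatches on the mode at the top of the stack, tagging every token with the mode it was recognized in.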
diff --git a/compiler/compiler.go b/compiler/compiler.go
index 15f42f3..6f878c5 100644
--- a/compiler/compiler.go
+++ b/compiler/compiler.go
@@ -42,12 +42,71 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp
}
}
+ modeEntries, modes, modeNums := groupEntriesByLexMode(lexspec.Entries)
+
+ modeSpecs := []*spec.CompiledLexModeSpec{
+ nil,
+ }
+ for i, es := range modeEntries[1:] {
+ modeName := modes[i+1]
+ config.logger.Log("Compile %v mode:", modeName)
+ modeSpec, err := compile(es, modeNums, config)
+ if err != nil {
+ return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err)
+ }
+ modeSpecs = append(modeSpecs, modeSpec)
+ }
+
+ return &spec.CompiledLexSpec{
+ InitialMode: spec.LexModeNumDefault,
+ Modes: modes,
+ Specs: modeSpecs,
+ }, nil
+}
+
+func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum) {
+ modes := []spec.LexModeName{
+ spec.LexModeNameNil,
+ spec.LexModeNameDefault,
+ }
+ modeNums := map[spec.LexModeName]spec.LexModeNum{
+ spec.LexModeNameNil: spec.LexModeNumNil,
+ spec.LexModeNameDefault: spec.LexModeNumDefault,
+ }
+ lastModeNum := spec.LexModeNumDefault
+ modeEntries := [][]*spec.LexEntry{
+ nil,
+ []*spec.LexEntry{},
+ }
+ for _, e := range entries {
+ ms := e.Modes
+ if len(ms) == 0 {
+ ms = []spec.LexModeName{
+ spec.LexModeNameDefault,
+ }
+ }
+ for _, mode := range ms {
+ num, ok := modeNums[mode]
+ if !ok {
+ num = lastModeNum.Succ()
+ lastModeNum = num
+ modeNums[mode] = num
+ modes = append(modes, mode)
+ modeEntries = append(modeEntries, []*spec.LexEntry{})
+ }
+ modeEntries[num] = append(modeEntries[num], e)
+ }
+ }
+ return modeEntries, modes, modeNums
+}
+
+func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, config *compilerConfig) (*spec.CompiledLexModeSpec, error) {
var kinds []spec.LexKind
var patterns map[int][]byte
{
kinds = append(kinds, spec.LexKindNil)
patterns = map[int][]byte{}
- for i, e := range lexspec.Entries {
+ for i, e := range entries {
kinds = append(kinds, e.Kind)
patterns[i+1] = []byte(e.Pattern)
}
@@ -58,6 +117,25 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp
}
}
+ push := []spec.LexModeNum{
+ spec.LexModeNumNil,
+ }
+ pop := []int{
+ 0,
+ }
+ for _, e := range entries {
+ pushV := spec.LexModeNumNil
+ if e.Push != "" {
+ pushV = modeNums[e.Push]
+ }
+ push = append(push, pushV)
+ popV := 0
+ if e.Pop {
+ popV = 1
+ }
+ pop = append(pop, popV)
+ }
+
var root astNode
var symTab *symbolTable
{
@@ -90,8 +168,10 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp
}
}
- return &spec.CompiledLexSpec{
+ return &spec.CompiledLexModeSpec{
Kinds: kinds,
+ Push: push,
+ Pop: pop,
DFA: tranTab,
}, nil
}
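
The grouping step above is the heart of the compiler change. Here is a minimal standalone sketch of the same strategy (not part of the diff; plain string/int stand in for spec.LexModeName/spec.LexModeNum, and the entry names are hypothetical):

    package main

    import "fmt"

    type entry struct {
        kind  string
        modes []string // empty means the default mode
    }

    // groupByMode assigns mode numbers on first sight (0 = nil, 1 = default)
    // and buckets each entry kind into every mode the entry is enabled in,
    // mirroring groupEntriesByLexMode above.
    func groupByMode(entries []entry) (map[string]int, [][]string) {
        nums := map[string]int{"": 0, "default": 1}
        buckets := [][]string{nil, {}}
        last := 1
        for _, e := range entries {
            ms := e.modes
            if len(ms) == 0 {
                ms = []string{"default"}
            }
            for _, m := range ms {
                n, ok := nums[m]
                if !ok {
                    last++
                    n = last
                    nums[m] = n
                    buckets = append(buckets, []string{})
                }
                buckets[n] = append(buckets[n], e.kind)
            }
        }
        return nums, buckets
    }

    func main() {
        nums, buckets := groupByMode([]entry{
            {kind: "white_space", modes: []string{"default", "string"}},
            {kind: "string_open"},
            {kind: "char_sequence", modes: []string{"string"}},
        })
        fmt.Println(nums)    // map[:0 default:1 string:2]
        fmt.Println(buckets) // [[] [white_space string_open] [white_space char_sequence]]
    }

Note that an entry listed in several modes (like white_space here) is compiled into each of those modes' DFAs independently.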
diff --git a/driver/lexer.go b/driver/lexer.go
index 8a2afd8..72a6f69 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -54,46 +54,54 @@ func (s byteSequence) merge(a byteSequence) byteSequence {
}
type Token struct {
- ID int `json:"id"`
- Kind string `json:"kind"`
- Match byteSequence `json:"match"`
- Text string `json:"text"`
- EOF bool `json:"eof"`
- Invalid bool `json:"invalid"`
+ Mode spec.LexModeNum `json:"mode"`
+ ModeName spec.LexModeName `json:"mode_name"`
+ ID int `json:"id"`
+ Kind string `json:"kind"`
+ Match byteSequence `json:"match"`
+ Text string `json:"text"`
+ EOF bool `json:"eof"`
+ Invalid bool `json:"invalid"`
}
-func newToken(id int, kind string, match byteSequence) *Token {
+func newToken(mode spec.LexModeNum, modeName spec.LexModeName, id int, kind string, match byteSequence) *Token {
return &Token{
- ID: id,
- Kind: kind,
- Match: match,
- Text: string(match.ByteSlice()),
+ Mode: mode,
+ ModeName: modeName,
+ ID: id,
+ Kind: kind,
+ Match: match,
+ Text: string(match.ByteSlice()),
}
}
-func newEOFToken() *Token {
+func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token {
return &Token{
- ID: 0,
- EOF: true,
+ Mode: mode,
+ ModeName: modeName,
+ ID: 0,
+ EOF: true,
}
}
-func newInvalidToken(match byteSequence) *Token {
+func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token {
return &Token{
- ID: 0,
- Match: match,
- Invalid: true,
+ Mode: mode,
+ ModeName: modeName,
+ ID: 0,
+ Match: match,
+ Invalid: true,
}
}
func (t *Token) String() string {
if t.Invalid {
- return fmt.Sprintf("!{text: %v, byte: %v}", t.Text, t.Match)
+ return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text, t.Match)
}
if t.EOF {
return "{eof}"
}
- return fmt.Sprintf("{id: %v, kind: %v, text: %v, byte: %v}", t.ID, t.Kind, t.Text, t.Match)
+ return fmt.Sprintf("{mode: %v, mode name: %v, id: %v, kind: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.ID, t.Kind, t.Text, t.Match)
}
type lexerOption func(l *lexer) error
@@ -110,11 +118,12 @@ func EnableLogging(w io.Writer) lexerOption {
}
type lexer struct {
- clspec *spec.CompiledLexSpec
- src []byte
- srcPtr int
- tokBuf []*Token
- logger log.Logger
+ clspec *spec.CompiledLexSpec
+ src []byte
+ srcPtr int
+ tokBuf []*Token
+ modeStack []spec.LexModeNum
+ logger log.Logger
}
func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) (*lexer, error) {
@@ -126,6 +135,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption)
clspec: clspec,
src: b,
srcPtr: 0,
+ modeStack: []spec.LexModeNum{
+ clspec.InitialMode,
+ },
logger: log.NewNopLogger(),
}
for _, opt := range opts {
@@ -142,8 +154,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption)
func (l *lexer) Next() (*Token, error) {
l.logger.Log(`lexer#Next():
State:
+ mode: #%v %v
pointer: %v
- token buffer: %v`, l.srcPtr, l.tokBuf)
+ token buffer: %v`, l.mode(), l.clspec.Modes[l.mode()], l.srcPtr, l.tokBuf)
if len(l.tokBuf) > 0 {
tok := l.tokBuf[0]
@@ -154,7 +167,7 @@ func (l *lexer) Next() (*Token, error) {
return tok, nil
}
- tok, err := l.next()
+ tok, err := l.nextAndTranMode()
if err != nil {
l.logger.Log(" Detects an error: %v", err)
return nil, err
@@ -168,7 +181,7 @@ func (l *lexer) Next() (*Token, error) {
}
errTok := tok
for {
- tok, err = l.next()
+ tok, err = l.nextAndTranMode()
if err != nil {
l.logger.Log(" Detects an error: %v", err)
return nil, err
@@ -205,7 +218,7 @@ func (l *lexer) peekN(n int) (*Token, error) {
return nil, fmt.Errorf("peekN() can handle only [0..2]")
}
for len(l.tokBuf) < n+1 {
- tok, err := l.next()
+ tok, err := l.nextAndTranMode()
if err != nil {
return nil, err
}
@@ -214,8 +227,41 @@ func (l *lexer) peekN(n int) (*Token, error) {
return l.tokBuf[n], nil
}
+func (l *lexer) nextAndTranMode() (*Token, error) {
+ tok, err := l.next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.EOF || tok.Invalid {
+ return tok, nil
+ }
+ spec := l.clspec.Specs[l.mode()]
+ if spec.Pop[tok.ID] == 1 {
+ err := l.popMode()
+ if err != nil {
+ return nil, err
+ }
+ }
+ mode := spec.Push[tok.ID]
+ if !mode.IsNil() {
+ l.pushMode(mode)
+ }
+ // The length of the mode stack must be checked after both the pop and push
+ // operations because they can occur for the same token. When the stack holds
+ // just one element and it is popped, the stack becomes temporarily empty;
+ // since a push may immediately follow, the lexer tolerates an empty stack
+ // here and reports an error only if it is still empty after both operations.
+ if len(l.modeStack) == 0 {
+ return nil, fmt.Errorf("a mode stack must have at least one element")
+ }
+ return tok, nil
+}
+
func (l *lexer) next() (*Token, error) {
- state := l.clspec.DFA.InitialState
+ mode := l.mode()
+ modeName := l.clspec.Modes[mode]
+ spec := l.clspec.Specs[mode]
+ state := spec.DFA.InitialState
buf := []byte{}
unfixedBufLen := 0
var tok *Token
@@ -229,13 +275,13 @@ func (l *lexer) next() (*Token, error) {
// When the lexer reads the EOF while `buf` still holds unaccepted data,
// it treats the buffered data as an invalid token.
if len(buf) > 0 {
- return newInvalidToken(newByteSequence(buf)), nil
+ return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
}
- return newEOFToken(), nil
+ return newEOFToken(mode, modeName), nil
}
buf = append(buf, v)
unfixedBufLen++
- entry := l.clspec.DFA.Transition[state]
+ entry := spec.DFA.Transition[state]
if len(entry) == 0 {
return nil, fmt.Errorf("no transition entry; state: %v", state)
}
@@ -245,17 +291,34 @@ func (l *lexer) next() (*Token, error) {
l.unread(unfixedBufLen)
return tok, nil
}
- return newInvalidToken(newByteSequence(buf)), nil
+ return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
}
state = nextState
- id, ok := l.clspec.DFA.AcceptingStates[state]
+ id, ok := spec.DFA.AcceptingStates[state]
if ok {
- tok = newToken(id, l.clspec.Kinds[id].String(), newByteSequence(buf))
+ tok = newToken(mode, modeName, id, spec.Kinds[id].String(), newByteSequence(buf))
unfixedBufLen = 0
}
}
}
+func (l *lexer) mode() spec.LexModeNum {
+ return l.modeStack[len(l.modeStack)-1]
+}
+
+func (l *lexer) pushMode(mode spec.LexModeNum) {
+ l.modeStack = append(l.modeStack, mode)
+}
+
+func (l *lexer) popMode() error {
+ sLen := len(l.modeStack)
+ if sLen == 0 {
+ return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more")
+ }
+ l.modeStack = l.modeStack[:sLen-1]
+ return nil
+}
+
func (l *lexer) read() (byte, bool) {
if l.srcPtr >= len(l.src) {
return 0, true
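
Putting the driver changes together, here is a hedged usage sketch (not part of the diff; it assumes the packages are importable as compiler, driver, and spec per the repository layout, and mirrors the string-literal test case added below):

    package main

    import (
        "fmt"
        "strings"

        "github.com/nihei9/maleeni/compiler"
        "github.com/nihei9/maleeni/driver"
        "github.com/nihei9/maleeni/spec"
    )

    func main() {
        // string_open pushes the "string" mode; string_close pops it.
        clspec, err := compiler.Compile(&spec.LexSpec{
            Entries: []*spec.LexEntry{
                {Kind: "white_space", Pattern: ` *`},
                {Kind: "string_open", Pattern: `"`, Push: "string"},
                {Kind: "escape_sequence", Pattern: `\\[n"\\]`, Modes: []spec.LexModeName{"string"}},
                {Kind: "char_sequence", Pattern: `[^"\\]*`, Modes: []spec.LexModeName{"string"}},
                {Kind: "string_close", Pattern: `"`, Modes: []spec.LexModeName{"string"}, Pop: true},
            },
        })
        if err != nil {
            panic(err)
        }
        lex, err := driver.NewLexer(clspec, strings.NewReader(`"Hello\n"`))
        if err != nil {
            panic(err)
        }
        for {
            tok, err := lex.Next()
            if err != nil {
                panic(err)
            }
            if tok.EOF {
                break
            }
            // Tokens now carry the mode they were recognized in.
            fmt.Printf("%v %v %q\n", tok.ModeName, tok.Kind, tok.Text)
        }
    }

On the input `"Hello\n"` this prints the opening quote in the default mode and the body tokens in the string mode, matching the expectations in the new test case.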
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 26b5d49..d3edb3c 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -10,13 +10,38 @@ import (
"github.com/nihei9/maleeni/spec"
)
-func newLexEntry(kind string, pattern string) *spec.LexEntry {
+func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry {
+ ms := []spec.LexModeName{}
+ for _, m := range modes {
+ ms = append(ms, spec.LexModeName(m))
+ }
+ return &spec.LexEntry{
+ Kind: spec.LexKind(kind),
+ Pattern: spec.LexPattern(pattern),
+ Modes: ms,
+ Push: spec.LexModeName(push),
+ Pop: pop,
+ }
+}
+
+func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
return &spec.LexEntry{
Kind: spec.LexKind(kind),
Pattern: spec.LexPattern(pattern),
+ Modes: []spec.LexModeName{
+ spec.LexModeNameDefault,
+ },
}
}
+func newTokenDefault(id int, kind string, match byteSequence) *Token {
+ return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match)
+}
+
+func newEOFTokenDefault() *Token {
+ return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault)
+}
+
func TestLexer_Next(t *testing.T) {
test := []struct {
lspec *spec.LexSpec
@@ -26,58 +51,58 @@ func TestLexer_Next(t *testing.T) {
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "(a|b)*abb"),
- newLexEntry("t2", " +"),
+ newLexEntryDefaultNOP("t1", "(a|b)*abb"),
+ newLexEntryDefaultNOP("t2", " +"),
},
},
src: "abb aabb aaabb babb bbabb abbbabb",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("abb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aaabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("babb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("bbabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("abbbabb"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("abb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("babb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "b?a+"),
- newLexEntry("t2", "(ab)?(cd)+"),
- newLexEntry("t3", " +"),
+ newLexEntryDefaultNOP("t1", "b?a+"),
+ newLexEntryDefaultNOP("t2", "(ab)?(cd)+"),
+ newLexEntryDefaultNOP("t3", " +"),
},
},
src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("ba"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("baaa"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("a"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aaa"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("abcd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("cd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("cdcdcd"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("ba"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("cd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "."),
+ newLexEntryDefaultNOP("t1", "."),
},
},
src: string([]byte{
@@ -99,45 +124,45 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte{0x00})),
- newToken(1, "t1", newByteSequence([]byte{0x7f})),
- newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0x00})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "[ab.*+?|()[\\]]"),
+ newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"),
},
},
src: "ab.*+?|()[]",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("a"))),
- newToken(1, "t1", newByteSequence([]byte("b"))),
- newToken(1, "t1", newByteSequence([]byte("."))),
- newToken(1, "t1", newByteSequence([]byte("*"))),
- newToken(1, "t1", newByteSequence([]byte("+"))),
- newToken(1, "t1", newByteSequence([]byte("?"))),
- newToken(1, "t1", newByteSequence([]byte("|"))),
- newToken(1, "t1", newByteSequence([]byte("("))),
- newToken(1, "t1", newByteSequence([]byte(")"))),
- newToken(1, "t1", newByteSequence([]byte("["))),
- newToken(1, "t1", newByteSequence([]byte("]"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("b"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("."))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("*"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("+"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("?"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("|"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("("))),
+ newTokenDefault(1, "t1", newByteSequence([]byte(")"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("["))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("]"))),
+ newEOFTokenDefault(),
},
},
{
@@ -149,7 +174,7 @@ func TestLexer_Next(t *testing.T) {
// maleeni cannot handle the null character in patterns because compiler.lexer,
// specifically read() and restore(), treats a null character as meaning that
// a symbol doesn't exist. If a pattern needs a null character, use the code
// point expression \u{0000}.
- newLexEntry("1ByteChar", "[\x01-\x7f]"),
+ newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"),
},
},
src: string([]byte{
@@ -159,18 +184,18 @@ func TestLexer_Next(t *testing.T) {
0x7f,
}),
tokens: []*Token{
- newToken(1, "1ByteChar", newByteSequence([]byte{0x01})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x02})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})),
- newEOFToken(),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 2 byte characters
- newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+ newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
},
},
src: string([]byte{
@@ -180,33 +205,33 @@ func TestLexer_Next(t *testing.T) {
0xdf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// All bytes are the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
},
},
src: string([]byte{
0xe0, 0xa0, 0x80,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first two bytes are the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
},
},
src: string([]byte{
@@ -216,18 +241,18 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xa0, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first byte is the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
},
},
src: string([]byte{
@@ -237,18 +262,18 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 3 byte characters
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
},
},
src: string([]byte{
@@ -270,45 +295,45 @@ func TestLexer_Next(t *testing.T) {
0xef, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// All bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
},
},
src: string([]byte{
0xf0, 0x90, 0x80, 0x80,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first 3 bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
},
},
src: string([]byte{
@@ -318,18 +343,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0x80, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first 2 bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
},
},
src: string([]byte{
@@ -339,18 +364,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first byte is the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
},
},
src: string([]byte{
@@ -360,18 +385,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0xbf, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 4 byte characters
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
},
},
src: string([]byte{
@@ -389,64 +414,114 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("NonNumber", "[^0-9]+[0-9]"),
+ newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"),
},
},
src: "foo9",
tokens: []*Token{
- newToken(1, "NonNumber", newByteSequence([]byte("foo9"))),
- newEOFToken(),
+ newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("1ByteChar", "\\u{006E}"),
- newLexEntry("2ByteChar", "\\u{03BD}"),
- newLexEntry("3ByteChar", "\\u{306B}"),
- newLexEntry("4ByteChar", "\\u{01F638}"),
+ newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"),
+ newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"),
+ newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"),
+ newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"),
},
},
src: "nνに😸",
tokens: []*Token{
- newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
- newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
- newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
- newEOFToken(),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+ newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+ newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
},
},
src: "nνに😸",
tokens: []*Token{
- newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
- newEOFToken(),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFTokenDefault(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ newLexEntryDefaultNOP("white_space", ` *`),
+ newLexEntry([]string{"default"}, "string_open", `"`, "string", false),
+ newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false),
+ newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false),
+ newLexEntry([]string{"string"}, "string_close", `"`, "", true),
+ },
+ },
+ src: `"" "Hello world.\n\"Hello world.\""`,
+ tokens: []*Token{
+ newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+ newEOFTokenDefault(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // `white_space` is enabled in multiple modes.
+ newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false),
+ newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false),
+ newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false),
+ newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true),
+ newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true),
+ },
+ },
+ src: ` a b < < `,
+ tokens: []*Token{
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))),
+ newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))),
+ newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))),
+ newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))),
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newEOFTokenDefault(),
},
},
}
@@ -479,8 +554,8 @@ func TestLexer_Next(t *testing.T) {
func TestLexer_PeekN(t *testing.T) {
clspec, err := compiler.Compile(&spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "foo"),
- newLexEntry("t2", "bar"),
+ newLexEntryDefaultNOP("t1", "foo"),
+ newLexEntryDefaultNOP("t2", "bar"),
},
})
if err != nil {
@@ -492,9 +567,9 @@ func TestLexer_PeekN(t *testing.T) {
}
expectedTokens := []*Token{
- newToken(1, "t1", []byte("foo")),
- newToken(2, "t2", []byte("bar")),
- newEOFToken(),
+ newTokenDefault(1, "t1", []byte("foo")),
+ newTokenDefault(2, "t2", []byte("bar")),
+ newEOFTokenDefault(),
}
tok, err := lex.Peek1()
@@ -539,7 +614,13 @@ func TestLexer_PeekN(t *testing.T) {
func testToken(t *testing.T, expected, actual *Token) {
t.Helper()
- if actual.ID != expected.ID || actual.Kind != expected.Kind || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
- t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
+ if actual.Mode != expected.Mode ||
+ actual.ModeName != expected.ModeName ||
+ actual.ID != expected.ID ||
+ actual.Kind != expected.Kind ||
+ !bytes.Equal(actual.Match, expected.Match) ||
+ actual.EOF != expected.EOF ||
+ actual.Invalid != expected.Invalid {
+ t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, string(expected.Match), actual, string(actual.Match))
}
}
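
For the nested-mode case above (src ` a b < < `), the expected tokens trace the mode stack like this (an informal walkthrough derived from the expectations, not test code):

    input  token        action        mode stack afterwards
    ' '    white_space  none          default
    'a'    char_a       push state_a  default, state_a
    ' '    white_space  none          default, state_a
    'b'    char_b       push state_b  default, state_a, state_b
    ' '    white_space  none          default, state_a, state_b
    '<'    back_from_b  pop           default, state_a
    ' '    white_space  none          default, state_a
    '<'    back_from_a  pop           default
    ' '    white_space  none          default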
diff --git a/spec/spec.go b/spec/spec.go
index 0f9b484..e2291e9 100644
--- a/spec/spec.go
+++ b/spec/spec.go
@@ -3,6 +3,7 @@ package spec
import (
"fmt"
"regexp"
+ "strconv"
"strings"
)
@@ -37,9 +38,61 @@ func (p LexPattern) validate() error {
return nil
}
+const lexModePattern = "[A-Za-z_][0-9A-Za-z_]*"
+
+var lexModeRE = regexp.MustCompile(lexModePattern)
+
+type LexModeName string
+
+const (
+ LexModeNameNil = LexModeName("")
+ LexModeNameDefault = LexModeName("default")
+)
+
+func (m LexModeName) String() string {
+ return string(m)
+}
+
+func (m LexModeName) validate() error {
+ if m.isNil() || !lexModeRE.Match([]byte(m)) {
+ return fmt.Errorf("mode must be %v", lexModePattern)
+ }
+ return nil
+}
+
+func (m LexModeName) isNil() bool {
+ return m == LexModeNameNil
+}
+
+type LexModeNum int
+
+const (
+ LexModeNumNil = LexModeNum(0)
+ LexModeNumDefault = LexModeNum(1)
+)
+
+func (n LexModeNum) String() string {
+ return strconv.Itoa(int(n))
+}
+
+func (n LexModeNum) Int() int {
+ return int(n)
+}
+
+func (n LexModeNum) Succ() LexModeNum {
+ return n + 1
+}
+
+func (n LexModeNum) IsNil() bool {
+ return n == LexModeNumNil
+}
+
type LexEntry struct {
- Kind LexKind `json:"kind"`
- Pattern LexPattern `json:"pattern"`
+ Kind LexKind `json:"kind"`
+ Pattern LexPattern `json:"pattern"`
+ Modes []LexModeName `json:"modes"`
+ Push LexModeName `json:"push"`
+ Pop bool `json:"pop"`
}
func (e *LexEntry) validate() error {
@@ -51,6 +104,14 @@ func (e *LexEntry) validate() error {
if err != nil {
return err
}
+ if len(e.Modes) > 0 {
+ for _, mode := range e.Modes {
+ err = mode.validate()
+ if err != nil {
+ return err
+ }
+ }
+ }
return nil
}
@@ -97,7 +158,15 @@ type TransitionTable struct {
Transition [][]int `json:"transition"`
}
-type CompiledLexSpec struct {
+type CompiledLexModeSpec struct {
Kinds []LexKind `json:"kinds"`
+ Push []LexModeNum `json:"push"`
+ Pop []int `json:"pop"`
DFA *TransitionTable `json:"dfa"`
}
+
+type CompiledLexSpec struct {
+ InitialMode LexModeNum `json:"initial_mode"`
+ Modes []LexModeName `json:"modes"`
+ Specs []*CompiledLexModeSpec `json:"specs"`
+}
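
Since the new fields carry plain JSON tags, they round-trip as shown in this sketch (the entry values are hypothetical; only the tags declared above are assumed):

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/nihei9/maleeni/spec"
    )

    func main() {
        e := &spec.LexEntry{
            Kind:    "string_close",
            Pattern: `"`,
            Modes:   []spec.LexModeName{"string"},
            Pop:     true,
        }
        b, err := json.Marshal(e)
        if err != nil {
            panic(err)
        }
        // Prints:
        // {"kind":"string_close","pattern":"\"","modes":["string"],"push":"","pop":true}
        fmt.Println(string(b))
    }

An empty "modes" array (or a missing field) means the entry belongs to the default mode, per the grouping logic in compiler.go.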