author     Ryo Nihei <nihei.dev@gmail.com>   2021-08-07 12:55:44 +0900
committer  Ryo Nihei <nihei.dev@gmail.com>   2021-08-07 12:55:44 +0900
commit     82efd35e6f99af0eff0430fc32b825d5cb38ac4d (patch)
tree       fd0b19af5424d091b877ba283efe49457cbfa90e
parent     Use Go 1.16 (diff)
download   tre-82efd35e6f99af0eff0430fc32b825d5cb38ac4d.tar.gz
           tre-82efd35e6f99af0eff0430fc32b825d5cb38ac4d.tar.xz
Add lexeme positions to tokens
close #1
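
With this change, every token carries the position of its lexeme: `Row` counts LF-delimited lines and `Col` counts code points (not bytes) from the start of the line, both zero-based. The sketch below shows how a caller might read the new fields. It is illustrative only: the import paths and the pre-compiled `clspec` are assumptions, not part of this patch.

    package example

    import (
        "fmt"
        "log"
        "strings"

        // Assumed import paths for illustration; adjust them to the actual
        // module path of your checkout.
        "github.com/nihei9/maleeni/driver"
        "github.com/nihei9/maleeni/spec"
    )

    // printTokens lexes src and prints each token with the Row/Col fields
    // added by this commit. clspec is a *spec.CompiledLexSpec produced by
    // the compiler beforehand.
    func printTokens(clspec *spec.CompiledLexSpec, src string) {
        lexer, err := driver.NewLexer(clspec, strings.NewReader(src))
        if err != nil {
            log.Fatal(err)
        }
        for {
            tok, err := lexer.Next()
            if err != nil {
                log.Fatal(err)
            }
            if tok.EOF {
                break
            }
            // Row and Col are zero-based; Col advances once per code point,
            // so a multi-byte UTF-8 character still counts as one column.
            fmt.Printf("%d:%d %v %q\n", tok.Row, tok.Col, tok.KindName, tok.Text())
        }
    }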
-rw-r--r--   README.md               2
-rw-r--r--   driver/lexer.go       122
-rw-r--r--   driver/lexer_test.go  141
3 files changed, 226 insertions, 39 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -70,6 +70,8 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
 | kind_id | integer | An ID of a kind. This is unique among all modes. |
 | mode_kind_id | integer | An ID of a lexical kind. This is unique only within a mode. Note that you need to use `KindID` field if you want to identify a kind across all modes. |
 | kind_name | string | A name of a lexical kind. |
+| row | integer | A row number where a lexeme appears. |
+| col | integer | A column number where a lexeme appears. Note that `col` is counted in code points, not bytes. |
 | match | array of integers | A byte sequense of a lexeme. |
 | text | string | A string representation of a lexeme. |
 | eof | bool | When this field is `true`, it means the token is the EOF token. |
diff --git a/driver/lexer.go b/driver/lexer.go
index bad7dbd..03540e3 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -72,6 +72,13 @@ type Token struct {
     // KindName is a name of a lexical kind.
     KindName spec.LexKindName
 
+    // Row is a row number where a lexeme appears.
+    Row int
+
+    // Col is a column number where a lexeme appears.
+    // Note that Col is counted in code points, not bytes.
+    Col int
+
     // When this field is true, it means the token is the EOF token.
     EOF bool
 
@@ -82,44 +89,14 @@ type Token struct {
     match byteSequence
 }
 
-func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        KindID:     kindID,
-        ModeKindID: modeKindID,
-        KindName:   kindName,
-        match:      match,
-    }
-}
-
-func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        ModeKindID: 0,
-        EOF:        true,
-    }
-}
-
-func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        ModeKindID: 0,
-        match:      match,
-        Invalid:    true,
-    }
-}
-
 func (t *Token) String() string {
     if t.Invalid {
-        return fmt.Sprintf("!{mode id: %v, mode name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Text(), t.Match())
+        return fmt.Sprintf("!{mode id: %v, mode name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Row, t.Col, t.Text(), t.Match())
     }
     if t.EOF {
-        return "{eof}"
+        return fmt.Sprintf("{kind name: eof, row: %v, col: %v}", t.Row, t.Col)
     }
-    return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Text(), t.Match())
+    return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Row, t.Col, t.Text(), t.Match())
 }
 
 // Match returns a byte slice matched a pattern of a lexical specification.
@@ -139,6 +116,8 @@ func (t *Token) MarshalJSON() ([]byte, error) {
         KindID     int          `json:"kind_id"`
         ModeKindID int          `json:"mode_kind_id"`
         KindName   string       `json:"kind_name"`
+        Row        int          `json:"row"`
+        Col        int          `json:"col"`
         Match      byteSequence `json:"match"`
         Text       string       `json:"text"`
         EOF        bool         `json:"eof"`
@@ -149,6 +128,8 @@ func (t *Token) MarshalJSON() ([]byte, error) {
         KindID:     t.KindID.Int(),
         ModeKindID: t.ModeKindID.Int(),
         KindName:   t.KindName.String(),
+        Row:        t.Row,
+        Col:        t.Col,
         Match:      t.match,
         Text:       t.Text(),
         EOF:        t.EOF,
@@ -180,6 +161,10 @@ type Lexer struct {
     clspec          *spec.CompiledLexSpec
     src             []byte
     srcPtr          int
+    row             int
+    col             int
+    prevRow         int
+    prevCol         int
     tokBuf          []*Token
     modeStack       []spec.LexModeID
     passiveModeTran bool
@@ -195,6 +180,8 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) {
         clspec: clspec,
         src:    b,
         srcPtr: 0,
+        row:    0,
+        col:    0,
         modeStack: []spec.LexModeID{
             clspec.InitialModeID,
         },
@@ -302,6 +289,8 @@ func (l *Lexer) next() (*Token, error) {
     state := spec.DFA.InitialStateID
     buf := []byte{}
     unfixedBufLen := 0
+    row := l.row
+    col := l.col
     var tok *Token
     for {
         v, eof := l.read()
@@ -313,9 +302,24 @@
             // When `buf` has unaccepted data and reads the EOF,
             // the lexer treats the buffered data as an invalid token.
             if len(buf) > 0 {
-                return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
+                return &Token{
+                    ModeID:     mode,
+                    ModeName:   modeName,
+                    ModeKindID: 0,
+                    Row:        row,
+                    Col:        col,
+                    match:      newByteSequence(buf),
+                    Invalid:    true,
+                }, nil
             }
-            return newEOFToken(mode, modeName), nil
+            return &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                ModeKindID: 0,
+                Row:        0,
+                Col:        0,
+                EOF:        true,
+            }, nil
         }
         buf = append(buf, v)
         unfixedBufLen++
@@ -325,13 +329,30 @@
                 l.unread(unfixedBufLen)
                 return tok, nil
             }
-            return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
+            return &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                ModeKindID: 0,
+                Row:        row,
+                Col:        col,
+                match:      newByteSequence(buf),
+                Invalid:    true,
+            }, nil
         }
         state = nextState
         modeKindID := spec.DFA.AcceptingStates[state]
         if modeKindID != 0 {
             kindID := l.clspec.KindIDs[mode][modeKindID]
-            tok = newToken(mode, modeName, kindID, modeKindID, spec.KindNames[modeKindID], newByteSequence(buf))
+            tok = &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                KindID:     kindID,
+                ModeKindID: modeKindID,
+                KindName:   spec.KindNames[modeKindID],
+                Row:        row,
+                Col:        col,
+                match:      newByteSequence(buf),
+            }
             unfixedBufLen = 0
         }
     }
@@ -384,11 +405,38 @@ func (l *Lexer) read() (byte, bool) {
     if l.srcPtr >= len(l.src) {
         return 0, true
     }
+
     b := l.src[l.srcPtr]
     l.srcPtr++
+
+    l.prevRow = l.row
+    l.prevCol = l.col
+
+    // Count the token positions.
+    // The driver treats LF as the end of lines and counts columns in code points, not bytes.
+    // To count in code points, we refer to the First Byte column in the Table 3-6.
+    //
+    // Reference:
+    // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. UTF-8 Bit Distribution
+    if b < 128 {
+        // 0x0A is LF.
+        if b == 0x0A {
+            l.row++
+            l.col = 0
+        } else {
+            l.col++
+        }
+    } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 {
+        l.col++
+    }
+
     return b, false
 }
 
+// You must not call this function consecutively to record the token position correctly.
 func (l *Lexer) unread(n int) {
     l.srcPtr -= n
+
+    l.row = l.prevRow
+    l.col = l.prevCol
 }
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 5abe83c..33b206f 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -42,14 +42,50 @@ func newLexEntryFragment(kind string, pattern string) *spec.LexEntry {
     }
 }
 
+func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        KindID:     kindID,
+        ModeKindID: modeKindID,
+        KindName:   kindName,
+        match:      match,
+    }
+}
+
 func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token {
     return newToken(spec.LexModeIDDefault, spec.LexModeNameDefault, spec.LexKindID(kindID), spec.LexModeKindID(modeKindID), spec.LexKindName(kindName), match)
 }
 
+func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        ModeKindID: 0,
+        EOF:        true,
+    }
+}
+
 func newEOFTokenDefault() *Token {
     return newEOFToken(spec.LexModeIDDefault, spec.LexModeNameDefault)
 }
 
+func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        ModeKindID: 0,
+        match:      match,
+        Invalid:    true,
+    }
+}
+
+func withPos(tok *Token, row, col int) *Token {
+    tok.Row = row
+    tok.Col = col
+    return tok
+}
+
 func TestLexer_Next(t *testing.T) {
     test := []struct {
         lspec *spec.LexSpec
@@ -715,7 +751,7 @@ func TestLexer_Next(t *testing.T) {
                 t.Log(err)
                 break
             }
-            testToken(t, eTok, tok)
+            testToken(t, eTok, tok, false)
             // t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match(), tok.Text(), tok.EOF, tok.Invalid)
             if tok.EOF {
                 break
@@ -733,7 +769,102 @@ func TestLexer_Next(t *testing.T) {
     }
 }
 
-func testToken(t *testing.T, expected, actual *Token) {
+func TestLexer_Next_WithPosition(t *testing.T) {
+    lspec := &spec.LexSpec{
+        Entries: []*spec.LexEntry{
+            newLexEntryDefaultNOP("newline", `\u{000A}+`),
+            newLexEntryDefaultNOP("any", `.`),
+        },
+    }
+
+    clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax))
+    if err != nil {
+        t.Fatalf("unexpected error: %v", err)
+    }
+
+    src := string([]byte{
+        0x00,
+        0x7F,
+        0x0A,
+
+        0xC2, 0x80,
+        0xDF, 0xBF,
+        0x0A,
+
+        0xE0, 0xA0, 0x80,
+        0xE0, 0xBF, 0xBF,
+        0xE1, 0x80, 0x80,
+        0xEC, 0xBF, 0xBF,
+        0xED, 0x80, 0x80,
+        0xED, 0x9F, 0xBF,
+        0xEE, 0x80, 0x80,
+        0xEF, 0xBF, 0xBF,
+        0x0A,
+
+        0xF0, 0x90, 0x80, 0x80,
+        0xF0, 0xBF, 0xBF, 0xBF,
+        0xF1, 0x80, 0x80, 0x80,
+        0xF3, 0xBF, 0xBF, 0xBF,
+        0xF4, 0x80, 0x80, 0x80,
+        0xF4, 0x8F, 0xBF, 0xBF,
+        0x0A,
+        0x0A,
+        0x0A,
+    })
+
+    expected := []*Token{
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x00})), 0, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x7F})), 0, 1),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 0, 2),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xC2, 0x80})), 1, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xDF, 0xBF})), 1, 1),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 1, 2),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xA0, 0x80})), 2, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xBF, 0xBF})), 2, 1),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE1, 0x80, 0x80})), 2, 2),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEC, 0xBF, 0xBF})), 2, 3),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x80, 0x80})), 2, 4),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x9F, 0xBF})), 2, 5),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEE, 0x80, 0x80})), 2, 6),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEF, 0xBF, 0xBF})), 2, 7),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 2, 8),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0x90, 0x80, 0x80})), 3, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0xBF, 0xBF, 0xBF})), 3, 1),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF1, 0x80, 0x80, 0x80})), 3, 2),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF3, 0xBF, 0xBF, 0xBF})), 3, 3),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x80, 0x80, 0x80})), 3, 4),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x8F, 0xBF, 0xBF})), 3, 5),
+
+        // When a token contains multiple line breaks, the driver sets the token position to
+        // the line number where a lexeme first appears.
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A, 0x0A, 0x0A})), 3, 6),
+
+        withPos(newEOFTokenDefault(), 0, 0),
+    }
+
+    lexer, err := NewLexer(clspec, strings.NewReader(src))
+    if err != nil {
+        t.Fatalf("unexpected error: %v", err)
+    }
+
+    for _, eTok := range expected {
+        tok, err := lexer.Next()
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        testToken(t, eTok, tok, true)
+
+        if tok.EOF {
+            break
+        }
+    }
+}
+
+func testToken(t *testing.T, expected, actual *Token, checkPosition bool) {
     t.Helper()
 
     if actual.ModeID != expected.ModeID ||
@@ -746,4 +877,10 @@ func testToken(t *testing.T, expected, actual *Token) {
         actual.Invalid != expected.Invalid {
         t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text())
     }
+
+    if checkPosition {
+        if actual.Row != expected.Row || actual.Col != expected.Col {
+            t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text())
+        }
+    }
 }
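
For reference, the column-counting rule in `read` can be exercised on its own: a byte starts a new code point exactly when it is ASCII or matches one of the UTF-8 lead-byte patterns 110xxxxx (`b>>5 == 6`), 1110xxxx (`b>>4 == 14`), or 11110xxx (`b>>3 == 30`), while continuation bytes 10xxxxxx leave the column untouched. Below is a self-contained sketch of the same rule; the `rowCol` helper is illustrative and not part of the patch.

    package main

    import "fmt"

    // rowCol applies the same lead-byte rule as Lexer.read to a whole byte
    // slice: LF ends a line, and only bytes that begin a UTF-8 code point
    // advance the column. Continuation bytes (10xxxxxx) are skipped, which
    // is why columns are counted in code points rather than bytes.
    func rowCol(src []byte) (row, col int) {
        for _, b := range src {
            switch {
            case b == 0x0A: // LF: next line, column resets.
                row++
                col = 0
            case b < 128: // ASCII.
                col++
            case b>>5 == 6, b>>4 == 14, b>>3 == 30: // 2-, 3-, and 4-byte lead bytes.
                col++
            }
        }
        return row, col
    }

    func main() {
        // "あ" is three bytes (0xE3 0x81 0x82) but advances the column by one.
        row, col := rowCol([]byte("a\nあb"))
        fmt.Println(row, col) // prints: 1 2
    }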