author     Ryo Nihei <nihei.dev@gmail.com>   2021-08-07 12:55:44 +0900
committer  Ryo Nihei <nihei.dev@gmail.com>   2021-08-07 12:55:44 +0900
commit     82efd35e6f99af0eff0430fc32b825d5cb38ac4d (patch)
tree       fd0b19af5424d091b877ba283efe49457cbfa90e
parent     Use Go 1.16 (diff)
download   tre-82efd35e6f99af0eff0430fc32b825d5cb38ac4d.tar.gz
           tre-82efd35e6f99af0eff0430fc32b825d5cb38ac4d.tar.xz
Add lexeme positions to tokens
close #1
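
With this change, every token carries the position of its lexeme: `Row` counts LF-delimited lines and `Col` counts code points (not bytes) from the start of the line, both zero-based. The sketch below shows how a caller might read the new fields. It is illustrative only: the import paths and the pre-compiled `clspec` are assumptions, not part of this patch.

    package example

    import (
        "fmt"
        "log"
        "strings"

        // Assumed import paths for illustration; adjust them to the actual
        // module path of your checkout.
        "github.com/nihei9/maleeni/driver"
        "github.com/nihei9/maleeni/spec"
    )

    // printTokens lexes src and prints each token with the Row/Col fields
    // added by this commit. clspec is a *spec.CompiledLexSpec produced by
    // the compiler beforehand.
    func printTokens(clspec *spec.CompiledLexSpec, src string) {
        lexer, err := driver.NewLexer(clspec, strings.NewReader(src))
        if err != nil {
            log.Fatal(err)
        }
        for {
            tok, err := lexer.Next()
            if err != nil {
                log.Fatal(err)
            }
            if tok.EOF {
                break
            }
            // Row and Col are zero-based; Col advances once per code point,
            // so a multi-byte UTF-8 character still counts as one column.
            fmt.Printf("%d:%d %v %q\n", tok.Row, tok.Col, tok.KindName, tok.Text())
        }
    }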
-rw-r--r--   README.md               2
-rw-r--r--   driver/lexer.go       122
-rw-r--r--   driver/lexer_test.go  141
3 files changed, 226 insertions, 39 deletions
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -70,6 +70,8 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
 | kind_id | integer | An ID of a kind. This is unique among all modes. |
 | mode_kind_id | integer | An ID of a lexical kind. This is unique only within a mode. Note that you need to use `KindID` field if you want to identify a kind across all modes. |
 | kind_name | string | A name of a lexical kind. |
+| row | integer | A row number where a lexeme appears. |
+| col | integer | A column number where a lexeme appears. Note that `col` is counted in code points, not bytes. |
 | match | array of integers | A byte sequense of a lexeme. |
 | text | string | A string representation of a lexeme. |
 | eof | bool | When this field is `true`, it means the token is the EOF token. |
diff --git a/driver/lexer.go b/driver/lexer.go
index bad7dbd..03540e3 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -72,6 +72,13 @@ type Token struct {
     // KindName is a name of a lexical kind.
     KindName spec.LexKindName
 
+    // Row is a row number where a lexeme appears.
+    Row int
+
+    // Col is a column number where a lexeme appears.
+    // Note that Col is counted in code points, not bytes.
+    Col int
+
     // When this field is true, it means the token is the EOF token.
     EOF bool
 
@@ -82,44 +89,14 @@ type Token struct {
     match byteSequence
 }
 
-func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        KindID:     kindID,
-        ModeKindID: modeKindID,
-        KindName:   kindName,
-        match:      match,
-    }
-}
-
-func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        ModeKindID: 0,
-        EOF:        true,
-    }
-}
-
-func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token {
-    return &Token{
-        ModeID:     modeID,
-        ModeName:   modeName,
-        ModeKindID: 0,
-        match:      match,
-        Invalid:    true,
-    }
-}
-
 func (t *Token) String() string {
     if t.Invalid {
-        return fmt.Sprintf("!{mode id: %v, mode name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Text(), t.Match())
+        return fmt.Sprintf("!{mode id: %v, mode name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Row, t.Col, t.Text(), t.Match())
     }
     if t.EOF {
-        return "{eof}"
+        return fmt.Sprintf("{kind name: eof, row: %v, col: %v}", t.Row, t.Col)
     }
-    return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Text(), t.Match())
+    return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Row, t.Col, t.Text(), t.Match())
 }
 
 // Match returns a byte slice matched a pattern of a lexical specification.
@@ -139,6 +116,8 @@ func (t *Token) MarshalJSON() ([]byte, error) {
         KindID     int          `json:"kind_id"`
         ModeKindID int          `json:"mode_kind_id"`
         KindName   string       `json:"kind_name"`
+        Row        int          `json:"row"`
+        Col        int          `json:"col"`
         Match      byteSequence `json:"match"`
         Text       string       `json:"text"`
         EOF        bool         `json:"eof"`
@@ -149,6 +128,8 @@ func (t *Token) MarshalJSON() ([]byte, error) {
         KindID:     t.KindID.Int(),
         ModeKindID: t.ModeKindID.Int(),
         KindName:   t.KindName.String(),
+        Row:        t.Row,
+        Col:        t.Col,
         Match:      t.match,
         Text:       t.Text(),
         EOF:        t.EOF,
@@ -180,6 +161,10 @@ type Lexer struct {
     clspec          *spec.CompiledLexSpec
     src             []byte
     srcPtr          int
+    row             int
+    col             int
+    prevRow         int
+    prevCol         int
     tokBuf          []*Token
     modeStack       []spec.LexModeID
     passiveModeTran bool
@@ -195,6 +180,8 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) {
         clspec: clspec,
         src:    b,
         srcPtr: 0,
+        row:    0,
+        col:    0,
         modeStack: []spec.LexModeID{
             clspec.InitialModeID,
         },
@@ -302,6 +289,8 @@ func (l *Lexer) next() (*Token, error) {
     state := spec.DFA.InitialStateID
     buf := []byte{}
     unfixedBufLen := 0
+    row := l.row
+    col := l.col
     var tok *Token
     for {
         v, eof := l.read()
@@ -313,9 +302,24 @@
             // When `buf` has unaccepted data and reads the EOF,
             // the lexer treats the buffered data as an invalid token.
             if len(buf) > 0 {
-                return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
+                return &Token{
+                    ModeID:     mode,
+                    ModeName:   modeName,
+                    ModeKindID: 0,
+                    Row:        row,
+                    Col:        col,
+                    match:      newByteSequence(buf),
+                    Invalid:    true,
+                }, nil
             }
-            return newEOFToken(mode, modeName), nil
+            return &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                ModeKindID: 0,
+                Row:        0,
+                Col:        0,
+                EOF:        true,
+            }, nil
         }
         buf = append(buf, v)
         unfixedBufLen++
@@ -325,13 +329,30 @@
                 l.unread(unfixedBufLen)
                 return tok, nil
             }
-            return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
+            return &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                ModeKindID: 0,
+                Row:        row,
+                Col:        col,
+                match:      newByteSequence(buf),
+                Invalid:    true,
+            }, nil
         }
         state = nextState
         modeKindID := spec.DFA.AcceptingStates[state]
         if modeKindID != 0 {
             kindID := l.clspec.KindIDs[mode][modeKindID]
-            tok = newToken(mode, modeName, kindID, modeKindID, spec.KindNames[modeKindID], newByteSequence(buf))
+            tok = &Token{
+                ModeID:     mode,
+                ModeName:   modeName,
+                KindID:     kindID,
+                ModeKindID: modeKindID,
+                KindName:   spec.KindNames[modeKindID],
+                Row:        row,
+                Col:        col,
+                match:      newByteSequence(buf),
+            }
             unfixedBufLen = 0
         }
     }
@@ -384,11 +405,38 @@ func (l *Lexer) read() (byte, bool) {
     if l.srcPtr >= len(l.src) {
         return 0, true
     }
+
     b := l.src[l.srcPtr]
     l.srcPtr++
+
+    l.prevRow = l.row
+    l.prevCol = l.col
+
+    // Count the token positions.
+    // The driver treats LF as the end of lines and counts columns in code points, not bytes.
+    // To count in code points, we refer to the First Byte column in the Table 3-6.
+    //
+    // Reference:
+    // - [Table 3-6] https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf > Table 3-6. UTF-8 Bit Distribution
+    if b < 128 {
+        // 0x0A is LF.
+        if b == 0x0A {
+            l.row++
+            l.col = 0
+        } else {
+            l.col++
+        }
+    } else if b>>5 == 6 || b>>4 == 14 || b>>3 == 30 {
+        l.col++
+    }
+
     return b, false
 }
 
+// You must not call this function consecutively to record the token position correctly.
 func (l *Lexer) unread(n int) {
     l.srcPtr -= n
+
+    l.row = l.prevRow
+    l.col = l.prevCol
 }
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 5abe83c..33b206f 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -42,14 +42,50 @@ func newLexEntryFragment(kind string, pattern string) *spec.LexEntry {
     }
 }
 
+func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        KindID:     kindID,
+        ModeKindID: modeKindID,
+        KindName:   kindName,
+        match:      match,
+    }
+}
+
 func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token {
     return newToken(spec.LexModeIDDefault, spec.LexModeNameDefault, spec.LexKindID(kindID), spec.LexModeKindID(modeKindID), spec.LexKindName(kindName), match)
 }
 
+func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        ModeKindID: 0,
+        EOF:        true,
+    }
+}
+
 func newEOFTokenDefault() *Token {
     return newEOFToken(spec.LexModeIDDefault, spec.LexModeNameDefault)
 }
 
+func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token {
+    return &Token{
+        ModeID:     modeID,
+        ModeName:   modeName,
+        ModeKindID: 0,
+        match:      match,
+        Invalid:    true,
+    }
+}
+
+func withPos(tok *Token, row, col int) *Token {
+    tok.Row = row
+    tok.Col = col
+    return tok
+}
+
 func TestLexer_Next(t *testing.T) {
     test := []struct {
         lspec *spec.LexSpec
@@ -715,7 +751,7 @@ func TestLexer_Next(t *testing.T) {
                 t.Log(err)
                 break
             }
-            testToken(t, eTok, tok)
+            testToken(t, eTok, tok, false)
             // t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match(), tok.Text(), tok.EOF, tok.Invalid)
             if tok.EOF {
                 break
@@ -733,7 +769,102 @@ func TestLexer_Next(t *testing.T) {
     }
 }
 
-func testToken(t *testing.T, expected, actual *Token) {
+func TestLexer_Next_WithPosition(t *testing.T) {
+    lspec := &spec.LexSpec{
+        Entries: []*spec.LexEntry{
+            newLexEntryDefaultNOP("newline", `\u{000A}+`),
+            newLexEntryDefaultNOP("any", `.`),
+        },
+    }
+
+    clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax))
+    if err != nil {
+        t.Fatalf("unexpected error: %v", err)
+    }
+
+    src := string([]byte{
+        0x00,
+        0x7F,
+        0x0A,
+
+        0xC2, 0x80,
+        0xDF, 0xBF,
+        0x0A,
+
+        0xE0, 0xA0, 0x80,
+        0xE0, 0xBF, 0xBF,
+        0xE1, 0x80, 0x80,
+        0xEC, 0xBF, 0xBF,
+        0xED, 0x80, 0x80,
+        0xED, 0x9F, 0xBF,
+        0xEE, 0x80, 0x80,
+        0xEF, 0xBF, 0xBF,
+        0x0A,
+
+        0xF0, 0x90, 0x80, 0x80,
+        0xF0, 0xBF, 0xBF, 0xBF,
+        0xF1, 0x80, 0x80, 0x80,
+        0xF3, 0xBF, 0xBF, 0xBF,
+        0xF4, 0x80, 0x80, 0x80,
+        0xF4, 0x8F, 0xBF, 0xBF,
+        0x0A,
+        0x0A,
+        0x0A,
+    })
+
+    expected := []*Token{
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x00})), 0, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x7F})), 0, 1),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 0, 2),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xC2, 0x80})), 1, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xDF, 0xBF})), 1, 1),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 1, 2),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xA0, 0x80})), 2, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xBF, 0xBF})), 2, 1),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE1, 0x80, 0x80})), 2, 2),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEC, 0xBF, 0xBF})), 2, 3),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x80, 0x80})), 2, 4),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x9F, 0xBF})), 2, 5),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEE, 0x80, 0x80})), 2, 6),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEF, 0xBF, 0xBF})), 2, 7),
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 2, 8),
+
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0x90, 0x80, 0x80})), 3, 0),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0xBF, 0xBF, 0xBF})), 3, 1),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF1, 0x80, 0x80, 0x80})), 3, 2),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF3, 0xBF, 0xBF, 0xBF})), 3, 3),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x80, 0x80, 0x80})), 3, 4),
+        withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x8F, 0xBF, 0xBF})), 3, 5),
+
+        // When a token contains multiple line breaks, the driver sets the token position to
+        // the line number where a lexeme first appears.
+        withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A, 0x0A, 0x0A})), 3, 6),
+
+        withPos(newEOFTokenDefault(), 0, 0),
+    }
+
+    lexer, err := NewLexer(clspec, strings.NewReader(src))
+    if err != nil {
+        t.Fatalf("unexpected error: %v", err)
+    }
+
+    for _, eTok := range expected {
+        tok, err := lexer.Next()
+        if err != nil {
+            t.Fatal(err)
+        }
+
+        testToken(t, eTok, tok, true)
+
+        if tok.EOF {
+            break
+        }
+    }
+}
+
+func testToken(t *testing.T, expected, actual *Token, checkPosition bool) {
     t.Helper()
 
     if actual.ModeID != expected.ModeID ||
@@ -746,4 +877,10 @@ func testToken(t *testing.T, expected, actual *Token) {
         actual.Invalid != expected.Invalid {
         t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text())
     }
+
+    if checkPosition {
+        if actual.Row != expected.Row || actual.Col != expected.Col {
+            t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text())
+        }
+    }
 }
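
For reference, the column-counting rule in `read` can be exercised on its own: a byte starts a new code point exactly when it is ASCII or matches one of the UTF-8 lead-byte patterns 110xxxxx (`b>>5 == 6`), 1110xxxx (`b>>4 == 14`), or 11110xxx (`b>>3 == 30`), while continuation bytes 10xxxxxx leave the column untouched. Below is a self-contained sketch of the same rule; the `rowCol` helper is illustrative and not part of the patch.

    package main

    import "fmt"

    // rowCol applies the same lead-byte rule as Lexer.read to a whole byte
    // slice: LF ends a line, and only bytes that begin a UTF-8 code point
    // advance the column. Continuation bytes (10xxxxxx) are skipped, which
    // is why columns are counted in code points rather than bytes.
    func rowCol(src []byte) (row, col int) {
        for _, b := range src {
            switch {
            case b == 0x0A: // LF: next line, column resets.
                row++
                col = 0
            case b < 128: // ASCII.
                col++
            case b>>5 == 6, b>>4 == 14, b>>3 == 30: // 2-, 3-, and 4-byte lead bytes.
                col++
            }
        }
        return row, col
    }

    func main() {
        // "あ" is three bytes (0xE3 0x81 0x82) but advances the column by one.
        row, col := rowCol([]byte("a\nあb"))
        fmt.Println(row, col) // prints: 1 2
    }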