-rw-r--r--  README.md                   108
-rw-r--r--  cmd/maleeni-go/generate.go   68
-rw-r--r--  cmd/maleeni-go/main.go       12
-rw-r--r--  driver/template.go          517
4 files changed, 668 insertions, 37 deletions
diff --git a/README.md b/README.md
index 44e5a6f..e73d865 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,27 @@
# maleeni
-maleeni provides a command that generates a portable DFA for lexical analysis and a driver for golang. maleeni also provides a command to perform lexical analysis to allow easy debugging of your lexical specification.
+maleeni is a lexer generator for golang. maleeni also provides a command that performs lexical analysis, allowing you to easily debug your lexical specification.
[![Test](https://github.com/nihei9/maleeni/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/nihei9/maleeni/actions/workflows/test.yml)
## Installation
+Compiler:
+
```sh
$ go install github.com/nihei9/maleeni/cmd/maleeni@latest
```
+Code Generator:
+
+```sh
+$ go install github.com/nihei9/maleeni/cmd/maleeni-go@latest
+```
+
## Usage
+### 1. Define your lexical specification
+
First, define your lexical specification in JSON format. As an example, let's write the definitions of whitespace, words, and punctuation.
```json
@@ -35,14 +45,17 @@ First, define your lexical specification in JSON format. As an example, let's wr
Save the above specification to a file in UTF-8. In this explanation, the file name is lexspec.json.
+### 2. Compile the lexical specification
+
Next, generate a DFA from the lexical specification using the `maleeni compile` command.
```sh
$ maleeni compile -l lexspec.json -o clexspec.json
```
-If you want to make sure that the lexical specification behaves as expected, you can use `maleeni lex` command to try lexical analysis without having to implement a driver.
-`maleeni lex` command outputs tokens in JSON format. For simplicity, print significant fields of the tokens in CSV format using jq command.
+### 3. Debug (Optional)
+
+If you want to make sure that the lexical specification behaves as expected, you can use the `maleeni lex` command to try lexical analysis without having to generate a lexer. The `maleeni lex` command outputs tokens in JSON format. For simplicity, print the significant fields of the tokens in CSV format using the jq command.
⚠️ The only encoding that `maleeni lex` and the driver can handle is UTF-8.
@@ -76,50 +89,71 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
| eof | bool | When this field is `true`, it means the token is the EOF token. |
| invalid | bool | When this field is `true`, it means the token is an error token. |
-When using the driver, please import `github.com/nihei9/maleeni/driver` and `github.com/nihei9/maleeni/spec` package.
-You can use the driver easily in the following way:
+### 4. Generate the lexer
+
+Using the `maleeni-go` command, you can generate the source code of a lexer that recognizes your lexical specification.
+
+```sh
+$ maleeni-go clexspec.json > lexer.go
+```
+
+The above command generates the lexer and saves it to the `lexer.go` file. To use the lexer, you need to call the `NewLexer` function defined in `lexer.go`. The following code is a simple example in which the lexer reads source text from stdin and writes the resulting tokens to stdout.
```go
-// Read your lexical specification file.
-f, err := os.Open(path)
-if err != nil {
- // error handling
-}
-data, err := ioutil.ReadAll(f)
-if err != nil {
- // error handling
-}
-clexspec := &spec.CompiledLexSpec{}
-err = json.Unmarshal(data, clexspec)
-if err != nil {
- // error handling
-}
+package main
-// Generate a lexer.
-lex, err := driver.NewLexer(clexspec, src)
-if err != nil {
- // error handling
-}
+import (
+ "fmt"
+ "os"
+)
-// Perform lexical analysis.
-for {
- tok, err := lex.Next()
+func main() {
+ lex, err := NewLexer(NewLexSpec(), os.Stdin)
if err != nil {
- // error handling
- }
- if tok.Invalid {
- // An error token appeared.
- // error handling
- }
- if tok.EOF {
- // The EOF token appeared.
- break
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(1)
}
- // Do something using `tok`.
+ for {
+ tok, err := lex.Next()
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(1)
+ }
+ if tok.EOF {
+ break
+ }
+ if tok.Invalid {
+ fmt.Printf("invalid: '%v'\n", string(tok.Lexeme))
+ } else {
+ fmt.Printf("valid: %v: '%v'\n", tok.KindName, string(tok.Lexeme))
+ }
+ }
}
```
+Please save the above source code to `main.go` and create a directory structure like the one below.
+
+```
+/project_root
+├── lexer.go ... Lexer generated from the compiled lexical specification (the result of `maleeni-go`).
+└── main.go .... Caller of the lexer.
+```
+
+Now, you can perform the lexical analysis.
+
+```sh
+$ echo -n 'I want to believe.' | go run main.go lexer.go
+valid: word: 'I'
+valid: whitespace: ' '
+valid: word: 'want'
+valid: whitespace: ' '
+valid: word: 'to'
+valid: whitespace: ' '
+valid: word: 'believe'
+valid: punctuation: '.'
+```
+
## More Practical Usage
See also [this example](example/README.md).
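
Beyond printing every token, a caller usually dispatches on the token kind. A minimal sketch of that pattern, relying only on the API visible in the README example above (`NewLexSpec`, `NewLexer`, `Next`, and the `KindName`/`EOF`/`Invalid` fields) and assuming the second argument of `NewLexer` accepts any `io.Reader`, as the `os.Stdin` usage suggests:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// main drives the generated lexer over a fixed input and counts "word" tokens.
// It is meant to be built alongside the lexer.go produced by maleeni-go.
func main() {
	lex, err := NewLexer(NewLexSpec(), strings.NewReader("I want to believe."))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	words := 0
	for {
		tok, err := lex.Next()
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		if tok.EOF {
			break
		}
		if tok.Invalid {
			continue // ignore error tokens for this count
		}
		if tok.KindName == "word" {
			words++
		}
	}
	fmt.Println("words:", words) // words: 4
}
```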
diff --git a/cmd/maleeni-go/generate.go b/cmd/maleeni-go/generate.go
new file mode 100644
index 0000000..d37defd
--- /dev/null
+++ b/cmd/maleeni-go/generate.go
@@ -0,0 +1,68 @@
+package main
+
+import (
+ _ "embed"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "github.com/nihei9/maleeni/driver"
+ "github.com/nihei9/maleeni/spec"
+ "github.com/spf13/cobra"
+)
+
+func Execute() error {
+ err := generateCmd.Execute()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "%v\n", err)
+ return err
+ }
+
+ return nil
+}
+
+var generateFlags = struct {
+ pkgName *string
+}{}
+
+var generateCmd = &cobra.Command{
+ Use: "maleeni-go",
+ Short: "Generate a lexer for Go",
+ Long: `maleeni-go generates a lexer for Go. The lexer recognizes the lexical specification specified as the argument.`,
+ Example: ` maleeni-go clexspec.json > lexer.go`,
+ Args: cobra.ExactArgs(1),
+ RunE: runGenerate,
+ SilenceErrors: true,
+ SilenceUsage: true,
+}
+
+func init() {
+ generateFlags.pkgName = generateCmd.Flags().StringP("package", "p", "main", "package name")
+}
+
+func runGenerate(cmd *cobra.Command, args []string) (retErr error) {
+ clspec, err := readCompiledLexSpec(args[0])
+ if err != nil {
+ return fmt.Errorf("Cannot read a compiled lexical specification: %w", err)
+ }
+
+ return driver.GenLexer(clspec, *generateFlags.pkgName)
+}
+
+func readCompiledLexSpec(path string) (*spec.CompiledLexSpec, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+ data, err := ioutil.ReadAll(f)
+ if err != nil {
+ return nil, err
+ }
+ clspec := &spec.CompiledLexSpec{}
+ err = json.Unmarshal(data, clspec)
+ if err != nil {
+ return nil, err
+ }
+ return clspec, nil
+}
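
For reference, the only flag registered in `init` above is `--package`/`-p` (default `main`), and the generated source goes to stdout, so targeting another package is just a matter of passing the flag and redirecting. The output path below is illustrative:

```sh
$ maleeni-go --package parser clexspec.json > parser/lexer.go
```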
diff --git a/cmd/maleeni-go/main.go b/cmd/maleeni-go/main.go
new file mode 100644
index 0000000..701f02f
--- /dev/null
+++ b/cmd/maleeni-go/main.go
@@ -0,0 +1,12 @@
+package main
+
+import (
+ "os"
+)
+
+func main() {
+ err := Execute()
+ if err != nil {
+ os.Exit(1)
+ }
+}
diff --git a/driver/template.go b/driver/template.go
new file mode 100644
index 0000000..f7caa75
--- /dev/null
+++ b/driver/template.go
@@ -0,0 +1,517 @@
+package driver
+
+import (
+ _ "embed"
+ "fmt"
+ "go/ast"
+ "go/format"
+ "go/parser"
+ "go/token"
+ "os"
+ "strings"
+ "text/template"
+
+ "github.com/nihei9/maleeni/spec"
+)
+
+//go:embed lexer.go
+var lexerCoreSrc string
+
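+// GenLexer renders the embedded lexer core together with the spec-specific tables
+// derived from clspec, sets the package clause to pkgName, and writes the
+// formatted result to stdout.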
+func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error {
+ var lexerSrc string
+ {
+ fset := token.NewFileSet()
+ f, err := parser.ParseFile(fset, "lexer.go", lexerCoreSrc, parser.ParseComments)
+ if err != nil {
+ return err
+ }
+
+ var b strings.Builder
+ err = format.Node(&b, fset, f)
+ if err != nil {
+ return err
+ }
+
+ lexerSrc = b.String()
+ }
+
+ var specSrc string
+ {
+ t, err := template.New("").Funcs(genTemplateFuncs(clspec)).Parse(lexSpecTemplate)
+ if err != nil {
+ return err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, map[string]interface{}{
+ "initialModeID": clspec.InitialModeID,
+ "modeIDNil": spec.LexModeIDNil,
+ "modeKindIDNil": spec.LexModeKindIDNil,
+ "stateIDNil": spec.StateIDNil,
+ "compressionLevel": clspec.CompressionLevel,
+ })
+ if err != nil {
+ return err
+ }
+
+ specSrc = b.String()
+ }
+
+ var src string
+ {
+ tmpl := `// Code generated by maleeni-go. DO NOT EDIT.
+{{ .lexerSrc }}
+
+{{ .specSrc }}
+`
+
+ t, err := template.New("").Parse(tmpl)
+ if err != nil {
+ return err
+ }
+
+ var b strings.Builder
+ err = t.Execute(&b, map[string]string{
+ "lexerSrc": lexerSrc,
+ "specSrc": specSrc,
+ })
+ if err != nil {
+ return err
+ }
+
+ src = b.String()
+ }
+
+ fset := token.NewFileSet()
+ f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+ if err != nil {
+ return err
+ }
+
+ f.Name = ast.NewIdent(pkgName)
+
+ return format.Node(os.Stdout, fset, f)
+}
+
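+// lexSpecTemplate is the text/template for the spec-specific half of the generated
+// file; the gen* functions registered in genTemplateFuncs fill in the table literals.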
+const lexSpecTemplate = `
+type lexSpec struct {
+ pop [][]bool
+ push [][]ModeID
+ modeNames []string
+ initialStates []StateID
+ acceptances [][]ModeKindID
+ kindIDs [][]KindID
+ kindNames []string
+ initialModeID ModeID
+ modeIDNil ModeID
+ modeKindIDNil ModeKindID
+ stateIDNil StateID
+
+ rowNums [][]int
+ rowDisplacements [][]int
+ bounds [][]int
+ entries [][]StateID
+ originalColCounts []int
+}
+
+func NewLexSpec() *lexSpec {
+ return &lexSpec{
+ pop: {{ genPopTable }},
+ push: {{ genPushTable }},
+ modeNames: {{ genModeNameTable }},
+ initialStates: {{ genInitialStateTable }},
+ acceptances: {{ genAcceptTable }},
+ kindIDs: {{ genKindIDTable }},
+ kindNames: {{ genKindNameTable }},
+ initialModeID: {{ .initialModeID }},
+ modeIDNil: {{ .modeIDNil }},
+ modeKindIDNil: {{ .modeKindIDNil }},
+ stateIDNil: {{ .stateIDNil }},
+
+ rowNums: {{ genRowNums }},
+ rowDisplacements: {{ genRowDisplacements }},
+ bounds: {{ genBounds }},
+ entries: {{ genEntries }},
+ originalColCounts: {{ genOriginalColCounts }},
+ }
+}
+
+func (s *lexSpec) InitialMode() ModeID {
+ return s.initialModeID
+}
+
+func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool {
+ return s.pop[mode][modeKind]
+}
+
+func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) {
+ id := s.push[mode][modeKind]
+ return id, id != s.modeIDNil
+}
+
+func (s *lexSpec) ModeName(mode ModeID) string {
+ return s.modeNames[mode]
+}
+
+func (s *lexSpec) InitialState(mode ModeID) StateID {
+ return s.initialStates[mode]
+}
+
+func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) {
+{{ if eq .compressionLevel 2 -}}
+ rowNum := s.rowNums[mode][state]
+ d := s.rowDisplacements[mode][rowNum]
+ if s.bounds[mode][d+v] != rowNum {
+ return s.stateIDNil, false
+ }
+ return s.entries[mode][d+v], true
+{{ else if eq .compressionLevel 1 -}}
+ rowNum := s.rowNums[mode][state]
+ colCount := s.originalColCounts[mode]
+ next := s.entries[mode][rowNum*colCount+v]
+ if next == s.stateIDNil {
+ return s.stateIDNil, false
+ }
+ return next, true
+{{ else -}}
+ colCount := s.originalColCounts[mode]
+ next := s.entries[mode][int(state)*colCount+v]
+ if next == s.stateIDNil {
+ return s.stateIDNil, false
+ }
+ return next, true
+{{ end -}}
+}
+
+func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) {
+ id := s.acceptances[mode][state]
+ return id, id != s.modeKindIDNil
+}
+
+func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) {
+ id := s.kindIDs[mode][modeKind]
+ return id, s.kindNames[id]
+}
+`
+
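+// genTemplateFuncs returns the template functions that render the transition tables.
+// Which tables are emitted (and which are nil) depends on clspec.CompressionLevel.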
+func genTemplateFuncs(clspec *spec.CompiledLexSpec) template.FuncMap {
+ fns := template.FuncMap{
+ "genPopTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]bool{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.Pop[0] != 0)
+ for _, v := range s.Pop[1:] {
+ fmt.Fprintf(&b, ", %v", v != 0)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genPushTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]ModeID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.Push[0])
+ for _, v := range s.Push[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genModeNameTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]string{\n")
+ for i, name := range clspec.ModeNames {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "%#v,\n", "")
+ continue
+ }
+
+ fmt.Fprintf(&b, "%#v,\n", name)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genInitialStateTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]StateID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "%v,\n", spec.StateIDNil)
+ continue
+ }
+
+ fmt.Fprintf(&b, "%v,\n", s.DFA.InitialStateID)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genAcceptTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]ModeKindID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.AcceptingStates[0])
+ for _, v := range s.DFA.AcceptingStates[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genKindIDTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]KindID{\n")
+ for i, ids := range clspec.KindIDs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", ids[0])
+ for _, v := range ids[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ "genKindNameTable": func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]string{\n")
+ for i, name := range clspec.KindNames {
+ if i == spec.LexKindIDNil.Int() {
+ fmt.Fprintf(&b, "%#v,\n", "")
+ continue
+ }
+
+ fmt.Fprintf(&b, "%#v,\n", name)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ },
+ }
+
+ switch clspec.CompressionLevel {
+ case 2:
+ fns["genRowNums"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.RowNums[0])
+ for _, v := range s.DFA.Transition.RowNums[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genRowDisplacements"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.UniqueEntries.RowDisplacement[0])
+ for _, d := range s.DFA.Transition.UniqueEntries.RowDisplacement[1:] {
+ fmt.Fprintf(&b, ", %v", d)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genBounds"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.UniqueEntries.Bounds[0])
+ for _, v := range s.DFA.Transition.UniqueEntries.Bounds[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genEntries"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]StateID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.UniqueEntries.Entries[0])
+ for _, v := range s.DFA.Transition.UniqueEntries.Entries[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genOriginalColCounts"] = func() string {
+ return "nil"
+ }
+ case 1:
+ fns["genRowNums"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.RowNums[0])
+ for _, v := range s.DFA.Transition.RowNums[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genRowDisplacements"] = func() string {
+ return "nil"
+ }
+
+ fns["genBounds"] = func() string {
+ return "nil"
+ }
+
+ fns["genEntries"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]StateID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.Transition.UncompressedUniqueEntries[0])
+ for _, v := range s.DFA.Transition.UncompressedUniqueEntries[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genOriginalColCounts"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "0,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "%v,\n", s.DFA.Transition.OriginalColCount)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+ default:
+ fns["genRowNums"] = func() string {
+ return "nil"
+ }
+
+ fns["genRowDisplacements"] = func() string {
+ return "nil"
+ }
+
+ fns["genBounds"] = func() string {
+ return "nil"
+ }
+
+ fns["genEntries"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[][]StateID{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "nil,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "{")
+ fmt.Fprintf(&b, "%v", s.DFA.UncompressedTransition[0])
+ for _, v := range s.DFA.UncompressedTransition[1:] {
+ fmt.Fprintf(&b, ", %v", v)
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+
+ fns["genOriginalColCounts"] = func() string {
+ var b strings.Builder
+ fmt.Fprintf(&b, "[]int{\n")
+ for i, s := range clspec.Specs {
+ if i == spec.LexModeIDNil.Int() {
+ fmt.Fprintf(&b, "0,\n")
+ continue
+ }
+
+ fmt.Fprintf(&b, "%v,\n", s.DFA.ColCount)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+ }
+ }
+
+ return fns
+}
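
The compression-level-2 branch of `NextState` in the template above assumes a row-displacement encoding: each row of the original transition matrix is shifted by a per-row displacement and merged into a single `entries` array, a parallel `bounds` array records which row owns each slot, and `rowNums` lets identical rows be shared between states. A self-contained sketch of that lookup with made-up data (the names mirror the generated code, but none of the values come from a real spec):

```go
package main

import "fmt"

// rowDisplacementTable is a toy version of the tables the generated lexSpec
// holds at compression level 2. Every value below is invented for illustration.
type rowDisplacementTable struct {
	rowNums      []int // state -> row number; states with identical rows share one
	displacement []int // row number -> offset of that row inside bounds/entries
	bounds       []int // packed array recording which row number owns each slot
	entries      []int // packed array of next states, aligned with bounds
}

// next mirrors the compression-level-2 branch of lexSpec.NextState: a slot is
// valid only if the bounds check confirms it belongs to this state's row.
func (t *rowDisplacementTable) next(state, symbol int) (int, bool) {
	row := t.rowNums[state]
	d := t.displacement[row]
	if t.bounds[d+symbol] != row {
		return 0, false // no transition; 0 plays the role of stateIDNil here
	}
	return t.entries[d+symbol], true
}

func main() {
	// Two symbol columns. Row 0 defines only column 0 and row 1 only column 1,
	// so both rows pack into a single two-slot array with displacement 0.
	// States 1 and 2 have identical rows and therefore share row number 1.
	t := &rowDisplacementTable{
		rowNums:      []int{0, 1, 1},
		displacement: []int{0, 0},
		bounds:       []int{0, 1},
		entries:      []int{1, 2},
	}
	fmt.Println(t.next(0, 0)) // 1 true
	fmt.Println(t.next(0, 1)) // 0 false: slot 1 is owned by row 1, not row 0
	fmt.Println(t.next(2, 1)) // 2 true: state 2 reuses row 1's packed slots
}
```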