diff options
Diffstat (limited to 'src/urubu/grammar/item.go')
-rw-r--r-- | src/urubu/grammar/item.go | 206 |
1 files changed, 206 insertions, 0 deletions
diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go new file mode 100644 index 0000000..6c5fe42 --- /dev/null +++ b/src/urubu/grammar/item.go @@ -0,0 +1,206 @@ +package grammar + +import ( + "crypto/sha256" + "encoding/binary" + "fmt" + "sort" + "strconv" + + "urubu/grammar/symbol" +) + +type lrItemID [32]byte + +func (id lrItemID) String() string { + return fmt.Sprintf("%x", id.num()) +} + +func (id lrItemID) num() uint32 { + return binary.LittleEndian.Uint32(id[:]) +} + +type lookAhead struct { + symbols map[symbol.Symbol]struct{} + + // When propagation is true, an item propagates look-ahead symbols to other items. + propagation bool +} + +type lrItem struct { + id lrItemID + prod productionID + + // E → E + T + // + // Dot | Dotted Symbol | Item + // ----+---------------+------------ + // 0 | E | E →・E + T + // 1 | + | E → E・+ T + // 2 | T | E → E +・T + // 3 | Nil | E → E + T・ + dot int + dottedSymbol symbol.Symbol + + // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. + // It looks like S' →・S. + initial bool + + // When reducible is true, the item looks like E → E + T・. + reducible bool + + // When kernel is true, the item is kernel item. + kernel bool + + // lookAhead stores look-ahead symbols, and they are terminal symbols. + // The item is reducible only when the look-ahead symbols appear as the next input symbol. + lookAhead lookAhead +} + +func newLR0Item(prod *production, dot int) (*lrItem, error) { + if prod == nil { + return nil, fmt.Errorf("production must be non-nil") + } + + if dot < 0 || dot > prod.rhsLen { + return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) + } + + var id lrItemID + { + b := []byte{} + b = append(b, prod.id[:]...) + bDot := make([]byte, 8) + binary.LittleEndian.PutUint64(bDot, uint64(dot)) + b = append(b, bDot...) + id = sha256.Sum256(b) + } + + dottedSymbol := symbol.SymbolNil + if dot < prod.rhsLen { + dottedSymbol = prod.rhs[dot] + } + + initial := false + if prod.lhs.IsStart() && dot == 0 { + initial = true + } + + reducible := false + if dot == prod.rhsLen { + reducible = true + } + + kernel := false + if initial || dot > 0 { + kernel = true + } + + item := &lrItem{ + id: id, + prod: prod.id, + dot: dot, + dottedSymbol: dottedSymbol, + initial: initial, + reducible: reducible, + kernel: kernel, + } + + return item, nil +} + +type kernelID [32]byte + +func (id kernelID) String() string { + return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) +} + +type kernel struct { + id kernelID + items []*lrItem +} + +func newKernel(items []*lrItem) (*kernel, error) { + if len(items) == 0 { + return nil, fmt.Errorf("a kernel need at least one item") + } + + // Remove duplicates from items. + var sortedItems []*lrItem + { + m := map[lrItemID]*lrItem{} + for _, item := range items { + if !item.kernel { + return nil, fmt.Errorf("not a kernel item: %v", item) + } + m[item.id] = item + } + sortedItems = []*lrItem{} + for _, item := range m { + sortedItems = append(sortedItems, item) + } + sort.Slice(sortedItems, func(i, j int) bool { + return sortedItems[i].id.num() < sortedItems[j].id.num() + }) + } + + var id kernelID + { + b := []byte{} + for _, item := range sortedItems { + b = append(b, item.id[:]...) + } + id = sha256.Sum256(b) + } + + return &kernel{ + id: id, + items: sortedItems, + }, nil +} + +type stateNum int + +const stateNumInitial = stateNum(0) + +func (n stateNum) Int() int { + return int(n) +} + +func (n stateNum) String() string { + return strconv.Itoa(int(n)) +} + +func (n stateNum) next() stateNum { + return stateNum(n + 1) +} + +type lrState struct { + *kernel + num stateNum + next map[symbol.Symbol]kernelID + reducible map[productionID]struct{} + + // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. + // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store + // look-ahead symbols because the kernel items don't include these items. + // + // For instance, we have the following productions, and A is a terminal symbol. + // + // s' → s + // s → A | ε + // + // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't + // include `s → ・ε`. + // + // s' → ・s + // s → ・A + // s → ・ε + emptyProdItems []*lrItem + + // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. + // The `α` and `β` can be empty. + // + // A → α・error β + isErrorTrapper bool +} |