2023-05-09 18:19:48 +01:00
|
|
|
package utilities
|
|
|
|
|
|
|
|
import (
|
|
|
|
"sort"
|
|
|
|
)
|
|
|
|
|
|
|
|
// DoubleArray is a trie over sequences of string tokens, stored in the
// compact "double array" (base/check) layout.
//
// Node 0 is the root. The child of node i along the edge labelled with token
// code c lives at index j = Base[i] + c, and that slot belongs to node i only
// when Check[j] == i+1. The code len(Encoding) is reserved as the
// end-of-sequence marker.
type DoubleArray struct {
	// Encoding maps each known token to its integer code.
	Encoding map[string]int
	// Base holds, per node index, the offset added to a token code to find
	// the slot of the corresponding child.
	Base []int
	// Check holds, per slot, the index of the owning parent node plus one;
	// zero marks a slot that is not in use.
	Check []int
}
|
|
|
|
|
|
|
|
// NewDoubleArray builds a DoubleArray from a set of sequences of strings.
|
|
|
|
func NewDoubleArray(seqs [][]string) *DoubleArray {
|
|
|
|
da := &DoubleArray{Encoding: make(map[string]int)}
|
|
|
|
if len(seqs) == 0 {
|
|
|
|
return da
|
|
|
|
}
|
|
|
|
|
|
|
|
encoded := registerTokens(da, seqs)
|
|
|
|
sort.Sort(byLex(encoded))
|
|
|
|
|
|
|
|
root := node{row: -1, col: -1, left: 0, right: len(encoded)}
|
|
|
|
addSeqs(da, encoded, 0, root)
|
|
|
|
|
|
|
|
for i := len(da.Base); i > 0; i-- {
|
|
|
|
if da.Check[i-1] != 0 {
|
|
|
|
da.Base = da.Base[:i]
|
|
|
|
da.Check = da.Check[:i]
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return da
|
|
|
|
}
|
|
|
|
|
|
|
|
func registerTokens(da *DoubleArray, seqs [][]string) [][]int {
|
|
|
|
var result [][]int
|
|
|
|
for _, seq := range seqs {
|
2023-09-07 12:20:37 +01:00
|
|
|
encoded := make([]int, 0, len(seq))
|
2023-05-09 18:19:48 +01:00
|
|
|
for _, token := range seq {
|
|
|
|
if _, ok := da.Encoding[token]; !ok {
|
|
|
|
da.Encoding[token] = len(da.Encoding)
|
|
|
|
}
|
|
|
|
encoded = append(encoded, da.Encoding[token])
|
|
|
|
}
|
|
|
|
result = append(result, encoded)
|
|
|
|
}
|
|
|
|
for i := range result {
|
|
|
|
result[i] = append(result[i], len(da.Encoding))
|
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
// node describes one trie node while the double array is being built. It is
// a view onto the sorted, encoded sequences rather than a stored structure.
type node struct {
	// row and col locate the token code this node represents: seqs[row][col].
	// The synthetic root uses row = col = -1 and has no value.
	row, col int
	// left and right delimit the half-open range [left, right) of rows that
	// pass through this node.
	left, right int
}

// value returns the token code the node stands for. It must not be called on
// the synthetic root, whose row and col are -1.
func (n node) value(seqs [][]int) int {
	return seqs[n.row][n.col]
}

// children returns one node per distinct token code found in column n.col+1
// of the rows [n.left, n.right), in row order. Each child covers the
// contiguous run of rows carrying its code.
//
// seqs must be sorted lexicographically so that equal codes in that column
// are adjacent within the range. An empty range yields a nil slice.
func (n node) children(seqs [][]int) []*node {
	var result []*node
	lastVal := -1 // token codes are never negative, so -1 matches nothing
	last := new(node)
	for i := n.left; i < n.right; i++ {
		val := seqs[i][n.col+1]
		if val == lastVal {
			// Same code as the previous row: still inside the current child.
			continue
		}
		// Remember the code so the following rows that share it are grouped
		// under this child instead of each spawning a duplicate child.
		lastVal = val
		// A new run starts here, which closes the previous child's range.
		last.right = i
		last = &node{
			row:  i,
			col:  n.col + 1,
			left: i,
		}
		result = append(result, last)
	}
	// The final child extends to the end of the parent's range.
	last.right = n.right
	return result
}
|
|
|
|
|
|
|
|
func addSeqs(da *DoubleArray, seqs [][]int, pos int, n node) {
|
|
|
|
ensureSize(da, pos)
|
|
|
|
|
|
|
|
children := n.children(seqs)
|
|
|
|
var i int
|
|
|
|
for i = 1; ; i++ {
|
|
|
|
ok := func() bool {
|
|
|
|
for _, child := range children {
|
|
|
|
code := child.value(seqs)
|
|
|
|
j := i + code
|
|
|
|
ensureSize(da, j)
|
|
|
|
if da.Check[j] != 0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}()
|
|
|
|
if ok {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
da.Base[pos] = i
|
|
|
|
for _, child := range children {
|
|
|
|
code := child.value(seqs)
|
|
|
|
j := i + code
|
|
|
|
da.Check[j] = pos + 1
|
|
|
|
}
|
|
|
|
terminator := len(da.Encoding)
|
|
|
|
for _, child := range children {
|
|
|
|
code := child.value(seqs)
|
|
|
|
if code == terminator {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
j := i + code
|
|
|
|
addSeqs(da, seqs, j, *child)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func ensureSize(da *DoubleArray, i int) {
|
|
|
|
for i >= len(da.Base) {
|
|
|
|
da.Base = append(da.Base, make([]int, len(da.Base)+1)...)
|
|
|
|
da.Check = append(da.Check, make([]int, len(da.Check)+1)...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// byLex implements sort.Interface, ordering integer sequences
// lexicographically; a sequence sorts before any longer sequence it is a
// prefix of.
type byLex [][]int

// Len returns the number of sequences.
func (l byLex) Len() int {
	return len(l)
}

// Swap exchanges the sequences at positions i and j.
func (l byLex) Swap(i, j int) {
	l[j], l[i] = l[i], l[j]
}

// Less reports whether the sequence at i sorts strictly before the one at j.
func (l byLex) Less(i, j int) bool {
	a, b := l[i], l[j]

	shared := len(a)
	if len(b) < shared {
		shared = len(b)
	}
	for k := 0; k < shared; k++ {
		if a[k] != b[k] {
			return a[k] < b[k]
		}
	}
	// Equal over the shared length: the shorter sequence comes first.
	return len(a) < len(b)
}
|
|
|
|
|
|
|
|
// HasCommonPrefix determines if any sequence in the DoubleArray is a prefix of the given sequence.
|
|
|
|
func (da *DoubleArray) HasCommonPrefix(seq []string) bool {
|
|
|
|
if len(da.Base) == 0 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
var i int
|
|
|
|
for _, t := range seq {
|
|
|
|
code, ok := da.Encoding[t]
|
|
|
|
if !ok {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
j := da.Base[i] + code
|
|
|
|
if len(da.Check) <= j || da.Check[j] != i+1 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
i = j
|
|
|
|
}
|
|
|
|
j := da.Base[i] + len(da.Encoding)
|
|
|
|
if len(da.Check) <= j || da.Check[j] != i+1 {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
}
|