forkjo/vendor/github.com/couchbase/vellum/levenshtein/alphabet.go

126 lines
2.6 KiB
Go
Raw Normal View History

// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import (
"fmt"
"sort"
"unicode/utf8"
)
type FullCharacteristicVector []uint32
func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 {
bucketID := offset / 32
align := offset - bucketID*32
if align == 0 {
return fcv[bucketID] & mask
}
left := fcv[bucketID] >> align
right := fcv[bucketID+1] << (32 - align)
return (left | right) & mask
}
type tuple struct {
char rune
fcv FullCharacteristicVector
}
type sortRunes []rune
func (s sortRunes) Less(i, j int) bool {
return s[i] < s[j]
}
func (s sortRunes) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s sortRunes) Len() int {
return len(s)
}
func sortRune(r []rune) []rune {
sort.Sort(sortRunes(r))
return r
}
type Alphabet struct {
charset []tuple
index uint32
}
func (a *Alphabet) resetNext() {
a.index = 0
}
func (a *Alphabet) next() (rune, FullCharacteristicVector, error) {
if int(a.index) >= len(a.charset) {
return 0, nil, fmt.Errorf("eof")
}
rv := a.charset[a.index]
a.index++
return rv.char, rv.fcv, nil
}
func dedupe(in string) string {
lookUp := make(map[rune]struct{}, len(in))
var rv string
for len(in) > 0 {
r, size := utf8.DecodeRuneInString(in)
in = in[size:]
if _, ok := lookUp[r]; !ok {
rv += string(r)
lookUp[r] = struct{}{}
}
}
return rv
}
func queryChars(qChars string) Alphabet {
chars := dedupe(qChars)
inChars := sortRune([]rune(chars))
charsets := make([]tuple, 0, len(inChars))
for _, c := range inChars {
tempChars := qChars
var bits []uint32
for len(tempChars) > 0 {
var chunk string
if len(tempChars) > 32 {
chunk = tempChars[0:32]
tempChars = tempChars[32:]
} else {
chunk = tempChars
tempChars = tempChars[:0]
}
chunkBits := uint32(0)
bit := uint32(1)
for _, chr := range chunk {
if chr == c {
chunkBits |= bit
}
bit <<= 1
}
bits = append(bits, chunkBits)
}
bits = append(bits, 0)
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)})
}
return Alphabet{charset: charsets}
}