"""
Minhash encoding of string arrays.
The principle is as follows:
1. A string is viewed as a succession of numbers (the ASCII or UTF8
representation of its elements).
2. The string is then decomposed into a set of n-grams, i.e.
n-dimensional vectors of integers.
3. A hashing function is used to assign an integer to each n-gram.
The minimum of the hashes over all n-grams is used in the encoding.
4. This process is repeated with N hashing functions to form
N-dimensional encodings.
Maxhash encodings can be computed similarly by taking the hashes maximum
instead.
With this procedure, strings that share many n-grams have greater
probability of having same encoding values. These encodings thus capture
morphological similarities between strings.
"""
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import murmurhash3_32
from .fast_hash import ngram_min_hash
from .utils import LRUDict, check_input
class MinHashEncoder(BaseEstimator, TransformerMixin):
"""
Encode string categorical features as a numeric array, using the
minhash method applied to the ngram decomposition of the strings.
Parameters
----------
n_components : int, default=30
The number of dimension of encoded strings. Numbers around 300 tend to
lead to good prediction performance, but with more computational cost.
ngram_range : tuple (min_n, max_n), default=(2, 4)
The lower and upper boundary of the range of n-values for different
n-grams to be extracted. All values of n such that min_n <= n <= max_n.
will be used.
hashing : str {'fast', 'murmur'}, default='fast'
Hashing function. 'fast' is faster than 'murmur' but
might have some concerns with its entropy.
minmax_hash : bool, default=False
if True, return min hash and max hash concatenated.
handle_missing : 'error' or 'zero_impute' (default)
Whether to raise an error or encode missing values (NaN) with
vectors filled with zeros.
References
----------
For a detailed description of the method, see
`Encoding high-cardinality string categorical variables
<https://hal.inria.fr/hal-02171256v4>`_ by Cerda, Varoquaux (2019).
"""
_capacity = 2 ** 10
def __init__(self, n_components=30, ngram_range=(2, 4),
hashing='fast', minmax_hash=False,
handle_missing='zero_impute'):
self.ngram_range = ngram_range
self.n_components = n_components
self.hashing = hashing
self.minmax_hash = minmax_hash
self.handle_missing = handle_missing
def _more_tags(self):
"""
Used internally by sklearn to ease the estimator checks.
"""
return {"X_types": ["categorical"]}
[docs] def get_unique_ngrams(self, string, ngram_range):
""" Return the set of unique n-grams of a string.
Parameters
----------
string : str
The string to split in n-grams.
ngram_range : tuple (min_n, max_n)
The lower and upper boundary of the range of n-values for different
n-grams to be extracted. All values of n such that min_n <= n <= max_n.
Returns
-------
set
The set of unique n-grams of the string.
"""
spaces = ' ' # * (n // 2 + n % 2)
string = spaces + " ".join(string.lower().split()) + spaces
ngram_set = set()
for n in range(ngram_range[0], ngram_range[1] + 1):
string_list = [string[i:] for i in range(n)]
ngram_set |= set(zip(*string_list))
return ngram_set
[docs] def minhash(self, string, n_components, ngram_range):
""" Encode a string using murmur hashing function.
Parameters
----------
string : str
The string to encode.
n_components : int
The number of dimension of encoded string.
ngram_range : tuple (min_n, max_n)
The lower and upper boundary of the range of n-values for different
n-grams to be extracted. All values of n such that min_n <= n <= max_n.
Returns
-------
array, shape (n_components, )
The encoded string.
"""
min_hashes = np.ones(n_components) * np.infty
grams = self.get_unique_ngrams(string, self.ngram_range)
if len(grams) == 0:
grams = self.get_unique_ngrams(' Na ', self.ngram_range)
for gram in grams:
hash_array = np.array([
murmurhash3_32(''.join(gram), seed=d, positive=True)
for d in range(n_components)])
min_hashes = np.minimum(min_hashes, hash_array)
return min_hashes / (2 ** 32 - 1)
[docs] def get_fast_hash(self, string):
"""
Encode a string with fast hashing function.
fast hashing supports both min_hash and minmax_hash encoding.
Parameters
----------
string : str
The string to encode.
Returns
-------
array, shape (n_components, )
The encoded string, using specified encoding scheme.
"""
if self.minmax_hash:
return np.concatenate([ngram_min_hash(string, self.ngram_range,
seed, return_minmax=True)
for seed in range(self.n_components // 2)])
else:
return np.array([ngram_min_hash(string, self.ngram_range, seed)
for seed in range(self.n_components)])
[docs] def fit(self, X, y=None):
"""
Fit the MinHashEncoder to X. In practice, just initializes a dictionary
to store encodings to speed up computation.
Parameters
----------
X : array-like, shape (n_samples, ) or (n_samples, 1)
The string data to encode.
Returns
-------
self
The fitted MinHashEncoder instance.
"""
self.count = 0
self.hash_dict = LRUDict(capacity=self._capacity)
return self