"""
This module implements the SuperVectorizer, a preprocessor used to
automatically apply encoders to different types of data, without the need
to manually categorize them beforehand or to construct complex Pipelines.
"""
# Author: Lilian Boulard <lilian@boulard.fr> | https://github.com/LilianBoulard
import sklearn
import numpy as np
import pandas as pd
from warnings import warn
from typing import Union, Optional, List
from sklearn.base import BaseEstimator, clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import __version__ as sklearn_version
from dirty_cat import GapEncoder, DatetimeEncoder
from dirty_cat.utils import Version, check_input
def _has_missing_values(df: Union[pd.DataFrame, pd.Series]) -> bool:
"""
    Returns True if `df` contains missing values, False otherwise.
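
    A doctest-style sketch of the expected behaviour:

    >>> _has_missing_values(pd.Series([1.0, np.nan]))
    True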
"""
    # `.to_numpy()` works for both Series and DataFrame inputs
    # (iterating a DataFrame with `any()` would yield column labels).
    return bool(df.isnull().to_numpy().any())
def _replace_missing_in_col(df: pd.Series, value: str = "missing") -> pd.Series:
"""
Takes a Series with string data, replaces the missing values, and returns it.
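
    A doctest-style sketch of the expected behaviour:

    >>> _replace_missing_in_col(pd.Series(['a', None], dtype='category')).tolist()
    ['a', 'missing']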
"""
dtype_name = df.dtype.name
if dtype_name == 'category' and (value not in df.cat.categories):
df = df.cat.add_categories(value)
df = df.fillna(value=value)
return df
class SuperVectorizer(ColumnTransformer):
"""
Easily transforms a heterogeneous data table (such as a dataframe) to
    a numerical array for machine learning. To do so, it transforms each
    column depending on its data type.
It provides a simplified interface for scikit-learn's `ColumnTransformer`.
.. versionadded:: 0.2.0
Parameters
----------
cardinality_threshold: int, default=40
        Two lists of features will be created depending on this value: columns
        whose cardinality (number of unique values) is strictly below this
        threshold are considered low-cardinality categorical features, and
        those at or above it high-cardinality ones.
Different encoders will be applied to these two groups, defined by
the parameters `low_card_cat_transformer` and
`high_card_cat_transformer` respectively.
low_card_cat_transformer: Transformer or str or None, default=None
Transformer used on categorical/string features with low cardinality
(threshold is defined by `cardinality_threshold`).
Default value None is converted to `OneHotEncoder()`.
Can either be a transformer object instance (e.g. `OneHotEncoder()`),
a `Pipeline` containing the preprocessing steps,
        'drop' for dropping the columns,
or 'passthrough' to return the unencoded columns.
high_card_cat_transformer: Transformer or str or None, default=None
Transformer used on categorical/string features with high cardinality
(threshold is defined by `cardinality_threshold`).
Default value None is converted to `GapEncoder(n_components=30)`.
Can either be a transformer object instance (e.g. `GapEncoder()`),
a `Pipeline` containing the preprocessing steps,
        'drop' for dropping the columns,
or 'passthrough' to return the unencoded columns.
numerical_transformer: Transformer or str or None, default=None
Transformer used on numerical features.
Can either be a transformer object instance (e.g. `StandardScaler()`),
a `Pipeline` containing the preprocessing steps,
None to apply `remainder`, 'drop' for dropping the columns,
or 'passthrough' to return the unencoded columns.
datetime_transformer: Transformer or str or None, default=None
Transformer used on datetime features.
Default value None is converted to `DatetimeEncoder()`.
Can either be a transformer object instance (e.g. `DatetimeEncoder()`),
a `Pipeline` containing the preprocessing steps,
        'drop' for dropping the columns,
or 'passthrough' to return the unencoded columns.
auto_cast: bool, default=True
If set to `True`, will try to convert each column to the best possible
data type (dtype).
impute_missing: str, default='auto'
When to impute missing values in string columns.
        'auto' will impute missing values when it is considered appropriate
        (i.e. the chosen encoder does not support missing values, and/or
        specific versions of pandas, numpy and scikit-learn are in use).
'force' will impute all missing values.
'skip' will not impute at all.
When imputed, missing values are replaced by the string 'missing'.
See also attribute `imputed_columns_`.
    remainder: {'drop', 'passthrough'} or estimator, default='passthrough'
        By default (``'passthrough'``), all remaining columns that were not
        specified in `transformers` will be automatically passed through.
        This subset of columns is concatenated with the output of the
        transformers.
        By specifying ``remainder='drop'``, only the specified columns in
        `transformers` are transformed and combined in the output, and the
        non-specified columns are dropped.
By setting ``remainder`` to be an estimator, the remaining
non-specified columns will use the ``remainder`` estimator. The
estimator must support :term:`fit` and :term:`transform`.
Note that using this feature requires that the DataFrame columns
input at :term:`fit` and :term:`transform` have identical order.
sparse_threshold: float, default=0.3
If the output of the different transformers contains sparse matrices,
these will be stacked as a sparse matrix if the overall density is
lower than this value. Use sparse_threshold=0 to always return dense.
When the transformed output consists of all dense data, the stacked result
will be dense, and this keyword will be ignored.
    n_jobs: int, default=None
Number of jobs to run in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors.
    transformer_weights: dict, default=None
Multiplicative weights for features per transformer. The output of the
transformer is multiplied by these weights. Keys are transformer names,
values the weights.
    verbose: bool, default=False
        If True, the time elapsed while fitting each transformer will be
        printed as it is completed.
Attributes
----------
    transformers_: List[Tuple[str, Union[str, BaseEstimator], Union[str, int]]]
        The final distribution of columns.
        List of 3-tuples, each containing
        (1) the name of the category,
        (2) the encoder/transformer instance that will be applied,
        or "passthrough" or "drop",
        (3) the list of column names or indices.
    columns_: List[Union[str, int]]
        The column names of the fitted array.
types_: Dict[str, type]
A mapping of inferred types per column.
Key is the column name, value is the inferred dtype.
imputed_columns_: List[str]
The list of columns in which we imputed the missing values.
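
    Examples
    --------
    A minimal usage sketch. The dataframe below is made up for the
    example (it is not a real dataset), and the exact output depends on
    the encoders assigned to each column:

    >>> import pandas as pd
    >>> from dirty_cat import SuperVectorizer
    >>> X = pd.DataFrame({
    ...     'city': ['Paris', 'London', 'Paris'],
    ...     'temperature': [21.0, 18.5, 19.2],
    ... })
    >>> sv = SuperVectorizer()
    >>> X_enc = sv.fit_transform(X)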
"""
# Override required parameters
_required_parameters = []
OptionalEstimator = Optional[Union[BaseEstimator, str]]
def __init__(self, *,
cardinality_threshold: int = 40,
low_card_cat_transformer: Optional[Union[BaseEstimator, str]] = None,
high_card_cat_transformer: Optional[Union[BaseEstimator, str]] = None,
numerical_transformer: Optional[Union[BaseEstimator, str]] = None,
datetime_transformer: Optional[Union[BaseEstimator, str]] = None,
auto_cast: bool = True,
impute_missing: str = 'auto',
# Following parameters are inherited from ColumnTransformer
remainder: str = 'passthrough',
sparse_threshold: float = 0.3,
                 n_jobs: Optional[int] = None,
transformer_weights=None,
verbose: bool = False,
):
super().__init__(transformers=[])
self.cardinality_threshold = cardinality_threshold
self.low_card_cat_transformer = low_card_cat_transformer
self.high_card_cat_transformer = high_card_cat_transformer
self.numerical_transformer = numerical_transformer
self.datetime_transformer = datetime_transformer
self.auto_cast = auto_cast
self.impute_missing = impute_missing
self.remainder = remainder
self.sparse_threshold = sparse_threshold
self.n_jobs = n_jobs
self.transformer_weights = transformer_weights
self.verbose = verbose
def _more_tags(self):
"""
Used internally by sklearn to ease the estimator checks.
"""
return {"allow_nan": [True]}
    def _clone_transformers(self):
        # Strings such as 'drop' or 'passthrough' cannot be cloned and are
        # forwarded as-is; estimator instances are cloned so the objects
        # passed by the user are left untouched.
        if self.low_card_cat_transformer is None:
            self.low_card_cat_transformer_ = OneHotEncoder()
        elif isinstance(self.low_card_cat_transformer, str):
            self.low_card_cat_transformer_ = self.low_card_cat_transformer
        else:
            self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer)
        if self.high_card_cat_transformer is None:
            self.high_card_cat_transformer_ = GapEncoder(n_components=30)
        elif isinstance(self.high_card_cat_transformer, str):
            self.high_card_cat_transformer_ = self.high_card_cat_transformer
        else:
            self.high_card_cat_transformer_ = clone(self.high_card_cat_transformer)
        if self.datetime_transformer is None:
            self.datetime_transformer_ = DatetimeEncoder()
        elif isinstance(self.datetime_transformer, str):
            self.datetime_transformer_ = self.datetime_transformer
        else:
            self.datetime_transformer_ = clone(self.datetime_transformer)
        # TODO: check that the provided transformers are valid.
@staticmethod
def _auto_cast(X: pd.DataFrame) -> pd.DataFrame:
"""
Takes a dataframe and tries to convert its columns to the best
possible data type.
Parameters
----------
X : {dataframe} of shape (n_samples, n_features)
The data to be transformed.
Returns
-------
pd.DataFrame
            The same pandas DataFrame, with its columns cast to the best
            possible data type.
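
        Examples
        --------
        A doctest-style sketch of the expected behaviour (the column name
        `n` is illustrative):

        >>> df = pd.DataFrame({'n': ['1', '2', 'NA']})
        >>> SuperVectorizer._auto_cast(df)['n'].dtype
        dtype('float64')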
"""
from pandas.core.dtypes.base import ExtensionDtype
# Handle missing values
for col in X.columns:
contains_missing: bool = _has_missing_values(X[col])
            # Convert pandas' NaN value (pd.NA) to numpy's NaN value (np.nan)
            # because the former tends to raise all kinds of issues when
            # dealing with scikit-learn (as of version 0.24).
if contains_missing:
# Some numerical dtypes like Int64 or Float64 only support
# pd.NA so they must be converted to np.float64 before.
if pd.api.types.is_numeric_dtype(X[col]):
X[col] = X[col].astype(np.float64)
                X[col] = X[col].fillna(value=np.nan)
        # Taken from pandas.io.parsers (version 1.1.4)
        STR_NA_VALUES = [
            'null', '', '1.#QNAN', '#NA', 'nan', '#N/A N/A', '-1.#QNAN',
            '<NA>', '-1.#IND', '-nan', 'n/a', '-NaN', '1.#IND', 'NULL',
            'NA', 'N/A', '#N/A', 'NaN',
        ]
        X = X.replace(STR_NA_VALUES + [None, "?", "..."], np.nan)
X = X.replace(r'^\s+$', np.nan, regex=True) # replace whitespace only
# Convert to best possible data type
for col in X.columns:
if not pd.api.types.is_datetime64_any_dtype(X[col]): # we don't want to cast datetime64
                try:
                    X[col] = pd.to_numeric(X[col], errors='raise')
                except (ValueError, TypeError):
                    # Only try to convert to datetime if the column isn't numeric.
                    try:
                        X[col] = pd.to_datetime(X[col], errors='raise',
                                                infer_datetime_format=True)
                    except (ValueError, TypeError):
                        pass
# Cast pandas dtypes to numpy dtypes
# for earlier versions of sklearn
if issubclass(X[col].dtype.__class__, ExtensionDtype):
try:
X[col] = X[col].astype(X[col].dtype.type, errors='ignore')
except (TypeError, ValueError):
pass
return X
    def get_feature_names_out(self, input_features=None) -> List[str]:
"""
Returns clean feature names with format
"<column_name>_<value>" if encoded by OneHotEncoder or alike,
e.g. "job_title_Police officer",
or "<column_name>" if not encoded.
"""
        # Pick the accessor matching the installed scikit-learn version.
        if Version(sklearn_version) < Version('0.23'):
            try:
                ct_feature_names = super().get_feature_names()
            except NotImplementedError:
                raise NotImplementedError(
                    'Prior to sklearn 0.23, get_feature_names with '
                    '"passthrough" is unsupported. To use the method, '
                    'either make sure there is no "passthrough" in the '
                    'transformers, or update your copy of scikit-learn.'
                )
        elif Version(sklearn_version) < Version('1.0'):
            ct_feature_names = super().get_feature_names()
        else:
            ct_feature_names = super().get_feature_names_out()
all_trans_feature_names = []
for name, trans, cols, _ in self._iter(fitted=True):
if isinstance(trans, str):
if trans == 'drop':
continue
elif trans == 'passthrough':
if all(isinstance(col, int) for col in cols):
cols = [self.columns_[i] for i in cols]
all_trans_feature_names.extend(cols)
continue
            # Use the feature-name accessor matching the installed
            # scikit-learn version.
            if Version(sklearn_version) < Version('1.0'):
                accessor = 'get_feature_names'
            else:
                accessor = 'get_feature_names_out'
            if not hasattr(trans, accessor):
                all_trans_feature_names.extend(cols)
            else:
                trans_feature_names = getattr(trans, accessor)(cols)
                all_trans_feature_names.extend(trans_feature_names)
if len(ct_feature_names) != len(all_trans_feature_names):
            warn('Could not extract clean feature names; returning defaults.')
return ct_feature_names
return all_trans_feature_names
    def get_feature_names(self) -> List[str]:
"""
Ensures compatibility with sklearn < 1.0.
Use `get_feature_names_out` instead.
"""
        if Version(sklearn_version) >= Version('1.0'):
warn(
"Following the changes in scikit-learn 1.0, "
"get_feature_names is deprecated. "
"Use get_feature_names_out instead.",
DeprecationWarning,
stacklevel=2,
)
return self.get_feature_names_out()