Source code for mlprimitives.custom.feature_extraction

# -*- coding: utf-8 -*-

import logging

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

LOGGER = logging.getLogger(__name__)


class OneHotLabelEncoder(object):
    """Combination of LabelEncoder + OneHotEncoder.

    Args:
        name (str or None):
            Name of this feature. If ``None`` is given, the name is taken
            from the training feature column.
        max_labels (int or None):
            Maximum number of columns to generate by feature.
        dropna (bool):
            Whether to drop null values before fitting. Defaults to True.

    >>> df = pd.DataFrame([
    ...     {'a': 'a', 'b': 1, 'c': 1},
    ...     {'a': 'a', 'b': 2, 'c': 2},
    ...     {'a': 'b', 'b': 2, 'c': 1},
    ... ])
    >>> OneHotLabelEncoder().fit_transform(df.a)
       a=a  a=b
    0    1    0
    1    1    0
    2    0    1
    >>> OneHotLabelEncoder(max_labels=1).fit_transform(df.a)
       a=a
    0    1
    1    1
    2    0
    >>> OneHotLabelEncoder(name='a_name').fit_transform(df.a)
       a_name=a  a_name=b
    0         1         0
    1         1         0
    2         0         1
    """

    def __init__(self, name=None, max_labels=None, dropna=True):
        self.name = name
        self.max_labels = max_labels
        self.dropna = dropna

    def fit(self, x):
        if self.dropna:
            x = x.dropna()

        self.dummies = pd.Series(x.value_counts().index).astype(str)
        if self.max_labels:
            self.dummies = self.dummies[:self.max_labels]

    def transform(self, x):
        name = self.name or x.name
        dummies = pd.get_dummies(x.astype(str))
        dummies = dummies.reindex(columns=self.dummies, fill_value=0)
        dummies.columns = ['{}={}'.format(name, c) for c in self.dummies]
        return dummies

    def fit_transform(self, x):
        self.fit(x)
        return self.transform(x)
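
# Usage sketch (illustrative addition, not part of the original module):
# ``transform`` reindexes onto the labels learned during ``fit`` with
# ``fill_value=0``, so categories seen only at transform time are encoded as
# all-zero rows. The ``letter`` Series is hypothetical example data.
#
# >>> encoder = OneHotLabelEncoder()
# >>> encoder.fit(pd.Series(['a', 'a', 'b'], name='letter'))
# >>> encoder.transform(pd.Series(['b', 'c'], name='letter'))
#    letter=a  letter=b
# 0         0         1
# 1         0         0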


class FeatureExtractor(object):
    """Extract features by applying a single-column feature extractor to multiple columns.

    Optionally detect the features on which to apply the feature extractor
    automatically.

    Args:
        copy (bool):
            Whether to make a copy of the input data or modify it in place.
            Defaults to ``True``.
        features (list or str):
            List of features to apply the feature extractor to. If ``'auto'``
            is passed, try to detect the features automatically. Defaults to
            an empty list.
        keep (bool):
            Whether to keep the original features instead of replacing them.
            Defaults to ``False``.
    """

    def __init__(self, copy=True, features=None, keep=False):
        self.copy = copy
        self.features = list() if features is None else features
        self.keep = keep
        self._features = list()

    def _fit(self, x):
        pass

    def _detect_features(self, X):
        pass

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if self.features == 'auto':
            self._features = self._detect_features(X)
        else:
            self._features = self.features

        for feature in self._features:
            self._fit(X[feature])

    def _transform(self, x):
        pass

    def transform(self, X):
        if self._features:
            if not isinstance(X, pd.DataFrame):
                X = pd.DataFrame(X)
            elif self.copy:
                X = X.copy()

        for feature in self._features:
            LOGGER.debug("Extracting feature %s", feature)
            if self.keep:
                x = X[feature]
            else:
                x = X.pop(feature)

            extracted = self._transform(x)
            X = pd.concat([X, extracted], axis=1)

        return X

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
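
# Subclassing sketch (illustrative addition, not part of the original module):
# ``FeatureExtractor`` works as a template class. Subclasses implement the
# single-column hooks ``_detect_features``, ``_fit`` and ``_transform``, while
# the inherited ``fit``/``transform`` handle column iteration, copying and
# popping. ``UppercaseFlagger`` is a hypothetical minimal subclass.
#
# >>> class UppercaseFlagger(FeatureExtractor):
# ...     def _detect_features(self, X):
# ...         return list(X.select_dtypes('object').columns)
# ...
# ...     def _transform(self, x):
# ...         return pd.DataFrame({x.name + '_is_upper': x.str.isupper().astype(int)})
# ...
# >>> UppercaseFlagger(features='auto').fit_transform(pd.DataFrame({'word': ['FOO', 'bar']}))
#    word_is_upper
# 0              1
# 1              0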


class CategoricalEncoder(FeatureExtractor):
    """FeatureExtractor that encodes categorical features using OneHotLabelEncoder.

    When autodetecting features, only features with dtype ``category`` or
    ``object`` are considered.

    Optionally, a ``max_unique_ratio`` can be passed, which allows ignoring
    features that have a high number of unique values, such as primary keys.

    Args:
        max_labels (int or None):
            Maximum number of labels to use by feature. Defaults to ``None``.
        max_unique_ratio (float):
            Maximum ratio of unique values that a feature may have in order
            to be considered categorical; features at or above this ratio are
            ignored. If ``0`` is given, the ratio is ignored. Defaults to ``0``.
        dropna (bool):
            Whether to drop null values before analyzing the features and
            fitting the encoders.

    >>> df = pd.DataFrame([
    ...     {'a': 'a', 'b': 1, 'c': 1},
    ...     {'a': 'a', 'b': 2, 'c': 2},
    ...     {'a': 'b', 'b': 2, 'c': 1},
    ... ])
    >>> df['c'] = df['c'].astype('category')
    >>> ce = CategoricalEncoder(features='auto')
    >>> ce.fit_transform(df)
       b  a=a  a=b  c=1  c=2
    0  1    1    0    1    0
    1  2    1    0    0    1
    2  2    0    1    1    0
    """

    def __init__(self, max_labels=None, max_unique_ratio=0, dropna=True, **kwargs):
        self.max_labels = max_labels
        self.max_unique_ratio = max_unique_ratio
        self.dropna = dropna
        super(CategoricalEncoder, self).__init__(**kwargs)

    def _detect_features(self, X):
        features = list()
        columns = X.select_dtypes(('object', 'category')).columns

        if not self.max_unique_ratio:
            return list(columns)

        for column in columns:
            x = X[column]
            if self.dropna:
                x = x.dropna()

            unique_ratio = len(x.unique()) / len(x)
            if unique_ratio < self.max_unique_ratio:
                features.append(column)

        return features

    def fit(self, X, y=None):
        self.encoders = dict()
        super(CategoricalEncoder, self).fit(X)

    def _fit(self, x):
        encoder = OneHotLabelEncoder(x.name, self.max_labels, self.dropna)
        encoder.fit(x)
        self.encoders[x.name] = encoder

    def _transform(self, x):
        encoder = self.encoders[x.name]
        return encoder.transform(x)
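
# Usage sketch (illustrative addition, not part of the original module): with
# ``max_unique_ratio`` set, ``_detect_features`` skips high-cardinality
# columns such as identifiers. The DataFrame is hypothetical example data.
#
# >>> df = pd.DataFrame({
# ...     'id': ['u1', 'u2', 'u3', 'u4'],          # 4/4 unique -> skipped
# ...     'color': ['red', 'red', 'blue', 'red'],  # 2/4 unique -> encoded
# ... })
# >>> CategoricalEncoder(features='auto', max_unique_ratio=0.9).fit_transform(df)
#    id  color=red  color=blue
# 0  u1          1           0
# 1  u2          1           0
# 2  u3          0           1
# 3  u4          1           0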


class StringVectorizer(FeatureExtractor):
    """FeatureExtractor that encodes text features using a scikit-learn CountVectorizer.

    When autodetecting features, only features with dtype ``object`` are
    considered.

    Optionally, a ``min_words`` can be passed, which allows ignoring features
    that have fewer than the given number of words in all of their occurrences.

    Args:
        copy (bool):
            Whether to make a copy of the input data or modify it in place.
            Defaults to ``True``.
        features (list or str):
            List of features to apply the feature extractor to. If ``'auto'``
            is passed, try to detect the features automatically. Defaults to
            an empty list.
        keep (bool):
            Whether to keep the original features instead of replacing them.
            Defaults to ``False``.
        min_words (int):
            Minimum number of words that a feature needs to have in at least
            one of its occurrences in order to be considered a text column.
        **kwargs:
            Any additional keyword arguments will be passed to the underlying
            CountVectorizer instances.
    """

    def __init__(self, copy=True, features=None, keep=False, min_words=0, **kwargs):
        self.kwargs = kwargs
        self.min_words = min_words
        super(StringVectorizer, self).__init__(copy, features, keep)

    def _detect_features(self, X):
        columns = X.select_dtypes('object').columns
        if not self.min_words:
            return list(columns)

        features = []
        analyzer = CountVectorizer(**self.kwargs).build_analyzer()
        for column in columns:
            try:
                if (X[column].apply(analyzer).str.len() >= self.min_words).any():
                    features.append(column)
            except (ValueError, AttributeError):
                pass

        return features

    def fit(self, X, y=None):
        self.vectorizers = dict()
        super(StringVectorizer, self).fit(X)

    def _fit(self, x):
        vectorizer = CountVectorizer(**self.kwargs)
        vectorizer.fit(x.fillna('').astype(str))
        self.vectorizers[x.name] = vectorizer

    def _transform(self, x):
        vectorizer = self.vectorizers[x.name]
        bow = vectorizer.transform(x.fillna('').astype(str))
        bow_columns = ['{}_{}'.format(x.name, f) for f in vectorizer.get_feature_names()]
        return pd.DataFrame(bow.toarray(), columns=bow_columns, index=x.index)
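
# Usage sketch (illustrative addition, not part of the original module): each
# detected text column is replaced by one count column per vocabulary token,
# named ``<column>_<token>``. The DataFrame is hypothetical example data.
# Note that ``_transform`` calls ``CountVectorizer.get_feature_names``, which
# was removed in scikit-learn 1.2 in favor of ``get_feature_names_out``, so
# this class requires an older scikit-learn.
#
# >>> df = pd.DataFrame({'text': ['foo bar', 'bar baz bar']})
# >>> StringVectorizer(features='auto').fit_transform(df)
#    text_bar  text_baz  text_foo
# 0         1         0         1
# 1         2         1         0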


class DatetimeFeaturizer(FeatureExtractor):
    """Extract year, month, day, weekday and hour features from datetime columns."""

    def _detect_features(self, X):
        return list(X.select_dtypes('datetime').columns)

    def _transform(self, x):
        prefix = x.name + '_'
        features = {
            prefix + 'year': x.dt.year,
            prefix + 'month': x.dt.month,
            prefix + 'day': x.dt.day,
            prefix + 'weekday': x.dt.weekday,
            prefix + 'hour': x.dt.hour,
        }
        return pd.DataFrame(features)
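
# Usage sketch (illustrative addition, not part of the original module): each
# datetime column is replaced by five numeric columns. The DataFrame is
# hypothetical example data.
#
# >>> df = pd.DataFrame({'when': pd.to_datetime(['2020-01-01 10:30', '2020-06-15 23:05'])})
# >>> DatetimeFeaturizer(features='auto').fit_transform(df)
#    when_year  when_month  when_day  when_weekday  when_hour
# 0       2020           1         1             2         10
# 1       2020           6        15             0         23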