Source code for mlprimitives.adapters.featuretools

# -*- coding: utf-8 -*-

import featuretools as ft
from featuretools.selection import remove_low_information_features


class DFS(object):
    """Configurable wrapper around the ``featuretools`` DFS entry points.

    The constructor only stores its arguments. ``dfs`` builds the feature
    definitions and keeps them in ``self.features``;
    ``calculate_feature_matrix`` then evaluates those stored definitions.
    """

    # Feature definitions produced by ``dfs``; stays ``None`` until it runs.
    features = None

    def __init__(self, encode=False, remove_low_information=False, index=None,
                 time_index=None, target_entity=None, agg_primitives=None,
                 trans_primitives=None, max_depth=2, max_features=-1,
                 training_window=None, n_jobs=1, verbose=False, copy=True):
        """Store every argument, unchanged, as an attribute of the same name."""
        # Post-processing switches applied to the feature list inside ``dfs``.
        self.encode = encode
        self.remove_low_information = remove_low_information

        # Column names used to build the cutoff-time frame.
        self.index = index
        self.time_index = time_index

        # Fallback target entity when the caller supplies entities/entityset.
        self.target_entity = target_entity

        # Values forwarded to the featuretools calls.
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives
        self.max_depth = max_depth
        self.max_features = max_features
        self.training_window = training_window
        self.n_jobs = n_jobs
        self.verbose = verbose

        # Whether ``_get_index`` works on a copy of ``X`` or on ``X`` itself.
        self.copy = copy

    def __repr__(self):
        """Return a multi-line ``DFS(name=value, ...)`` listing of the settings."""
        names = (
            'encode',
            'remove_low_information',
            'index',
            'time_index',
            'target_entity',
            'agg_primitives',
            'trans_primitives',
            'max_depth',
            'max_features',
            'training_window',
            'n_jobs',
            'verbose',
            'copy',
        )
        # Each entry becomes "name={name}"; entries are separated by a comma,
        # a newline and one space.
        placeholders = ",\n ".join("{0}={{{0}}}".format(name) for name in names)
        template = "DFS(" + placeholders + ")"
        return template.format(**self.__dict__)

    def _get_index(self, X):
        """Turn the index of ``X`` into a regular, uniquely named column.

        Returns:
            tuple: ``(frame, name)`` where ``frame`` is ``X`` (or a copy of
            it when ``self.copy`` is truthy) after ``reset_index`` and
            ``name`` is the name given to the former index.
        """
        frame = X.copy() if self.copy else X

        # Start from the current index name and prefix underscores until it
        # no longer clashes with an existing column.
        name = frame.index.name or 'index'
        while name in frame.columns:
            name = '_' + name

        frame.index.name = name
        frame.reset_index(inplace=True)
        return frame, name

    def _get_entityset(self, X, target_entity, entities, relationships):
        """Create an ``ft.EntitySet`` named ``'entityset'``.

        When ``entities`` is ``None`` a single entity ``'X'`` is built from
        ``X`` through ``_get_index``. ``target_entity`` is accepted but not
        used here.
        """
        if entities is None:
            # ``_get_index`` already returns the (frame, index name) pair.
            entities = {'X': self._get_index(X)}

        links = [] if relationships is None else relationships
        return ft.EntitySet('entityset', entities, links)

    def _resolve_entityset(self, X, target_entity, entityset, entities, relationships):
        """Return the entityset to work on, building one if none was given.

        Also calls ``add_last_time_indexes`` on it when a training window is
        configured.
        """
        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        if self.training_window is not None:
            entityset.add_last_time_indexes()

        return entityset

    def _get_cutoff_time(self, X):
        """Return the cutoff-time frame, or ``None`` without a ``time_index``.

        The frame holds the ``self.index`` and ``self.time_index`` columns of
        ``X``, with the latter renamed to ``'time'``.
        """
        if not self.time_index:
            return None

        selected = X[[self.index, self.time_index]]
        return selected.rename(columns={self.time_index: 'time'})

    def dfs(self, X=None, target_entity=None, entityset=None, entities=None,
            relationships=None):
        """Compute the feature definitions and store them in ``self.features``.

        ``ft.dfs`` is called with ``features_only=True``. If ``encode`` or
        ``remove_low_information`` is enabled, the feature matrix is also
        calculated so the stored feature list can be replaced by the one
        returned from ``ft.encode_features`` and/or
        ``remove_low_information_features``. Nothing is returned.
        """
        if entities or entityset:
            target_entity = target_entity or self.target_entity
        else:
            # No entities supplied: the only entity will be the one built
            # from ``X``, which is always called 'X'.
            target_entity = 'X'

        # The entityset must be resolved before the cutoff time is read from
        # ``X``, since building it may reset the index of ``X`` in place.
        entityset = self._resolve_entityset(
            X, target_entity, entityset, entities, relationships)
        cutoff_time = self._get_cutoff_time(X)

        # Keyword arguments shared by both featuretools calls below.
        shared = dict(
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )

        self.features = ft.dfs(
            target_entity=target_entity,
            features_only=True,
            max_depth=self.max_depth,
            max_features=self.max_features,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives,
            **shared
        )

        if not (self.encode or self.remove_low_information):
            return

        matrix = ft.calculate_feature_matrix(self.features, **shared)

        if self.encode:
            matrix, self.features = ft.encode_features(matrix, self.features)

        if self.remove_low_information:
            matrix, self.features = remove_low_information_features(matrix, self.features)

    def calculate_feature_matrix(self, X, target_entity=None, entityset=None,
                                 entities=None, relationships=None):
        """Evaluate the stored ``self.features`` and return the result.

        The entityset is built from ``X``/``entities``/``relationships`` when
        none is passed in, and the cutoff time is taken from ``X`` when a
        ``time_index`` is configured.
        """
        entityset = self._resolve_entityset(
            X, target_entity, entityset, entities, relationships)
        cutoff_time = self._get_cutoff_time(X)

        return ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            training_window=self.training_window,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )
def entity_from_dataframe(entityset_id, entity_id, dataframe, entityset=None, index=None,
                          variable_types=None, make_index=False, time_index=None,
                          secondary_time_index=None, already_sorted=False):
    """Register a copy of ``dataframe`` as an entity and return the entityset.

    A new ``ft.EntitySet(entityset_id)`` is created when ``entityset`` is
    ``None``; otherwise the given entityset is used and ``entityset_id`` is
    ignored. The remaining arguments are forwarded positionally to the
    entityset's own ``entity_from_dataframe`` method.
    """
    target = ft.EntitySet(entityset_id) if entityset is None else entityset

    # The entityset receives a copy, not the caller's dataframe object.
    frame = dataframe.copy()

    target.entity_from_dataframe(
        entity_id,
        frame,
        index,
        variable_types,
        make_index,
        time_index,
        secondary_time_index,
        already_sorted,
    )
    return target
def add_relationship(entityset, parent, parent_column, child, child_column):
    """Add a parent/child relationship to ``entityset`` and return it.

    The two variables are looked up as ``entityset[parent][parent_column]``
    and ``entityset[child][child_column]``, wrapped in an
    ``ft.Relationship`` and passed to ``entityset.add_relationship``.
    """
    link = ft.Relationship(
        entityset[parent][parent_column],
        entityset[child][child_column],
    )
    entityset.add_relationship(link)
    return entityset