Source code for cardea.featurization.featurization

import featuretools as ft


class Featurization:
    """A class that generates a feature matrix from its attributes.

    The class holds the default aggregation and transform primitive names and
    forwards all feature-generation options to ``featuretools.dfs``.
    """

    __name__ = 'Featurization'

    # Default aggregation primitives handed to ``ft.dfs``.
    AGG_PRIMITIVES = [
        "sum",
        "std",
        "max",
        "skew",
        "min",
        "mean",
        "count",
        "percent_true",
        "num_unique",
        "mode",
    ]

    # Default transform primitives handed to ``ft.dfs``.
    TRANS_PRIMITIVES = ["day", "month", "year", "weekday", "is_weekend"]

    def generate_feature_matrix(self, es, target, label_times, instance_ids=None,
                                agg_primitives=AGG_PRIMITIVES,
                                trans_primitives=TRANS_PRIMITIVES, max_depth=2,
                                ignore_entities=None, ignore_variables=None,
                                seed_features=None, drop_contains=None,
                                drop_exact=None, max_features=-1,
                                training_window=None, n_jobs=1, verbose=False,
                                include_cutoff_time=True, encode=False):
        """Calculates a feature matrix and features given in Featurization object.

        Args:
            es (featuretools.EntitySet):
                An already initialized entityset.
            target (str):
                Name of the entity (entity id) on which to make predictions.
            label_times (pandas.DataFrame):
                A data frame that specifies the times at which to calculate the
                features for each instance. This data frame contains three columns
                ``instance_id``, ``time``, ``label``. The ``instance_id`` specifies
                the instances for which to calculate features over. The ``time``
                column specifies the cutoff time for each instance. Data before the
                cutoff time will be used for calculating the feature matrix. The
                ``label`` column specifies the ground truth label (value we want to
                predict) for each instance.
            instance_ids (list):
                List of instances on which to calculate features.
            agg_primitives (list):
                List of Aggregation Feature types to apply. Defaults to
                ``AGG_PRIMITIVES``.
            trans_primitives (list):
                List of Transform Feature functions to apply. Defaults to
                ``TRANS_PRIMITIVES``.
            max_depth (int):
                Maximum allowed depth of features.
            ignore_entities (list):
                List of entities to blacklist when creating features.
            ignore_variables (dict):
                Specific variables within each entity to blacklist when creating
                features.
            seed_features (list):
                List of manually defined features to use.
            drop_contains (list):
                Drop features that contain these strings in name.
            drop_exact (list):
                Drop features that exactly match these strings in name.
            max_features (int):
                Cap the number of generated features to this number. If -1, no
                limit.
            training_window (ft.Timedelta or str):
                Window defining how much time before the cutoff time data can be
                used when calculating features. If ``None``, all data before cutoff
                time is used. Defaults to ``None``. Month and year units are not
                relative when Pandas Timedeltas are used. Relative units should be
                passed as a Featuretools Timedelta or a string.
            n_jobs (int):
                Number of parallel processes to use when calculating feature matrix.
            verbose (bool):
                An indicator of verbose option.
            include_cutoff_time (bool):
                Include data at cutoff times in feature calculations. Defaults to
                ``True``.
            encode (bool):
                Whether or not to encode categorical into one-hot features.

        Returns:
            pandas.DataFrame, list:
                * The generated feature matrix.
                * List of feature definitions in the feature matrix.

            When ``encode`` is true, the result of ``ft.encode_features`` applied
            to that pair is returned instead.
        """
        # All options are forwarded unchanged; ``target`` and ``label_times`` map
        # onto the ``target_entity`` and ``cutoff_time`` keywords of ``ft.dfs``.
        feature_matrix, features_defs = ft.dfs(
            entityset=es,
            target_entity=target,
            cutoff_time=label_times,
            instance_ids=instance_ids,
            agg_primitives=agg_primitives,
            trans_primitives=trans_primitives,
            max_depth=max_depth,
            ignore_entities=ignore_entities,
            ignore_variables=ignore_variables,
            seed_features=seed_features,
            drop_contains=drop_contains,
            drop_exact=drop_exact,
            max_features=max_features,
            training_window=training_window,
            n_jobs=n_jobs,
            verbose=verbose,
            include_cutoff_time=include_cutoff_time,
        )

        if encode:
            # Encode categorical values.
            return ft.encode_features(feature_matrix, features_defs)

        return feature_matrix, features_defs