# Source code for cardea.featurization.featurization

import featuretools as ft


class Featurization():
    """A class that generates a feature matrix from its attributes.

    The class holds the default aggregation and transform primitives and exposes
    a single method, ``generate_feature_matrix``, which forwards its arguments to
    ``featuretools.dfs``.
    """

    __name__ = 'Featurization'

    # Default aggregation primitives handed to ``ft.dfs``.
    AGG_PRIMITIVES = ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true",
                      "num_unique", "mode"]

    # Default transform primitives handed to ``ft.dfs``.
    TRANS_PRIMITIVES = ["day", "month", "year", "weekday", "is_weekend"]

    def generate_feature_matrix(self, es, target, label_times,
                                instance_ids=None, agg_primitives=AGG_PRIMITIVES,
                                trans_primitives=TRANS_PRIMITIVES, max_depth=2,
                                ignore_entities=None, ignore_variables=None, seed_features=None,
                                drop_contains=None, drop_exact=None, max_features=-1,
                                training_window=None, n_jobs=1, verbose=False,
                                include_cutoff_time=True, encode=False):
        """Calculates a feature matrix and features given in Featurization object.

        Args:
            es (featuretools.EntitySet):
                An already initialized entityset.
            target (str):
                Name of the entity (entity id) on which to make predictions.
            label_times (pandas.DataFrame):
                A data frame that specifies the times at which to calculate the features
                for each instance. This data frame contains three columns ``instance_id``,
                ``time``, ``label``. The ``instance_id`` specifies the instances for
                which to calculate features over. The ``time`` column specifies the cutoff
                time for each instance. Data before the cutoff time will be used for
                calculating the feature matrix. The ``label`` column specifies the ground
                truth label (value we want to predict) for each instance.
            instance_ids (list):
                List of instances on which to calculate features.
            agg_primitives (list):
                List of Aggregation Feature types to apply. Defaults to
                ``AGG_PRIMITIVES``.
            trans_primitives (list):
                List of Transform Feature functions to apply. Defaults to
                ``TRANS_PRIMITIVES``.
            max_depth (int):
                Maximum allowed depth of features.
            ignore_entities (list):
                List of entities to blacklist when creating features.
            ignore_variables (dict):
                Dictionary specifying, for each entity, the variables to blacklist
                when creating features.
            seed_features (list):
                List of manually defined features to use.
            drop_contains (list):
                Drop features that contain these strings in their name.
            drop_exact (list):
                Drop features that exactly match these strings in name.
            max_features (int):
                Cap the number of generated features to this number. If -1, no limit.
            training_window (ft.Timedelta or str):
                Window defining how much time before the cutoff time data can be used when
                calculating features. If ``None``, all data before cutoff time is used.
                Defaults to ``None``. Month and year units are not relative when Pandas
                Timedeltas are used. Relative units should be passed as a Featuretools
                Timedelta or a string.
            n_jobs (int):
                Number of parallel processes to use when calculating feature matrix.
            verbose (bool):
                An indicator of verbose option.
            include_cutoff_time (bool):
                Include data at cutoff times in feature calculations. Defaults to ``True``.
            encode (bool):
                Whether or not to encode categorical into one-hot features.

        Returns:
            pandas.DataFrame, list:
                * The generated feature matrix.
                * List of feature definitions in the feature matrix.

            When ``encode`` is true, the value returned is whatever
            ``ft.encode_features`` produces for the matrix and definitions above.
        """

        # ``label_times`` plays the role of the cutoff-time frame for deep feature synthesis.
        feature_matrix, features_defs = ft.dfs(
            entityset=es,
            target_entity=target,
            cutoff_time=label_times,
            instance_ids=instance_ids,
            agg_primitives=agg_primitives,
            trans_primitives=trans_primitives,
            max_depth=max_depth,
            ignore_entities=ignore_entities,
            ignore_variables=ignore_variables,
            seed_features=seed_features,
            drop_contains=drop_contains,
            drop_exact=drop_exact,
            max_features=max_features,
            training_window=training_window,
            n_jobs=n_jobs,
            verbose=verbose,
            include_cutoff_time=include_cutoff_time,
        )

        if encode:
            # encode categorical values
            return ft.encode_features(feature_matrix, features_defs)

        return feature_matrix, features_defs