# Source code for cardea.data_loader.load_mimic

import os
import xml.etree.ElementTree as ET
from glob import glob

import featuretools as ft
import pandas as pd

# Directory that contains this module; the schema file is looked up next to it.
path = os.path.dirname(os.path.abspath(__file__))
# Root element of the parsed schema.xml. It is parsed once, when the module is imported,
# and queried by the functions in this module for table and column definitions.
root = ET.parse(path + '/schema.xml').getroot()


def get_table_properties(name):
    """Returns a tuple containing the datatype of each column, the primary key of the table,
        and the time indices.

    Args:
        name: The name of the table in the formal XML file.

    Returns:
        A tuple with three components: a dictionary that maps each lower-cased column name
            to its datatype (as given by ``get_type``), the name of the primary key column,
            and a list with the names of the columns whose type is ``timestamp``.
            The primary key is the last column whose ``remarks`` contain ``'Primary key'``,
            or ``'row_id'`` when no column is flagged. The primary key and the timestamp
            names keep the casing used in the XML file.
    """

    types = {}
    arr_time = []
    prim_key = 'row_id'

    table = root.find('.//table[@name="' + name + '"]')
    for col in table.findall('column'):
        column = col.get('name')
        a_type = col.get('type')

        # ``get`` returns None when the attribute is absent; treat that as "no remark"
        # instead of failing on ``'Primary key' in None``.
        remarks = col.get('remarks') or ''
        if 'Primary key' in remarks:
            prim_key = column

        if a_type == 'timestamp':
            arr_time.append(column)

        types[column.lower()] = get_type(a_type)

    return types, prim_key, arr_time


def get_table_relationships(name):
    """Returns a list of the relationships in the table.

    Each ``child`` element found under a column of the table describes one relationship in
    which this table is the parent.

    Args:
        name: The name of the table in the formal XML file.

    Returns:
        A list of the relationships in the table, formatted as dictionaries with the keys
            ``parent`` (the table name), ``primary_key`` (the column of this table that
            holds the ``child`` element), ``child`` (the referencing table) and
            ``foreign_key`` (the referencing column). The list follows document order.
    """

    relations = []
    table = root.find('.//table[@name="' + name + '"]')

    for column in table.findall('column'):
        # The key of a relationship is the column that owns the ``child`` element. Looking
        # it up per column (rather than re-querying the table) keeps the pairing correct
        # when more than one column of the table is referenced by other tables.
        handle = column.get('name')

        for child in column.findall('child'):
            relations.append({'parent': name, 'primary_key': handle,
                              'child': child.get('table'),
                              'foreign_key': child.get('column')})

    return relations


def get_type(x):
    """Returns the Python type used to load a column of the given schema type.

    Args:
        x: The type name of the column as written in the XML file.

    Returns:
        ``float`` for ``int4``, ``int2`` and ``float8``; ``str`` for every other value,
            including ``varchar``, ``text`` and unrecognised type names.
    """
    # NOTE(review): integer columns are loaded as float, presumably so that missing
    # values can be represented -- confirm before changing.
    numeric_types = {'int4', 'int2', 'float8'}
    if x in numeric_types:
        return float
    return str


def load_mimic_data(path=None, subset=None):
    """Returns an entityset loaded with the dataframes in the received path.

    Every table declared in the schema is looked up in ``path`` as a file named after the
    upper-cased table name with a ``.csv`` extension. Tables without such a file, or
    excluded by ``subset``, are skipped. A relationship is only added when both of its
    tables were loaded.

    Args:
        path (str):
            The folder path that contains the data.
        subset (list):
            Names of the tables to include, as written in the schema. When empty or
            ``None``, every table with a matching file is loaded.

    Returns:
        featuretools.EntitySet:
            An entityset with loaded data.

    Raises:
        TypeError:
            If ``path`` is ``None``.
    """
    if path is None:
        raise TypeError('path must be the folder that contains the csv files, not None')

    es = ft.EntitySet(id="mimic")

    relationships = []
    loaded_tables = set()
    files = set(glob(os.path.join(path, '*.csv')))

    for tag in root.findall('tables/table'):
        table = tag.get('name')

        if subset and table not in subset:
            continue

        csv_file = os.path.join(path, table.upper() + '.csv')
        if csv_file not in files:
            continue

        loaded_tables.add(table)
        relationships.extend(get_table_relationships(table))

        prop, key, time_columns = get_table_properties(table)

        # The schema dtypes are keyed by lower-cased column name, while the file header
        # may be spelled in any case. Translate the mapping to the header as written so
        # the dtypes are applied regardless of the header casing.
        header = pd.read_csv(csv_file, nrows=0).columns
        dtypes = {column: prop[column.lower()]
                  for column in header if column.lower() in prop}

        # No date parser is passed: without ``parse_dates`` pandas never calls it, so
        # timestamp columns are loaded with the dtype given by the schema mapping.
        df = pd.read_csv(csv_file, dtype=dtypes)
        df.columns = [column.lower() for column in df.columns]

        # Column names are lower-cased above, so the names taken from the schema must
        # be normalised the same way before they are used to address the dataframe.
        key = key.lower()

        # Only the first timestamp column is considered; a column without any value
        # cannot serve as time index.
        time_index = time_columns[0].lower() if time_columns else None
        if time_index and df[time_index].isnull().all():
            time_index = None

        es.entity_from_dataframe(entity_id=table,
                                 dataframe=df,
                                 index=key,
                                 time_index=time_index)

    for r in relationships:
        if r['parent'] in loaded_tables and r['child'] in loaded_tables:
            new_relationship = ft.Relationship(es[r['parent']][r['primary_key'].lower()],
                                               es[r['child']][r['foreign_key'].lower()])

            es = es.add_relationship(new_relationship)

    return es