Source code for cardea.data_loader.load_mimic

import os
import xml.etree.ElementTree as ET
from glob import glob

import featuretools as ft
import pandas as pd

path = os.path.dirname(os.path.abspath(__file__))
root = ET.parse(path + '/schema.xml').getroot()


def get_table_properties(name):
    """Returns a tuple containing the datatype of each column, the primary key of the table,
        and the time indices.

    Args:
        name: The name of the table in the formal XML file.

    Returns:
        A tuple with three components, a list with the datatypes of each column, the primary key
            of the table, and a list of columns that consider the time indices of the table.
    """

    types = {}
    arr_time = []
    prim_key = 'row_id'

    x = root.find('.//table[@name="' + name + '"]')
    for t in x.findall('column'):

        column = t.get('name')
        a_type = t.get('type')
        d_type = get_type(a_type)
        prim_key = column if 'Primary key' in t.get('remarks') else prim_key

        if a_type == 'timestamp':
            arr_time.append(column)

        types[column.lower()] = d_type

    return types, prim_key, arr_time


def get_table_relationships(name):
    """Returns a list of the relationships in the table.

    Args:
        name: The name of the table in the formal XML file.

    Returns:
        A list of the relationships in the table, formatted as a dictionary.
    """

    relations = []
    x = root.find('.//table[@name="' + name + '"]')

    for c in x.findall('column/child'):
        target_table = c.get('table')
        target_handle = c.get('column')

        handle = x.find('.//column/child/...').get('name')

        relations.append({'parent': name, 'primary_key': handle,
                          'child': target_table, 'foreign_key': target_handle})

    return relations


def get_type(x):
    return {
        'int4': float,
        'int2': float,
        'varchar': str,
        'float8': float,
        'text': str
    }.get(x, str)


[docs]def load_mimic_data(path=None, subset=None): """Returns an entityset loaded with the dataframes in the received path. Args: path (str): The folder path that contains the data. subset (str): List of tables to include. Returns: featuretools.EntitySet: An entityset with loaded data. """ es = ft.EntitySet(id="mimic") relationships = [] global_tables = [] files = glob(path + '/*.csv') for tag in root.findall('tables/table'): table = tag.get('name') file = os.path.join(path, table.upper() + '.csv') if subset and table not in subset: continue if file in files: # table name global_tables.append(table) # get table relationships relationships = relationships + get_table_relationships(table) # get table properties prop, key, arr_time = get_table_properties(table) # load table into a dataframe df = pd.read_csv(file, dtype=prop, date_parser=pd.to_datetime) df.columns = [column.lower() for column in df.columns] # check if arr_time should be None (no time index) arr_time = arr_time[0] if len(arr_time) > 0 else None if arr_time and df[arr_time].isnull().all(): arr_time = None # load dataframe into the entityset es.entity_from_dataframe(entity_id=table, dataframe=df, index=key, time_index=arr_time) for r in relationships: if (r['parent'] in global_tables and r['child'] in global_tables): new_relationship = ft.Relationship(es[r['parent']][r['primary_key']], es[r['child']][r['foreign_key']]) es = es.add_relationship(new_relationship) return es