Cardea
import os
import xml.etree.ElementTree as ET
from glob import glob

import featuretools as ft
import pandas as pd

path = os.path.dirname(os.path.abspath(__file__))
root = ET.parse(path + '/schema.xml').getroot()


def get_table_properties(name):
    """Returns a tuple containing the datatype of each column, the primary key of the table,
    and the time indices.

    Args:
        name: The name of the table in the schema XML file.

    Returns:
        A tuple with three components: a dictionary with the datatype of each column, the
        primary key of the table, and a list of the columns that are considered time indices
        of the table.
    """

    types = {}
    arr_time = []
    prim_key = 'row_id'

    x = root.find('.//table[@name="' + name + '"]')

    for t in x.findall('column'):
        column = t.get('name')
        a_type = t.get('type')
        d_type = get_type(a_type)

        prim_key = column if 'Primary key' in t.get('remarks') else prim_key

        if a_type == 'timestamp':
            arr_time.append(column)

        types[column.lower()] = d_type

    return types, prim_key, arr_time


def get_table_relationships(name):
    """Returns a list of the relationships in the table.

    Args:
        name: The name of the table in the schema XML file.

    Returns:
        A list of the relationships in the table, each formatted as a dictionary.
    """

    relations = []

    x = root.find('.//table[@name="' + name + '"]')

    for c in x.findall('column/child'):
        target_table = c.get('table')
        target_handle = c.get('column')
        handle = x.find('.//column/child/..').get('name')

        relations.append({'parent': name,
                          'primary_key': handle,
                          'child': target_table,
                          'foreign_key': target_handle})

    return relations


def get_type(x):
    return {
        'int4': float,
        'int2': float,
        'varchar': str,
        'float8': float,
        'text': str
    }.get(x, str)


def load_mimic_data(path=None, subset=None):
    """Returns an entityset loaded with the dataframes in the received path.

    Args:
        path (str): The folder path that contains the data.
        subset (list): List of tables to include.

    Returns:
        featuretools.EntitySet: An entityset with loaded data.
    """

    es = ft.EntitySet(id="mimic")

    relationships = []
    global_tables = []

    files = glob(path + '/*.csv')

    for tag in root.findall('tables/table'):
        table = tag.get('name')
        file = os.path.join(path, table.upper() + '.csv')

        if subset and table not in subset:
            continue

        if file in files:
            # table name
            global_tables.append(table)

            # get table relationships
            relationships = relationships + get_table_relationships(table)

            # get table properties
            prop, key, arr_time = get_table_properties(table)

            # load table into a dataframe
            df = pd.read_csv(file, dtype=prop, date_parser=pd.to_datetime)
            df.columns = [column.lower() for column in df.columns]

            # check if arr_time should be None (no time index)
            arr_time = arr_time[0] if len(arr_time) > 0 else None
            if arr_time and df[arr_time].isnull().all():
                arr_time = None

            # load dataframe into the entityset
            es.entity_from_dataframe(entity_id=table,
                                     dataframe=df,
                                     index=key,
                                     time_index=arr_time)

    for r in relationships:
        if r['parent'] in global_tables and r['child'] in global_tables:
            new_relationship = ft.Relationship(es[r['parent']][r['primary_key']],
                                               es[r['child']][r['foreign_key']])

            es = es.add_relationship(new_relationship)

    return es
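
For reference, below is a minimal usage sketch of load_mimic_data. It assumes a local folder containing MIMIC-III CSV files named after the tables in schema.xml (for example ADMISSIONS.csv); the folder path, the table names passed to subset, the import path, and the follow-up DFS call are illustrative assumptions rather than part of this module.

# Illustrative usage sketch -- the path and table names below are assumptions.
import featuretools as ft

from cardea.data_loader.load_mimic import load_mimic_data  # assumed module path

# Build an EntitySet from the CSV folder, restricted to two tables.
es = load_mimic_data(path='/data/mimic-iii', subset=['admissions', 'patients'])
print(es)  # shows the loaded entities and their relationships

# Optional downstream step: run Deep Feature Synthesis on the entityset
# (featuretools < 1.0 API, matching the entity_from_dataframe call above).
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity='admissions',
                                      max_depth=2)

The subset filter and the relationship wiring happen inside load_mimic_data itself, so the caller only needs a folder of CSV files plus the schema.xml shipped alongside this module.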