# Source code for mit_d3m.dataset

# -*- coding: utf-8 -*-

import json
import logging
import os
import re
import warnings
from urllib.parse import urlparse

import networkx as nx
import pandas as pd

logger = logging.getLogger(__name__)

# Pre-compiled pattern matching the uppercase characters that mark word
# boundaries in camelCase identifiers.
RE_PYTHONIZE = re.compile(r'[A-Z]')


def pythonize(name):
    """Convert a camelCase ``name`` into its snake_case equivalent.

    Each uppercase letter is prefixed with an underscore and the whole
    string is lowercased; a single leading underscore produced by an
    initial capital is stripped (``'TaskType'`` -> ``'task_type'``).

    Args:
        name (str): camelCase identifier.

    Returns:
        str: snake_case version of ``name``.
    """
    # Use the module-level compiled pattern instead of re-compiling
    # '[A-Z]' on every call.
    pythonized = RE_PYTHONIZE.sub(r'_\g<0>', name).lower()
    if pythonized.startswith('_'):
        # Drop the single underscore introduced by a leading capital.
        pythonized = pythonized[1:]

    return pythonized
# Schema versions this module was written against; loading a document with
# a different version only triggers a warning, not an error.
DATASET_SCHEMA_VERSION = '3.0'
PROBLEM_SCHEMA_VERSION = '3.0'
class D3MDataset:
    """Wrapper around a D3M dataset: its home directory, the parsed
    ``datasetDoc.json`` schema and the path of ``learningData.csv``."""

    dsHome = None            # root directory of the dataset
    dsDoc = None             # parsed contents of datasetDoc.json
    learningDataFile = None  # full path to learningData.csv

    def _get_learning_data_path(self):
        """Return the path of learningData.csv in this dataset.

        Raises:
            RuntimeError: if no resource path contains learningData.csv.
        """
        for res in self.dsDoc['dataResources']:
            resPath = res['resPath']
            if 'learningData.csv' in resPath:
                return os.path.join(self.dsHome, resPath)

        # the loop finished without finding the learningData file
        raise RuntimeError('could not find learningData file in the dataset')

    def __init__(self, dataset):
        """Load the dataset pointed to by ``dataset``.

        Args:
            dataset: path or ``file://`` URI of either the dataset home
                directory or the ``datasetDoc.json`` file itself.

        Raises:
            FileNotFoundError: if ``datasetDoc.json`` does not exist.
        """
        logger.info("Loading dataset: %s", dataset)

        # handle uris: keep only the filesystem path
        dataset = urlparse(dataset).path

        # locate datasetDoc.json and the dataset home directory
        if os.path.isdir(dataset):
            self.dsHome = dataset
            _dsDoc = os.path.join(self.dsHome, 'datasetDoc.json')
        else:
            self.dsHome = os.path.dirname(dataset)
            _dsDoc = dataset

        if not os.path.exists(_dsDoc):
            logger.critical('Error: Expected to find datasetDoc.json at path %s', _dsDoc)
            # include the missing path in the exception for easier debugging
            raise FileNotFoundError(_dsDoc)

        with open(_dsDoc, 'r') as f:
            self.dsDoc = json.load(f)

        # make sure the versions line up
        if self.get_dataset_schema_version() != DATASET_SCHEMA_VERSION:
            warnings.warn("the datasetSchemaVersions in the API and datasetDoc do not match!")

        # locate the special learningData file
        self.learningDataFile = self._get_learning_data_path()

    def get_datasetID(self):
        """Get the datasetID from datasetDoc."""
        return self.dsDoc['about']['datasetID']

    def get_dataset_schema_version(self):
        """Get the dataset schema version that was used to create this dataset."""
        return self.dsDoc['about']['datasetSchemaVersion']

    def get_learning_data(self):
        """Get the contents of learningData.csv as a DataFrame indexed by d3mIndex."""
        return pd.read_csv(self.learningDataFile, index_col='d3mIndex')

    def _get_learning_data_resource(self):
        """Return the resource entry that describes learningData.csv.

        Raises:
            RuntimeError: if no table resource points at learningData.csv.
        """
        for res in self.dsDoc['dataResources']:
            # BUG FIX: this used to raise as soon as it saw a table resource
            # whose path was not learningData.csv, breaking datasets with
            # more than one table; now it keeps searching.
            if res['resType'] == 'table' and 'learningData.csv' in res['resPath']:
                return res

        raise RuntimeError('could not find learningData resource')

    def get_learning_data_columns(self):
        """Return the column specifications of the learningData table."""
        return self._get_learning_data_resource()['columns']

    def get_resource_types(self):
        """Return the resType of every resource, in document order."""
        return [dr['resType'] for dr in self.dsDoc['dataResources']]

    def get_data_modality(self):
        """Detect the data modality based on the resource_types.

        resource_types == ['table'] => 'single_table'
        resource_types == ['something_else'...] => 'something_else' # this is not likely
        resource_types == ['table', 'table'...] => 'multi_table'
        resource_types == ['table', 'something_else'...] => 'something_else'
        """
        resource_types = self.get_resource_types()
        first_type = resource_types[0]
        if first_type != 'table':
            return first_type

        elif len(resource_types) == 1:
            return 'single_table'

        else:
            second_type = resource_types[1]
            if second_type == 'table':
                return 'multi_table'

            return second_type

    def _get_collection_path(self, resource_type):
        """Return the directory of the collection resource of the given type.

        Shared implementation behind ``get_image_path`` and ``get_text_path``.

        Raises:
            RuntimeError: if no collection resource of that type exists.
        """
        for res in self.dsDoc['dataResources']:
            # use .get: not every resource declares the isCollection key
            if res['resType'] == resource_type and res.get('isCollection'):
                return os.path.join(self.dsHome, res['resPath'])

        raise RuntimeError(
            'could not find {} directory in the dataset'.format(resource_type))

    def get_image_path(self):
        """Return the path of the directory containing this dataset's images.

        Raises:
            RuntimeError: if the dataset has no image collection.
        """
        return self._get_collection_path('image')

    def get_graph_resources(self):
        """Return the resources whose resType is graph."""
        return self._get_resources_by_type('graph')

    def get_graphs_as_nx(self):
        """Load every graph resource as a networkx graph, keyed by resID."""
        graphs = {}
        # todo allow more than one graph resource
        for g in self.get_graph_resources():
            graph_path = os.path.join(self.dsHome, g['resPath'])
            try:
                graphs[g['resID']] = nx.read_gml(graph_path)
            except nx.exception.NetworkXError:
                # some GML files have no label attribute; fall back to node ids
                graphs[g['resID']] = nx.read_gml(graph_path, label='id')

        return graphs

    def _get_resources_by_type(self, resource_type):
        """Return the list of resources that are of the indicated type."""
        return [
            res
            for res in self.dsDoc['dataResources']
            if res['resType'] == resource_type
        ]

    def get_text_path(self):
        """Return the path of the directory containing this dataset's texts.

        Raises:
            RuntimeError: if the dataset has no text collection.
        """
        return self._get_collection_path('text')
class D3MProblem:
    """Wrapper around a D3M problem: its home directory and the parsed
    ``problemDoc.json`` schema."""

    prHome = None     # root directory of the problem (None if built from a dict)
    prDoc = None      # parsed contents of problemDoc.json
    splitsFile = None

    def __init__(self, problem):
        """Load the problem from a directory path or an already parsed dict.

        Args:
            problem: either the parsed contents of ``problemDoc.json``
                (a dict) or the path of the directory that contains it.

        Raises:
            FileNotFoundError: if ``problemDoc.json`` does not exist.
        """
        if isinstance(problem, dict):
            self.prDoc = problem
        else:
            self.prHome = problem

            # read the schema in prHome
            _prDoc = os.path.join(self.prHome, 'problemDoc.json')
            if not os.path.exists(_prDoc):
                logger.critical('Error: Expected to find problemDoc.json at path %s', _prDoc)
                # include the missing path in the exception for easier debugging
                raise FileNotFoundError(_prDoc)

            with open(_prDoc, 'r') as f:
                self.prDoc = json.load(f)

        # make sure the versions line up
        if self.get_problem_schema_version() != PROBLEM_SCHEMA_VERSION:
            # BUG FIX: this message previously said "datasetDoc" by mistake
            warnings.warn("the problemSchemaVersions in the API and problemDoc do not match!")

    def get_task_type(self):
        """Return the taskType declared in problemDoc, or '' if absent."""
        return self.prDoc["about"].get("taskType", "")

    def get_task_subtype(self):
        """Return the taskSubType declared in problemDoc, or '' if absent."""
        return self.prDoc["about"].get("taskSubType", "")

    def get_problem_id(self):
        """Get the problemID from problemDoc."""
        return self.prDoc['about']['problemID']

    def get_problem_schema_version(self):
        """Get the problem schema version that was used to create this dataset."""
        return self.prDoc['about']['problemSchemaVersion']

    def get_performance_metrics(self):
        """Return the list of performanceMetrics requested by the problem."""
        return self.prDoc['inputs']['performanceMetrics']

    def get_target_column_names(self):
        """Return the colName of every target of the first data input."""
        targets = self.prDoc['inputs']['data'][0]['targets']
        return [target['colName'] for target in targets]
class D3MDS:
    """Convenience wrapper that pairs a D3MDataset with its D3MProblem and
    exposes the learning data ready for machine learning."""

    dataset = None
    problem = None

    def __init__(self, dataset, problem):
        """Store the dataset/problem pair, loading each from disk when a
        path (instead of an already built object) is given."""
        if not isinstance(dataset, D3MDataset):
            dataset = D3MDataset(dataset)

        if not isinstance(problem, D3MProblem):
            problem = D3MProblem(problem)

        self.dataset = dataset
        self.problem = problem

        # convenience shortcuts into the underlying objects
        self.dataset_doc = dataset.dsDoc
        self.problem_doc = problem.prDoc
        self.dataset_root = dataset.dsHome
        self.dataset_id = dataset.get_datasetID()
        self.problem_id = problem.get_problem_id()
        self.target_column = self.problem.get_target_column_names()[0]
        self.targets = self.problem.get_target_column_names()

    def get_data(self):
        """Return the learning data split into features ``X`` and target(s) ``y``.

        A single target is returned as a Series, multiple targets as a
        DataFrame; if no target column is present (e.g. a test split),
        ``y`` is an empty DataFrame with the same index as ``X``.
        """
        features = self.dataset.get_learning_data()
        try:
            target_names = self.targets
            if len(target_names) == 1:
                labels = features[target_names[0]]
            else:
                labels = features[target_names]

            features = features.drop(target_names, axis=1, errors='ignore')

        except KeyError:
            # the targets are not in the table at all
            labels = pd.DataFrame(index=features.index)

        return features, labels

    def get_columns(self):
        """Column specifications of the learning data table."""
        return self.dataset.get_learning_data_columns()

    def get_resources_dir(self, data_modality):
        """Directory holding the image/text resources; None for other modalities."""
        finders = {
            'image': self.dataset.get_image_path,
            'text': self.dataset.get_text_path,
        }
        finder = finders.get(data_modality)
        if finder is not None:
            return finder()

    def load_graphs(self):
        """All graph resources loaded as networkx graphs."""
        return self.dataset.get_graphs_as_nx()

    def get_data_modality(self):
        """Data modality detected from the dataset resources."""
        return self.dataset.get_data_modality()

    def get_problem_id(self):
        """Identifier of the problem."""
        return self.problem.get_problem_id()

    def get_task_type(self):
        """Task type of the problem, pythonized to snake_case."""
        return pythonize(self.problem.get_task_type())

    def get_task_subtype(self):
        """Task subtype of the problem, pythonized to snake_case."""
        return pythonize(self.problem.get_task_subtype())

    def get_metric(self):
        """Name of the first performance metric requested by the problem."""
        return self.problem.get_performance_metrics()[0]['metric']