datacache-1.1.5/0000755000372000037200000000000013326760342014261 5ustar travistravis00000000000000datacache-1.1.5/datacache/0000755000372000037200000000000013326760342016156 5ustar travistravis00000000000000datacache-1.1.5/datacache/__init__.py0000644000372000037200000000263413326760043020272 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function, division, absolute_import from .download import fetch_file, fetch_and_transform, fetch_csv_dataframe from .database_helpers import ( db_from_dataframe, db_from_dataframes, db_from_dataframes_with_absolute_path, fetch_csv_db, connect_if_correct_version ) from .common import ( ensure_dir, get_data_dir, build_path, clear_cache, build_local_filename ) from .cache import Cache __version__ = '1.1.5' __all__ = [ 'fetch_file', 'fetch_and_transform', 'fetch_csv_dataframe', 'db_from_dataframe', 'db_from_dataframes', 'db_from_dataframes_with_absolute_path', 'fetch_csv_db', 'connect_if_correct_version', 'ensure_dir', 'get_data_dir', 'build_path', 'clear_cache', 'build_local_filename', 'Cache', ] datacache-1.1.5/datacache/cache.py0000644000372000037200000001075513326760043017601 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function, division, absolute_import from os.path import exists, join from os import remove from . import common from . 
import download from .database_helpers import db_from_dataframe class Cache(object): def __init__(self, subdir="datacache"): assert subdir self.subdir = subdir self.cache_directory_path = common.get_data_dir(subdir) # dictionary mapping from (URL, decompress) pair to local paths # TODO: handle decompression separately from download, # so we can use copies of compressed files we've already downloaded self._local_paths = {} def delete_url(self, url): """ Delete local files downloaded from given URL """ # file may exist locally in compressed and decompressed states # delete both for decompress in [False, True]: key = (url, decompress) if key in self._local_paths: path = self._local_paths[key] remove(path) del self._local_paths[key] # possible that file was downloaded via the download module without # using the Cache object, this wouldn't end up in the local_paths # but should still be deleted path = self.local_path( url, decompress=decompress, download=False) if exists(path): remove(path) def delete_all(self): self._local_paths.clear() common.clear_cache(self.cache_directory_path) common.ensure_dir(self.cache_directory_path) def exists(self, url, filename=None, decompress=False): """ Return True if a local file corresponding to these arguments exists. """ return download.file_exists( url, filename=filename, decompress=decompress, subdir=self.subdir) def fetch( self, url, filename=None, decompress=False, force=False, timeout=None, use_wget_if_available=True): """ Return the local path to the downloaded copy of a given URL. Don't download the file again if it's already present, unless `force` is True. """ key = (url, decompress) if not force and key in self._local_paths: path = self._local_paths[key] if exists(path): return path else: del self._local_paths[key] path = download.fetch_file( url, filename=filename, decompress=decompress, subdir=self.subdir, force=force, timeout=timeout, use_wget_if_available=use_wget_if_available) self._local_paths[key] = path return path def local_filename( self, url=None, filename=None, decompress=False): """ What local filename will we use within the cache directory for the given URL/filename/decompress options. """ return common.build_local_filename(url, filename, decompress) def local_path(self, url, filename=None, decompress=False, download=False): """ What will the full local path be if we download the given file? """ if download: return self.fetch(url=url, filename=filename, decompress=decompress) else: filename = self.local_filename(url, filename, decompress) return join(self.cache_directory_path, filename) def db_from_dataframe( self, db_filename, table_name, df, key_column_name=None): return db_from_dataframe( db_filename=db_filename, table_name=table_name, df=df, primary_key=key_column_name, subdir=self.subdir) datacache-1.1.5/datacache/common.py0000644000372000037200000000502013326760043020013 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
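
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): typical use of the Cache
# object defined in cache.py above. The URL and subdirectory names below are
# hypothetical placeholders, not values used by the library itself.
#
#     from datacache import Cache
#
#     cache = Cache(subdir="my_project")
#
#     # download once; later calls reuse the cached copy unless force=True
#     local_path = cache.fetch(
#         "http://example.com/annotations.csv.gz",
#         decompress=True)
#
#     # check whether a cached copy exists for these arguments
#     assert cache.exists("http://example.com/annotations.csv.gz",
#                         decompress=True)
#
#     # where the file lives (or would live) on disk, without downloading
#     print(cache.local_path("http://example.com/annotations.csv.gz",
#                            decompress=True))
#
#     # remove the cached copies associated with this URL
#     cache.delete_url("http://example.com/annotations.csv.gz")
# ---------------------------------------------------------------------------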
from __future__ import print_function, division, absolute_import import hashlib from os import makedirs, environ from os.path import join, exists, split, splitext import re from shutil import rmtree import appdirs def ensure_dir(path): if not exists(path): makedirs(path) def get_data_dir(subdir=None, envkey=None): if envkey and environ.get(envkey): envdir = environ[envkey] if subdir: return join(envdir, subdir) else: return envdir return appdirs.user_cache_dir(subdir if subdir else "datacache") def build_path(filename, subdir=None): data_dir = get_data_dir(subdir) ensure_dir(data_dir) return join(data_dir, filename) def clear_cache(subdir=None): data_dir = get_data_dir(subdir) rmtree(data_dir) def normalize_filename(filename): """ Remove special characters and shorten if name is too long """ # if the url pointed to a directory then just replace all the special chars filename = re.sub("/|\\|;|:|\?|=", "_", filename) if len(filename) > 150: prefix = hashlib.md5(filename).hexdigest() filename = prefix + filename[-140:] return filename def build_local_filename(download_url=None, filename=None, decompress=False): """ Determine which local filename to use based on the file's source URL, an optional desired filename, and whether a compression suffix needs to be removed """ assert download_url or filename, "Either filename or URL must be specified" # if no filename provided, use the original filename on the server if not filename: digest = hashlib.md5(download_url.encode('utf-8')).hexdigest() parts = split(download_url) filename = digest + "." + "_".join(parts) filename = normalize_filename(filename) if decompress: (base, ext) = splitext(filename) if ext in (".gz", ".zip"): filename = base return filename datacache-1.1.5/datacache/database.py0000644000372000037200000001756313326760043020306 0ustar travistravis00000000000000# Copyright (c) 2015-2018 Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function, division, absolute_import import logging import sqlite3 from typechecks import require_integer, require_string, require_iterable_of logger = logging.getLogger(__name__) METADATA_TABLE_NAME = "_datacache_metadata" class Database(object): """ Wrapper object for sqlite3 database which provides helpers for querying and constructing the datacache metadata table, as well as creating and checking for existence of particular table names. Calls to methods other than Database.close() and Database.create() will not commit their changes. 
""" def __init__(self, path): self.path = path self.connection = sqlite3.connect(path) def _commit(self): self.connection.commit() def close(self): """Commit changes and close database connection""" self._commit() self.connection.close() def table_names(self): """Returns names of all tables in the database""" query = "SELECT name FROM sqlite_master WHERE type='table'" cursor = self.connection.execute(query) results = cursor.fetchall() return [result_tuple[0] for result_tuple in results] def has_table(self, table_name): """Does a table named `table_name` exist in the sqlite database?""" table_names = self.table_names() return table_name in table_names def drop_all_tables(self): """Drop all tables in the database""" for table_name in self.table_names(): self.execute_sql("DROP TABLE %s" % table_name) self.connection.commit() def execute_sql(self, sql, commit=False): """Log and then execute a SQL query""" logger.info("Running sqlite query: \"%s\"", sql) self.connection.execute(sql) if commit: self.connection.commit() def has_tables(self, table_names): """Are all of the given table names present in the database?""" return all(self.has_table(table_name) for table_name in table_names) def has_version(self): """Does this database have version information? The absence of version information indicates that this database was either not created by datacache or is incomplete. """ return self.has_table(METADATA_TABLE_NAME) def version(self): """What's the version of this database? Found in metadata attached by datacache when creating this database.""" query = "SELECT version FROM %s" % METADATA_TABLE_NAME cursor = self.connection.execute(query) version = cursor.fetchone() if not version: return 0 else: return int(version[0]) def _finalize_database(self, version): """ Create metadata table for database with version number. Parameters ---------- version : int Tag created database with user-specified version number """ require_integer(version, "version") create_metadata_sql = \ "CREATE TABLE %s (version INT)" % METADATA_TABLE_NAME self.execute_sql(create_metadata_sql) insert_version_sql = \ "INSERT INTO %s VALUES (%s)" % (METADATA_TABLE_NAME, version) self.execute_sql(insert_version_sql) def _create_table(self, table_name, column_types, primary=None, nullable=()): """Creates a sqlite3 table from the given metadata. 
Parameters ---------- column_types : list of (str, str) pairs First element of each tuple is the column name, second element is the sqlite3 type primary : str, optional Which column is the primary key nullable : iterable, optional Names of columns which have null values """ require_string(table_name, "table name") require_iterable_of(column_types, tuple, name="rows") if primary is not None: require_string(primary, "primary") require_iterable_of(nullable, str, name="nullable") column_decls = [] for column_name, column_type in column_types: decl = "%s %s" % (column_name, column_type) if column_name == primary: decl += " UNIQUE PRIMARY KEY" if column_name not in nullable: decl += " NOT NULL" column_decls.append(decl) column_decl_str = ", ".join(column_decls) create_table_sql = \ "CREATE TABLE %s (%s)" % (table_name, column_decl_str) self.execute_sql(create_table_sql) def _fill_table(self, table_name, rows): require_string(table_name, "table_name") require_iterable_of(rows, tuple, "rows") if not self.has_table(table_name): raise ValueError( "Table '%s' does not exist in database" % (table_name,)) if len(rows) == 0: raise ValueError("Rows must be non-empty sequence") first_row = rows[0] n_columns = len(first_row) if not all(len(row) == n_columns for row in rows): raise ValueError("Rows must all have %d values" % n_columns) blank_slots = ", ".join("?" for _ in range(n_columns)) logger.info("Inserting %d rows into table %s", len(rows), table_name) sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots) self.connection.executemany(sql, rows) def create(self, tables, version): """Do the actual work of creating the database, filling its tables with values, creating indices, and setting the datacache version metadata. Parameters ---------- tables : list List of datacache.DatabaseTable objects version : int """ for table in tables: self._create_table( table_name=table.name, column_types=table.column_types, primary=table.primary_key, nullable=table.nullable) self._fill_table(table.name, table.rows) self._create_indices(table.name, table.indices) self._finalize_database(version) self._commit() def _create_index(self, table_name, index_columns): """ Creates an index over multiple columns of a given table. Parameters ---------- table_name : str index_columns : iterable of str Which columns should be indexed """ logger.info( "Creating index on %s (%s)", table_name, ", ".join(index_columns)) index_name = "%s_index_%s" % ( table_name, "_".join(index_columns)) self.connection.execute( "CREATE INDEX IF NOT EXISTS %s ON %s (%s)" % ( index_name, table_name, ", ".join(index_columns))) def _create_indices(self, table_name, indices): """ Create multiple indices (each over multiple columns) on a given table. Parameters ---------- table_name : str indices : iterable of tuples Multiple groups of columns, each of which should be indexed. """ require_string(table_name, "table_name") require_iterable_of(indices, (tuple, list)) for index_column_set in indices: self._create_index(table_name, index_column_set) datacache-1.1.5/datacache/database_helpers.py0000644000372000037200000002064713326760043022025 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function, division, absolute_import from os import remove from os.path import splitext, exists import logging from typechecks import ( require_string, require_integer, require_iterable_of ) from .common import build_path from .download import fetch_csv_dataframe from .database import Database from .database_table import DatabaseTable from .database_types import db_type logger = logging.getLogger(__name__) def connect_if_correct_version(db_path, version): """Return a sqlite3 database connection if the version in the database's metadata matches the version argument. Also implicitly checks for whether the data in this database has been completely filled, since we set the version last. TODO: Make an explicit 'complete' flag to the metadata. """ db = Database(db_path) if db.has_version() and db.version() == version: return db.connection return None def _create_cached_db( db_path, tables, version=1): """ Either create or retrieve sqlite database. Parameters -------- db_path : str Path to sqlite3 database file tables : dict Dictionary mapping table names to datacache.DatabaseTable objects version : int, optional Version acceptable as cached data. Returns sqlite3 connection """ require_string(db_path, "db_path") require_iterable_of(tables, DatabaseTable) require_integer(version, "version") # if the database file doesn't already exist and we encounter an error # later, delete the file before raising an exception delete_on_error = not exists(db_path) # if the database already exists, contains all the table # names and has the right version, then just return it db = Database(db_path) # make sure to delete the database file in case anything goes wrong # to avoid leaving behind an empty DB table_names = [table.name for table in tables] try: if db.has_tables(table_names) and \ db.has_version() and \ db.version() == version: logger.info("Found existing table in database %s", db_path) else: if len(db.table_names()) > 0: logger.info( "Dropping tables from database %s: %s", db_path, ", ".join(db.table_names())) db.drop_all_tables() logger.info( "Creating database %s containing: %s", db_path, ", ".join(table_names)) db.create(tables, version) except: logger.warning( "Failed to create tables %s in database %s", table_names, db_path) db.close() if delete_on_error: remove(db_path) raise return db.connection def build_tables( table_names_to_dataframes, table_names_to_primary_keys={}, table_names_to_indices={}): """ Parameters ---------- table_names_to_dataframes : dict Dictionary mapping each table name to a DataFrame table_names_to_primary_keys : dict Dictionary mapping each table to its primary key table_names_to_indices : dict Dictionary mapping each table to a set of indices Returns list of DatabaseTable objects """ tables = [] for table_name, df in table_names_to_dataframes.items(): table_indices = table_names_to_indices.get(table_name, []) primary_key = table_names_to_primary_keys.get(table_name) table = DatabaseTable.from_dataframe( name=table_name, df=df, indices=table_indices, primary_key=primary_key) tables.append(table) return tables def 
db_from_dataframes_with_absolute_path( db_path, table_names_to_dataframes, table_names_to_primary_keys={}, table_names_to_indices={}, overwrite=False, version=1): """ Create a sqlite3 database from a collection of DataFrame objects Parameters ---------- db_path : str Path to database file to create table_names_to_dataframes : dict Dictionary from table names to DataFrame objects table_names_to_primary_keys : dict, optional Name of primary key column for each table table_names_to_indices : dict, optional Dictionary from table names to list of column name tuples overwrite : bool, optional If the database already exists, overwrite it? version : int, optional """ if overwrite and exists(db_path): remove(db_path) tables = build_tables( table_names_to_dataframes, table_names_to_primary_keys, table_names_to_indices) return _create_cached_db( db_path, tables=tables, version=version) def db_from_dataframes( db_filename, dataframes, primary_keys={}, indices={}, subdir=None, overwrite=False, version=1): """ Create a sqlite3 database from a collection of DataFrame objects Parameters ---------- db_filename : str Name of database file to create dataframes : dict Dictionary from table names to DataFrame objects primary_keys : dict, optional Name of primary key column for each table indices : dict, optional Dictionary from table names to list of column name tuples subdir : str, optional overwrite : bool, optional If the database already exists, overwrite it? version : int, optional """ if not (subdir is None or isinstance(subdir, str)): raise TypeError("Expected subdir to be None or str, got %s : %s" % ( subdir, type(subdir))) db_path = build_path(db_filename, subdir) return db_from_dataframes_with_absolute_path( db_path, table_names_to_dataframes=dataframes, table_names_to_primary_keys=primary_keys, table_names_to_indices=indices, overwrite=overwrite, version=version) def db_from_dataframe( db_filename, table_name, df, primary_key=None, subdir=None, overwrite=False, indices=(), version=1): """ Given a dataframe `df`, turn it into a sqlite3 database. Store values in a table called `table_name`. Returns full path to the sqlite database file. """ return db_from_dataframes( db_filename=db_filename, dataframes={table_name: df}, primary_keys={table_name: primary_key}, indices={table_name: indices}, subdir=subdir, overwrite=overwrite, version=version) def _db_filename_from_dataframe(base_filename, df): """ Generate database filename for a sqlite3 database we're going to fill with the contents of a DataFrame, using the DataFrame's column names and types. """ db_filename = base_filename + ("_nrows%d" % len(df)) for column_name in df.columns: column_db_type = db_type(df[column_name].dtype) column_name = column_name.replace(" ", "_") db_filename += ".%s_%s" % (column_name, column_db_type) return db_filename + ".db" def fetch_csv_db( table_name, download_url, csv_filename=None, db_filename=None, subdir=None, version=1, **pandas_kwargs): """ Download a remote CSV file and create a local sqlite3 database from its contents """ df = fetch_csv_dataframe( download_url=download_url, filename=csv_filename, subdir=subdir, **pandas_kwargs) base_filename = splitext(csv_filename)[0] if db_filename is None: db_filename = _db_filename_from_dataframe(base_filename, df) return db_from_dataframe( db_filename, table_name, df, subdir=subdir, version=version) datacache-1.1.5/datacache/database_table.py0000644000372000037200000000565713326760043021456 0ustar travistravis00000000000000# Copyright (c) 2015-2018. 
Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function, division, absolute_import from .database_types import db_type class DatabaseTable(object): """Converts between a DataFrame and a sqlite3 database table""" def __init__( self, name, column_types, make_rows, indices=[], nullable=set(), primary_key=None): self.name = name self.column_types = column_types self.make_rows = make_rows self.indices = indices self.nullable = nullable self.primary_key = primary_key @property def rows(self): """Delay constructing list of row tuples""" return self.make_rows() @classmethod def from_dataframe(cls, name, df, indices, primary_key=None): """Infer table metadata from a DataFrame""" # ordered list (column_name, column_type) pairs column_types = [] # which columns have nullable values nullable = set() # tag cached database by dataframe's number of rows and columns for column_name in df.columns: values = df[column_name] if values.isnull().any(): nullable.add(column_name) column_db_type = db_type(values.dtype) column_types.append((column_name.replace(" ", "_"), column_db_type)) def make_rows(): return list(tuple(row) for row in df.values) return cls( name=name, column_types=column_types, make_rows=make_rows, indices=indices, nullable=nullable, primary_key=primary_key) @classmethod def from_fasta_dict(cls, name, fasta_dict, key_column, value_column): key_list = list(fasta_dict.keys()) key_set = set(key_list) assert len(key_set) == len(key_list), \ "FASTA file from contains %d non-unique sequence identifiers" % \ (len(key_list) - len(key_set)) column_types = [(key_column, "TEXT"), (value_column, "TEXT")] def make_rows(): return [ (idx, str(record.seq)) for (idx, record) in fasta_dict.items() ] return cls( name=name, column_types=column_types, make_rows=make_rows, primary_key=key_column) datacache-1.1.5/datacache/database_types.py0000644000372000037200000000610013326760043021513 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
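
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this module): building a DatabaseTable
# from a pandas DataFrame and materializing it with the Database wrapper
# defined in database.py above. The table name, column values, and database
# path are hypothetical examples.
#
#     import pandas as pd
#     from datacache.database import Database
#     from datacache.database_table import DatabaseTable
#
#     df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", None]})
#
#     table = DatabaseTable.from_dataframe(
#         name="samples",
#         df=df,
#         indices=[["id"]],
#         primary_key="id")
#
#     db = Database("/tmp/samples.db")
#     db.create(tables=[table], version=1)   # creates, fills, indexes, tags
#     db.close()                             # commits and closes connection
# ---------------------------------------------------------------------------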
"""Convert from Python type names to sqlite3 column types""" from __future__ import print_function, division, absolute_import import numpy as np _dtype_to_db_type_dict = { 'int': 'INT', 'int8': 'INT', 'int16': 'INT', 'int32': 'INT', 'int64': 'INT', 'uint8': 'INT', 'uint16': 'INT', 'uint32': 'INT', 'uint64': 'INT', 'bool': 'INT', 'float': 'FLOAT', 'float32': 'FLOAT', 'float64': 'FLOAT', 'object': 'TEXT', 'object_': 'TEXT', 'string_': 'TEXT', 'str': 'TEXT', } def _lookup_type_name(type_name): if type_name in _dtype_to_db_type_dict: return _dtype_to_db_type_dict[type_name] else: return None def _candidate_type_names(python_type_representation): """Generator which yields possible type names to look up in the conversion dictionary. Parameters ---------- python_type_representation : object Any Python object which represents a type, such as `int`, `dtype('int8')`, `np.int8`, or `"int8"`. """ # if we get a single character code we should normalize to a NumPy type # using np.typeDict, which maps string representations of types to NumPy # type objects if python_type_representation in np.typeDict: python_type_representation = np.typeDict[python_type_representation] yield python_type_representation.__name__ # if we get a dtype object i.e. dtype('int16'), then pull out its name if hasattr(python_type_representation, 'name'): yield python_type_representation.name # convert Python types by adding their type's name if hasattr(python_type_representation, '__name__'): yield python_type_representation.__name__ # for a dtype like dtype('S3') need to access dtype.type.__name__ # to get 'string_' if hasattr(python_type_representation, 'type'): if hasattr(python_type_representation.type, '__name__'): yield python_type_representation.type.__name__ yield str(python_type_representation) def db_type(python_type_representation): """ Converts from any of: (1) Python type (2) NumPy/Pandas dtypes (3) string names of types ...to a sqlite3 type name """ for type_name in _candidate_type_names(python_type_representation): db_type_name = _lookup_type_name(type_name) if db_type_name: return db_type_name raise ValueError("Failed to find sqlite3 column type for %s" % ( python_type_representation)) datacache-1.1.5/datacache/download.py0000644000372000037200000002064713326760043020346 0ustar travistravis00000000000000# Copyright (c) 2015-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import print_function, division, absolute_import import gzip import logging import os import subprocess from shutil import move from tempfile import NamedTemporaryFile import zipfile import requests import pandas as pd from six.moves import urllib from .common import build_path, build_local_filename logger = logging.getLogger(__name__) def _download(download_url, timeout=None): if download_url.startswith("http"): response = requests.get(download_url, timeout=timeout) response.raise_for_status() return response.content else: req = urllib.request.Request(download_url) response = urllib.request.urlopen(req, data=None, timeout=timeout) return response.read() def _download_to_temp_file( download_url, timeout=None, base_name="download", ext="tmp", use_wget_if_available=False): if not download_url: raise ValueError("URL not provided") with NamedTemporaryFile( suffix='.' + ext, prefix=base_name, delete=False) as tmp: tmp_path = tmp.name def download_using_python(): with open(tmp_path, mode="w+b") as tmp_file: tmp_file.write( _download(download_url, timeout=timeout)) if not use_wget_if_available: download_using_python() else: try: # first try using wget to download since this works on Travis # even when FTP otherwise fails wget_command_list = [ "wget", download_url, "-O", tmp_path, "--no-verbose", ] if download_url.startswith("ftp"): wget_command_list.extend(["--passive-ftp"]) if timeout: wget_command_list.extend(["-T", "%s" % timeout]) logger.info("Running: %s" % (" ".join(wget_command_list))) subprocess.call(wget_command_list) except OSError as e: if e.errno == os.errno.ENOENT: # wget not found download_using_python() else: raise return tmp_path def _download_and_decompress_if_necessary( full_path, download_url, timeout=None, use_wget_if_available=False): """ Downloads remote file at `download_url` to local file at `full_path` """ logger.info("Downloading %s to %s", download_url, full_path) filename = os.path.split(full_path)[1] base_name, ext = os.path.splitext(filename) tmp_path = _download_to_temp_file( download_url=download_url, timeout=timeout, base_name=base_name, ext=ext, use_wget_if_available=use_wget_if_available) if download_url.endswith("zip") and not filename.endswith("zip"): logger.info("Decompressing zip into %s...", filename) with zipfile.ZipFile(tmp_path) as z: names = z.namelist() assert len(names) > 0, "Empty zip archive" if filename in names: chosen_filename = filename else: # If zip archive contains multiple files, choose the biggest. biggest_size = 0 chosen_filename = names[0] for info in z.infolist(): if info.file_size > biggest_size: chosen_filename = info.filename biggest_size = info.file_size extract_path = z.extract(chosen_filename) move(extract_path, full_path) os.remove(tmp_path) elif download_url.endswith("gz") and not filename.endswith("gz"): logger.info("Decompressing gzip into %s...", filename) with gzip.GzipFile(tmp_path) as src: contents = src.read() os.remove(tmp_path) with open(full_path, 'wb') as dst: dst.write(contents) elif download_url.endswith(("html", "htm")) and full_path.endswith(".csv"): logger.info("Extracting HTML table into CSV %s...", filename) df = pd.read_html(tmp_path, header=0)[0] df.to_csv(full_path, sep=',', index=False, encoding='utf-8') else: move(tmp_path, full_path) def file_exists( download_url, filename=None, decompress=False, subdir=None): """ Return True if a local file corresponding to these arguments exists. 
""" filename = build_local_filename(download_url, filename, decompress) full_path = build_path(filename, subdir) return os.path.exists(full_path) def fetch_file( download_url, filename=None, decompress=False, subdir=None, force=False, timeout=None, use_wget_if_available=False): """ Download a remote file and store it locally in a cache directory. Don't download it again if it's already present (unless `force` is True.) Parameters ---------- download_url : str Remote URL of file to download. filename : str, optional Local filename, used as cache key. If omitted, then determine the local filename from the URL. decompress : bool, optional By default any file whose remote extension is one of (".zip", ".gzip") and whose local filename lacks this suffix is decompressed. If a local filename wasn't provided but you still want to decompress the stored data then set this option to True. subdir : str, optional Group downloads in a single subdirectory. force : bool, optional By default, a remote file is not downloaded if it's already present. However, with this argument set to True, it will be overwritten. timeout : float, optional Timeout for download in seconds, default is None which uses global timeout. use_wget_if_available: bool, optional If the `wget` command is available, use that for download instead of Python libraries (default True) Returns the full path of the local file. """ filename = build_local_filename(download_url, filename, decompress) full_path = build_path(filename, subdir) if not os.path.exists(full_path) or force: logger.info("Fetching %s from URL %s", filename, download_url) _download_and_decompress_if_necessary( full_path=full_path, download_url=download_url, timeout=timeout, use_wget_if_available=use_wget_if_available) else: logger.info("Cached file %s from URL %s", filename, download_url) return full_path def fetch_and_transform( transformed_filename, transformer, loader, source_filename, source_url, subdir=None): """ Fetch a remote file from `source_url`, save it locally as `source_filename` and then use the `loader` and `transformer` function arguments to turn this saved data into an in-memory object. """ transformed_path = build_path(transformed_filename, subdir) if not os.path.exists(transformed_path): source_path = fetch_file(source_url, source_filename, subdir) logger.info("Generating data file %s from %s", transformed_path, source_path) result = transformer(source_path, transformed_path) else: logger.info("Cached data file: %s", transformed_path) result = loader(transformed_path) assert os.path.exists(transformed_path) return result def fetch_csv_dataframe( download_url, filename=None, subdir=None, **pandas_kwargs): """ Download a remote file from `download_url` and save it locally as `filename`. Load that local file as a CSV into Pandas using extra keyword arguments such as sep='\t'. 
""" path = fetch_file( download_url=download_url, filename=filename, decompress=True, subdir=subdir) return pd.read_csv(path, **pandas_kwargs) datacache-1.1.5/datacache.egg-info/0000755000372000037200000000000013326760342017650 5ustar travistravis00000000000000datacache-1.1.5/datacache.egg-info/PKG-INFO0000644000372000037200000000262413326760342020751 0ustar travistravis00000000000000Metadata-Version: 1.1 Name: datacache Version: 1.1.5 Summary: Helpers for transparently downloading datasets Home-page: https://github.com/openvax/datacache Author: Alex Rubinsteyn Author-email: alex.rubinsteyn@mssm.edu License: http://www.apache.org/licenses/LICENSE-2.0.html Description: DataCache ========= Helpers for transparently downloading datasets API --- - **fetch_file**\ (download_url, filename = *None*, decompress = *False*, subdir = *None*) - **fetch_and_transform**\ (transformed_filename, transformer, loader, source_filename, source_url, subdir = *None*) - **fetch_fasta_dict**\ (download_url, filename = *None*, subdir = *None*) - **fetch_fasta_db**\ (table_name, download_url, fasta_filename = *None*, key_column = *‘id’*, value_column = *‘seq’*, subdir = *None*) - **fetch_csv_db**\ (table_name, download_url, csv_filename = *None*, subdir = *None*, \**pandas_kwargs) Platform: UNKNOWN Classifier: Development Status :: 3 - Alpha Classifier: Environment :: Console Classifier: Operating System :: OS Independent Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: Apache Software License Classifier: Programming Language :: Python Classifier: Topic :: Scientific/Engineering :: Bio-Informatics datacache-1.1.5/datacache.egg-info/SOURCES.txt0000644000372000037200000000105213326760342021532 0ustar travistravis00000000000000README.md setup.py datacache/__init__.py datacache/cache.py datacache/common.py datacache/database.py datacache/database_helpers.py datacache/database_table.py datacache/database_types.py datacache/download.py datacache.egg-info/PKG-INFO datacache.egg-info/SOURCES.txt datacache.egg-info/dependency_links.txt datacache.egg-info/requires.txt datacache.egg-info/top_level.txt test/test_cache_object.py test/test_database_objects.py test/test_database_types.py test/test_db_from_dataframes.py test/test_download.py test/test_fasta_dict.py test/test_names.pydatacache-1.1.5/datacache.egg-info/dependency_links.txt0000644000372000037200000000000113326760342023716 0ustar travistravis00000000000000 datacache-1.1.5/datacache.egg-info/requires.txt0000644000372000037200000000013013326760342022242 0ustar travistravis00000000000000pandas>=0.15.2 appdirs>=1.4.0 progressbar33>=2.4 requests>=2.5.1 typechecks>=0.0.2 mock datacache-1.1.5/datacache.egg-info/top_level.txt0000644000372000037200000000001213326760342022373 0ustar travistravis00000000000000datacache datacache-1.1.5/test/0000755000372000037200000000000013326760342015240 5ustar travistravis00000000000000datacache-1.1.5/test/test_cache_object.py0000644000372000037200000000436513326760043021250 0ustar travistravis00000000000000from os import remove from os.path import exists from mock import patch from nose.tools import eq_ from datacache import Cache CACHE_DIR = "datacache_test" TEST_URL = "http://www.google.com" TEST_FILENAME = "google" def test_cache_object_path(): cache = Cache(CACHE_DIR) assert cache.cache_directory_path.endswith(CACHE_DIR), \ "Expected directory path to end with %s but got %s" % ( CACHE_DIR, cache.cache_directory_path) def test_cache_object_local_filename(): filename = 
Cache(CACHE_DIR).local_filename(filename="test") assert filename.endswith("test") def test_cache_fetch_google(): cache = Cache(CACHE_DIR) path = cache.fetch(TEST_URL, filename=TEST_FILENAME) assert path.endswith(TEST_FILENAME), \ "Expected local file to be named %s but got %s" % ( TEST_FILENAME, path) assert exists(path), "File not found: %s" % path eq_(path, cache.local_path(TEST_URL, filename=TEST_FILENAME)) @patch('datacache.cache.download._download_and_decompress_if_necessary') def test_cache_fetch_force(mock_download): cache = Cache("datacache_test") cache.fetch("http://www.google.com", filename="google", force=True) cache.fetch("http://www.google.com", filename="google", force=True) assert len(mock_download.call_args_list) == 2, \ "Expected two separate calls to _download, given force=True" def test_cache_delete_url(): cache = Cache(CACHE_DIR) path = cache.fetch(TEST_URL, filename=TEST_FILENAME) assert exists(path), "Expected %s to exist after download" % path cache.delete_url(TEST_URL) assert not exists(path), \ "Expected %s to be deleted after call to delete_url" % path def test_cache_missing_file(): """test_cache_missing_file : Files can be deleted from the file system, Cache should be aware that these files no longer exist """ cache = Cache(CACHE_DIR) path = cache.fetch(TEST_URL, filename=TEST_FILENAME) # does the filename exist? assert exists(path) # does the cache know the URL has been downloaded? assert cache.exists(TEST_URL, filename=TEST_FILENAME) remove(path) assert not cache.exists(TEST_URL, filename=TEST_FILENAME), \ "Local file for %s has been deleted from the file system" % TEST_URL datacache-1.1.5/test/test_database_objects.py0000644000372000037200000000276513326760043022136 0ustar travistravis00000000000000""" Test that datacache constructs databases correctly (separately from downloading/caching them) """ import tempfile import datacache from nose.tools import eq_ TABLE_NAME = "test" INT_COL_NAME = "int_col" STR_COL_NAME = "str_col" COL_TYPES = [(INT_COL_NAME, "INT"), (STR_COL_NAME, "STR")] KEY_COLUMN_NAME = "int_col" NULLABLE = {STR_COL_NAME} ROWS = [(1, "darkness"), (2, "light"), (3, None)] INDICES = [["str_col"]] VERSION = 2 def make_table_object(): return datacache.database_table.DatabaseTable( name=TABLE_NAME, column_types=COL_TYPES, make_rows=lambda: ROWS, indices=INDICES, nullable=NULLABLE, primary_key=INT_COL_NAME) def test_database_table_object(): table = make_table_object() eq_(table.name, TABLE_NAME) eq_(table.indices, INDICES) eq_(table.nullable, NULLABLE) eq_(table.rows, ROWS) eq_(table.indices, INDICES) def test_create_db(): with tempfile.NamedTemporaryFile(suffix="test.db") as f: db = datacache.database.Database(f.name) table = make_table_object() db.create(tables=[table], version=VERSION) assert db.has_table(TABLE_NAME) assert db.has_version() assert db.version() == VERSION sql = """ SELECT %s from %s WHERE %s = '%s' """ % (INT_COL_NAME, TABLE_NAME, STR_COL_NAME, "light") cursor = db.connection.execute(sql) int_result_tuple = cursor.fetchone() int_result = int_result_tuple[0] eq_(int_result, 2) datacache-1.1.5/test/test_database_types.py0000644000372000037200000000064513326760043021644 0ustar travistravis00000000000000from nose.tools import eq_ import numpy as np from datacache.database_types import db_type def test_db_types(): for int_type in [ int, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64]: eq_(db_type(int_type), "INT") for float_type in [float, np.float32, np.float64]: eq_(db_type(float), "FLOAT") 
eq_(db_type(str), "TEXT") datacache-1.1.5/test/test_db_from_dataframes.py0000644000372000037200000000251013326760043022444 0ustar travistravis00000000000000from nose.tools import eq_ import pandas as pd from tempfile import NamedTemporaryFile from datacache import db_from_dataframes, db_from_dataframe dfA = pd.DataFrame({"numbers": [1, 2, 3], "strings": ["a", "b", "c"]}) dfB = pd.DataFrame({"wuzzles": ["nuzzle", "ruzzle"]}) def test_database_from_dataframes(): with NamedTemporaryFile(suffix="test.db") as f: db = db_from_dataframes( db_filename=f.name, dataframes={"A": dfA, "B": dfB}, primary_keys={"A": "numbers"}, indices={"A": [("numbers", "strings")]}, subdir="test_datacache") cursor_A = db.execute("SELECT * FROM A") results_A = cursor_A.fetchall() eq_(results_A, [(1, "a"), (2, "b"), (3, "c")]) cursor_B = db.execute("SELECT * FROM B") results_B = cursor_B.fetchall() eq_(results_B, [("nuzzle",), ("ruzzle",)]) def test_database_from_single_dataframe(): with NamedTemporaryFile(suffix="test.db") as f: db = db_from_dataframe( db_filename=f.name, table_name="A", df=dfA, primary_key="numbers", indices=[("numbers", "strings")], subdir="test_datacache") cursor = db.execute("SELECT * FROM A") results = cursor.fetchall() eq_(results, [(1, "a"), (2, "b"), (3, "c")]) datacache-1.1.5/test/test_download.py0000644000372000037200000000327713326760043020467 0ustar travistravis00000000000000# Copyright (c) 2014. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from datacache import fetch_file FASTA_FILENAME = 'Homo_sapiens.GRCh37.75.dna_rm.chromosome.MT.fa' URL = \ 'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna_rm.chromosome.MT.fa.gz' def test_fetch_decompress(): for use_wget_if_available in [True, False]: for timeout in [None, 10**6]: path1 = fetch_file( URL, decompress=True, subdir="datacache", use_wget_if_available=use_wget_if_available, timeout=timeout) assert path1.endswith(FASTA_FILENAME) with open(path1, 'r') as f1: s1 = f1.read() assert "TCAATTTCGTGCCAG" in s1 def test_fetch_subdirs(): path = fetch_file(URL, decompress=True, subdir="datacache") assert path.endswith(FASTA_FILENAME) # if we change the subdir then data should end up in # something like /Users/me/Library/Caches/epitopes_test/ other_path = fetch_file(URL, decompress=True, subdir="datacache_test") assert other_path.endswith(FASTA_FILENAME) assert other_path != path, other_path datacache-1.1.5/test/test_fasta_dict.py0000644000372000037200000000304513326760043020752 0ustar travistravis00000000000000# Copyright (c) 2014. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from datacache import fetch_file import gzip URL = "".join([ 'ftp://ftp.ensembl.org/pub/release-75', '/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75', '.dna_rm.chromosome.MT.fa.gz', ]) def fetch_fasta_dict(path_or_url): path = fetch_file(path_or_url) d = {} value_buffer = [] key = None if path.endswith(".gz") or path.endswith(".gzip"): f = gzip.open(path, "r") else: f = open(path, "r") for line in f.readlines(): if type(line) is bytes: line = line.decode("ascii") if line.startswith(">"): if key is not None: d[key] = "".join(value_buffer) value_buffer = [] key = line.split()[0][1:] else: value_buffer.append(line.strip()) if key and value_buffer: d[key] = "".join(value_buffer) f.close() return d def test_download_fasta_dict(): d = fetch_fasta_dict(URL) assert len(d) > 0 datacache-1.1.5/test/test_names.py0000644000372000037200000000106413326760043017753 0ustar travistravis00000000000000from datacache import build_local_filename def test_url_without_filename(): filename = build_local_filename(download_url="http://www.google.com/") assert filename assert "google" in filename def test_multiple_domains_same_file(): filename_google = build_local_filename( download_url="http://www.google.com/index.html") filename_yahoo = build_local_filename( download_url="http://www.yahoo.com/index.html") assert "index" in filename_google assert "index" in filename_yahoo assert filename_yahoo != filename_googledatacache-1.1.5/README.md0000644000372000037200000000212513326760043015536 0ustar travistravis00000000000000 Build Status Coverage Status PyPI DataCache ========= Helpers for transparently downloading datasets ## API * **fetch_file**(download_url, filename = *None*, decompress = *False*, subdir = *None*) * **fetch_and_transform**(transformed_filename, transformer, loader, source_filename, source_url, subdir = *None*) * **fetch_fasta_dict**(download_url, filename = *None*, subdir = *None*) * **fetch_fasta_db**(table_name, download_url, fasta_filename = *None*, key_column = *'id'*, value_column = *'seq'*, subdir = *None*) * **fetch_csv_db**(table_name, download_url, csv_filename = *None*, subdir = *None*, \*\*pandas_kwargs) datacache-1.1.5/setup.py0000644000372000037200000000462013326760043015773 0ustar travistravis00000000000000# Copyright (c) 2014-2018. Mount Sinai School of Medicine # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
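
# ---------------------------------------------------------------------------
# Illustrative quick-start for the API listed in the README above (not part
# of setup.py). The URLs, filenames, and subdirectory names are hypothetical
# placeholders.
#
#     import datacache
#
#     # download (and gunzip) a remote file into the per-user cache directory
#     path = datacache.fetch_file(
#         "http://example.com/genes.tsv.gz",
#         decompress=True,
#         subdir="my_project")
#
#     # download a CSV and load it straight into a pandas DataFrame
#     df = datacache.fetch_csv_dataframe(
#         "http://example.com/genes.csv",
#         subdir="my_project")
#
#     # turn that DataFrame into a cached sqlite3 database; the return value
#     # is an open sqlite3 connection to the newly created database
#     connection = datacache.db_from_dataframe(
#         db_filename="genes.db",
#         table_name="genes",
#         df=df,
#         subdir="my_project")
# ---------------------------------------------------------------------------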
import os import logging import re from setuptools import setup readme_filename = "README.md" current_directory = os.path.dirname(__file__) readme_path = os.path.join(current_directory, readme_filename) try: with open(readme_path, "r") as f: readme_markdown = f.read() except: logging.warn("Failed to load %s", readme_filename) readme_markdown = "" try: import pypandoc readme_restructured = pypandoc.convert(readme_markdown, to="rst", format="md") except: readme_restructured = readme_markdown logging.warn("Failed to convert %s to reStructuredText", readme_filename) with open('datacache/__init__.py', 'r') as f: version = re.search( r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE).group(1) if __name__ == "__main__": setup( name="datacache", version=version, description="Helpers for transparently downloading datasets", author="Alex Rubinsteyn", author_email="alex.rubinsteyn@mssm.edu", url="https://github.com/openvax/datacache", license="http://www.apache.org/licenses/LICENSE-2.0.html", classifiers=[ "Development Status :: 3 - Alpha", "Environment :: Console", "Operating System :: OS Independent", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Topic :: Scientific/Engineering :: Bio-Informatics", ], install_requires=[ "pandas>=0.15.2", "appdirs>=1.4.0", "progressbar33>=2.4", "requests>=2.5.1", "typechecks>=0.0.2", "mock", ], long_description=readme_restructured, packages=["datacache"], ) datacache-1.1.5/PKG-INFO0000644000372000037200000000262413326760342015362 0ustar travistravis00000000000000Metadata-Version: 1.1 Name: datacache Version: 1.1.5 Summary: Helpers for transparently downloading datasets Home-page: https://github.com/openvax/datacache Author: Alex Rubinsteyn Author-email: alex.rubinsteyn@mssm.edu License: http://www.apache.org/licenses/LICENSE-2.0.html Description: DataCache ========= Helpers for transparently downloading datasets API --- - **fetch_file**\ (download_url, filename = *None*, decompress = *False*, subdir = *None*) - **fetch_and_transform**\ (transformed_filename, transformer, loader, source_filename, source_url, subdir = *None*) - **fetch_fasta_dict**\ (download_url, filename = *None*, subdir = *None*) - **fetch_fasta_db**\ (table_name, download_url, fasta_filename = *None*, key_column = *‘id’*, value_column = *‘seq’*, subdir = *None*) - **fetch_csv_db**\ (table_name, download_url, csv_filename = *None*, subdir = *None*, \**pandas_kwargs) Platform: UNKNOWN Classifier: Development Status :: 3 - Alpha Classifier: Environment :: Console Classifier: Operating System :: OS Independent Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: Apache Software License Classifier: Programming Language :: Python Classifier: Topic :: Scientific/Engineering :: Bio-Informatics datacache-1.1.5/setup.cfg0000644000372000037200000000004613326760342016102 0ustar travistravis00000000000000[egg_info] tag_build = tag_date = 0