datacache-1.1.5/datacache/__init__.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
from .download import fetch_file, fetch_and_transform, fetch_csv_dataframe
from .database_helpers import (
db_from_dataframe,
db_from_dataframes,
db_from_dataframes_with_absolute_path,
fetch_csv_db,
connect_if_correct_version
)
from .common import (
ensure_dir,
get_data_dir,
build_path,
clear_cache,
build_local_filename
)
from .cache import Cache
__version__ = '1.1.5'
__all__ = [
'fetch_file',
'fetch_and_transform',
'fetch_csv_dataframe',
'db_from_dataframe',
'db_from_dataframes',
'db_from_dataframes_with_absolute_path',
'fetch_csv_db',
'connect_if_correct_version',
'ensure_dir',
'get_data_dir',
'build_path',
'clear_cache',
'build_local_filename',
'Cache',
]
datacache-1.1.5/datacache/cache.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
from os.path import exists, join
from os import remove
from . import common
from . import download
from .database_helpers import db_from_dataframe
class Cache(object):
def __init__(self, subdir="datacache"):
assert subdir
self.subdir = subdir
self.cache_directory_path = common.get_data_dir(subdir)
# dictionary mapping from (URL, decompress) pair to local paths
# TODO: handle decompression separately from download,
# so we can use copies of compressed files we've already downloaded
self._local_paths = {}
def delete_url(self, url):
"""
Delete local files downloaded from given URL
"""
# file may exist locally in compressed and decompressed states
# delete both
for decompress in [False, True]:
key = (url, decompress)
if key in self._local_paths:
path = self._local_paths[key]
remove(path)
del self._local_paths[key]
# The file may have been downloaded via the download module without going
# through this Cache object; in that case it won't appear in _local_paths
# but should still be deleted.
path = self.local_path(
url, decompress=decompress, download=False)
if exists(path):
remove(path)
def delete_all(self):
self._local_paths.clear()
common.clear_cache(self.cache_directory_path)
common.ensure_dir(self.cache_directory_path)
def exists(self, url, filename=None, decompress=False):
"""
Return True if a local file corresponding to these arguments
exists.
"""
return download.file_exists(
url,
filename=filename,
decompress=decompress,
subdir=self.subdir)
def fetch(
self,
url,
filename=None,
decompress=False,
force=False,
timeout=None,
use_wget_if_available=True):
"""
Return the local path to the downloaded copy of a given URL.
Don't download the file again if it's already present,
unless `force` is True.
"""
key = (url, decompress)
if not force and key in self._local_paths:
path = self._local_paths[key]
if exists(path):
return path
else:
del self._local_paths[key]
path = download.fetch_file(
url,
filename=filename,
decompress=decompress,
subdir=self.subdir,
force=force,
timeout=timeout,
use_wget_if_available=use_wget_if_available)
self._local_paths[key] = path
return path
def local_filename(
self,
url=None,
filename=None,
decompress=False):
"""
Return the local filename that will be used within the cache directory
for the given URL/filename/decompress options.
"""
return common.build_local_filename(url, filename, decompress)
def local_path(self, url, filename=None, decompress=False, download=False):
"""
What will the full local path be if we download the given file?
"""
if download:
return self.fetch(url=url, filename=filename, decompress=decompress)
else:
filename = self.local_filename(url, filename, decompress)
return join(self.cache_directory_path, filename)
def db_from_dataframe(
self,
db_filename,
table_name,
df,
key_column_name=None):
return db_from_dataframe(
db_filename=db_filename,
table_name=table_name,
df=df,
primary_key=key_column_name,
subdir=self.subdir)
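# Example (illustrative sketch, not a definitive recipe): typical use of the
# Cache object; the URL and subdirectory names below are placeholders.
#
#   from datacache import Cache
#
#   cache = Cache(subdir="my_project")
#   # the first call downloads the file; later calls return the cached path
#   path = cache.fetch("https://example.com/data.csv.gz", decompress=True)
#   # delete any local copies (compressed or decompressed) of that URL
#   cache.delete_url("https://example.com/data.csv.gz")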
datacache-1.1.5/datacache/common.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
import hashlib
from os import makedirs, environ
from os.path import join, exists, split, splitext
import re
from shutil import rmtree
import appdirs
def ensure_dir(path):
if not exists(path):
makedirs(path)
def get_data_dir(subdir=None, envkey=None):
if envkey and environ.get(envkey):
envdir = environ[envkey]
if subdir:
return join(envdir, subdir)
else:
return envdir
return appdirs.user_cache_dir(subdir if subdir else "datacache")
def build_path(filename, subdir=None):
data_dir = get_data_dir(subdir)
ensure_dir(data_dir)
return join(data_dir, filename)
def clear_cache(subdir=None):
data_dir = get_data_dir(subdir)
rmtree(data_dir)
def normalize_filename(filename):
"""
Remove special characters and shorten if name is too long
"""
# if the url pointed to a directory then just replace all the special chars
filename = re.sub("/|\\|;|:|\?|=", "_", filename)
if len(filename) > 150:
prefix = hashlib.md5(filename.encode("utf-8")).hexdigest()
filename = prefix + filename[-140:]
return filename
def build_local_filename(download_url=None, filename=None, decompress=False):
"""
Determine which local filename to use based on the file's source URL,
an optional desired filename, and whether a compression suffix needs
to be removed
"""
assert download_url or filename, "Either filename or URL must be specified"
# if no filename provided, use the original filename on the server
if not filename:
digest = hashlib.md5(download_url.encode('utf-8')).hexdigest()
parts = split(download_url)
filename = digest + "." + "_".join(parts)
filename = normalize_filename(filename)
if decompress:
(base, ext) = splitext(filename)
if ext in (".gz", ".zip"):
filename = base
return filename
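# Example (illustrative): roughly how cache filenames are derived; the URL is
# a placeholder and <md5> stands for the hex digest of the URL.
#
#   build_local_filename("https://example.com/data.csv.gz", decompress=True)
#   # -> "<md5>.https___example.com_data.csv"
#   # (hash prefix, then the URL with special characters replaced by "_",
#   #  with the ".gz" suffix stripped because decompress=True)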
datacache-1.1.5/datacache/database.py
# Copyright (c) 2015-2018 Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
import logging
import sqlite3
from typechecks import require_integer, require_string, require_iterable_of
logger = logging.getLogger(__name__)
METADATA_TABLE_NAME = "_datacache_metadata"
class Database(object):
"""
Wrapper object for sqlite3 database which provides helpers for
querying and constructing the datacache metadata table, as well as
creating and checking for existence of particular table names.
Calls to methods other than Database.close() and Database.create()
will not commit their changes.
"""
def __init__(self, path):
self.path = path
self.connection = sqlite3.connect(path)
def _commit(self):
self.connection.commit()
def close(self):
"""Commit changes and close database connection"""
self._commit()
self.connection.close()
def table_names(self):
"""Returns names of all tables in the database"""
query = "SELECT name FROM sqlite_master WHERE type='table'"
cursor = self.connection.execute(query)
results = cursor.fetchall()
return [result_tuple[0] for result_tuple in results]
def has_table(self, table_name):
"""Does a table named `table_name` exist in the sqlite database?"""
table_names = self.table_names()
return table_name in table_names
def drop_all_tables(self):
"""Drop all tables in the database"""
for table_name in self.table_names():
self.execute_sql("DROP TABLE %s" % table_name)
self.connection.commit()
def execute_sql(self, sql, commit=False):
"""Log and then execute a SQL query"""
logger.info("Running sqlite query: \"%s\"", sql)
self.connection.execute(sql)
if commit:
self.connection.commit()
def has_tables(self, table_names):
"""Are all of the given table names present in the database?"""
return all(self.has_table(table_name) for table_name in table_names)
def has_version(self):
"""Does this database have version information?
The absence of version information indicates that this database was
either not created by datacache or is incomplete.
"""
return self.has_table(METADATA_TABLE_NAME)
def version(self):
"""What's the version of this database? Found in metadata attached
by datacache when creating this database."""
query = "SELECT version FROM %s" % METADATA_TABLE_NAME
cursor = self.connection.execute(query)
version = cursor.fetchone()
if not version:
return 0
else:
return int(version[0])
def _finalize_database(self, version):
"""
Create metadata table for database with version number.
Parameters
----------
version : int
Tag created database with user-specified version number
"""
require_integer(version, "version")
create_metadata_sql = \
"CREATE TABLE %s (version INT)" % METADATA_TABLE_NAME
self.execute_sql(create_metadata_sql)
insert_version_sql = \
"INSERT INTO %s VALUES (%s)" % (METADATA_TABLE_NAME, version)
self.execute_sql(insert_version_sql)
def _create_table(self, table_name, column_types, primary=None, nullable=()):
"""Creates a sqlite3 table from the given metadata.
Parameters
----------
column_types : list of (str, str) pairs
First element of each tuple is the column name, second element is the sqlite3 type
primary : str, optional
Which column is the primary key
nullable : iterable, optional
Names of columns which have null values
"""
require_string(table_name, "table name")
require_iterable_of(column_types, tuple, name="column_types")
if primary is not None:
require_string(primary, "primary")
require_iterable_of(nullable, str, name="nullable")
column_decls = []
for column_name, column_type in column_types:
decl = "%s %s" % (column_name, column_type)
if column_name == primary:
decl += " UNIQUE PRIMARY KEY"
if column_name not in nullable:
decl += " NOT NULL"
column_decls.append(decl)
column_decl_str = ", ".join(column_decls)
create_table_sql = \
"CREATE TABLE %s (%s)" % (table_name, column_decl_str)
self.execute_sql(create_table_sql)
def _fill_table(self, table_name, rows):
require_string(table_name, "table_name")
require_iterable_of(rows, tuple, "rows")
if not self.has_table(table_name):
raise ValueError(
"Table '%s' does not exist in database" % (table_name,))
if len(rows) == 0:
raise ValueError("Rows must be non-empty sequence")
first_row = rows[0]
n_columns = len(first_row)
if not all(len(row) == n_columns for row in rows):
raise ValueError("Rows must all have %d values" % n_columns)
blank_slots = ", ".join("?" for _ in range(n_columns))
logger.info("Inserting %d rows into table %s", len(rows), table_name)
sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
self.connection.executemany(sql, rows)
def create(self, tables, version):
"""Do the actual work of creating the database, filling its tables with
values, creating indices, and setting the datacache version metadata.
Parameters
----------
tables : list
List of datacache.DatabaseTable objects
version : int
"""
for table in tables:
self._create_table(
table_name=table.name,
column_types=table.column_types,
primary=table.primary_key,
nullable=table.nullable)
self._fill_table(table.name, table.rows)
self._create_indices(table.name, table.indices)
self._finalize_database(version)
self._commit()
def _create_index(self, table_name, index_columns):
"""
Creates an index over multiple columns of a given table.
Parameters
----------
table_name : str
index_columns : iterable of str
Which columns should be indexed
"""
logger.info(
"Creating index on %s (%s)",
table_name,
", ".join(index_columns))
index_name = "%s_index_%s" % (
table_name,
"_".join(index_columns))
self.connection.execute(
"CREATE INDEX IF NOT EXISTS %s ON %s (%s)" % (
index_name,
table_name,
", ".join(index_columns)))
def _create_indices(self, table_name, indices):
"""
Create multiple indices (each over multiple columns) on a given table.
Parameters
----------
table_name : str
indices : iterable of tuples
Multiple groups of columns, each of which should be indexed.
"""
require_string(table_name, "table_name")
require_iterable_of(indices, (tuple, list))
for index_column_set in indices:
self._create_index(table_name, index_column_set)
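# Example (illustrative sketch): how datacache drives this class when building
# a cached database; the table name, columns, and rows below are placeholders.
#
#   from datacache.database import Database
#   from datacache.database_table import DatabaseTable
#
#   table = DatabaseTable(
#       name="example",
#       column_types=[("id", "INT"), ("name", "TEXT")],
#       make_rows=lambda: [(1, "a"), (2, "b")],
#       primary_key="id")
#   db = Database("/tmp/example.db")
#   db.create([table], version=1)  # creates tables, fills rows, writes metadata
#   rows = db.connection.execute("SELECT name FROM example").fetchall()
#   db.close()  # close() and create() commit; most other methods do not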
datacache-1.1.5/datacache/database_helpers.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
from os import remove
from os.path import splitext, exists
import logging
from typechecks import (
require_string,
require_integer,
require_iterable_of
)
from .common import build_path
from .download import fetch_csv_dataframe
from .database import Database
from .database_table import DatabaseTable
from .database_types import db_type
logger = logging.getLogger(__name__)
def connect_if_correct_version(db_path, version):
"""Return a sqlite3 database connection if the version in the database's
metadata matches the version argument.
Also implicitly checks for whether the data in this database has
been completely filled, since we set the version last.
TODO: Make an explicit 'complete' flag to the metadata.
"""
db = Database(db_path)
if db.has_version() and db.version() == version:
return db.connection
return None
def _create_cached_db(
db_path,
tables,
version=1):
"""
Either create or retrieve sqlite database.
Parameters
----------
db_path : str
Path to sqlite3 database file
tables : list
List of datacache.DatabaseTable objects
version : int, optional
Version acceptable as cached data.
Returns sqlite3 connection
"""
require_string(db_path, "db_path")
require_iterable_of(tables, DatabaseTable)
require_integer(version, "version")
# if the database file doesn't already exist and we encounter an error
# later, delete the file before raising an exception
delete_on_error = not exists(db_path)
# if the database already exists, contains all the table
# names and has the right version, then just return it
db = Database(db_path)
# make sure to delete the database file in case anything goes wrong
# to avoid leaving behind an empty DB
table_names = [table.name for table in tables]
try:
if db.has_tables(table_names) and \
db.has_version() and \
db.version() == version:
logger.info("Found existing table in database %s", db_path)
else:
if len(db.table_names()) > 0:
logger.info(
"Dropping tables from database %s: %s",
db_path,
", ".join(db.table_names()))
db.drop_all_tables()
logger.info(
"Creating database %s containing: %s",
db_path,
", ".join(table_names))
db.create(tables, version)
except:
logger.warning(
"Failed to create tables %s in database %s",
table_names,
db_path)
db.close()
if delete_on_error:
remove(db_path)
raise
return db.connection
def build_tables(
table_names_to_dataframes,
table_names_to_primary_keys={},
table_names_to_indices={}):
"""
Parameters
----------
table_names_to_dataframes : dict
Dictionary mapping each table name to a DataFrame
table_names_to_primary_keys : dict
Dictionary mapping each table to its primary key
table_names_to_indices : dict
Dictionary mapping each table to a set of indices
Returns list of DatabaseTable objects
"""
tables = []
for table_name, df in table_names_to_dataframes.items():
table_indices = table_names_to_indices.get(table_name, [])
primary_key = table_names_to_primary_keys.get(table_name)
table = DatabaseTable.from_dataframe(
name=table_name,
df=df,
indices=table_indices,
primary_key=primary_key)
tables.append(table)
return tables
def db_from_dataframes_with_absolute_path(
db_path,
table_names_to_dataframes,
table_names_to_primary_keys={},
table_names_to_indices={},
overwrite=False,
version=1):
"""
Create a sqlite3 database from a collection of DataFrame objects
Parameters
----------
db_path : str
Path to database file to create
table_names_to_dataframes : dict
Dictionary from table names to DataFrame objects
table_names_to_primary_keys : dict, optional
Name of primary key column for each table
table_names_to_indices : dict, optional
Dictionary from table names to list of column name tuples
overwrite : bool, optional
If the database already exists, overwrite it?
version : int, optional
"""
if overwrite and exists(db_path):
remove(db_path)
tables = build_tables(
table_names_to_dataframes,
table_names_to_primary_keys,
table_names_to_indices)
return _create_cached_db(
db_path,
tables=tables,
version=version)
def db_from_dataframes(
db_filename,
dataframes,
primary_keys={},
indices={},
subdir=None,
overwrite=False,
version=1):
"""
Create a sqlite3 database from a collection of DataFrame objects
Parameters
----------
db_filename : str
Name of database file to create
dataframes : dict
Dictionary from table names to DataFrame objects
primary_keys : dict, optional
Name of primary key column for each table
indices : dict, optional
Dictionary from table names to list of column name tuples
subdir : str, optional
overwrite : bool, optional
If the database already exists, overwrite it?
version : int, optional
"""
if not (subdir is None or isinstance(subdir, str)):
raise TypeError("Expected subdir to be None or str, got %s : %s" % (
subdir, type(subdir)))
db_path = build_path(db_filename, subdir)
return db_from_dataframes_with_absolute_path(
db_path,
table_names_to_dataframes=dataframes,
table_names_to_primary_keys=primary_keys,
table_names_to_indices=indices,
overwrite=overwrite,
version=version)
def db_from_dataframe(
db_filename,
table_name,
df,
primary_key=None,
subdir=None,
overwrite=False,
indices=(),
version=1):
"""
Given a dataframe `df`, turn it into a sqlite3 database.
Store values in a table called `table_name`.
Returns a sqlite3 connection to the created database.
"""
return db_from_dataframes(
db_filename=db_filename,
dataframes={table_name: df},
primary_keys={table_name: primary_key},
indices={table_name: indices},
subdir=subdir,
overwrite=overwrite,
version=version)
def _db_filename_from_dataframe(base_filename, df):
"""
Generate database filename for a sqlite3 database we're going to
fill with the contents of a DataFrame, using the DataFrame's
column names and types.
"""
db_filename = base_filename + ("_nrows%d" % len(df))
for column_name in df.columns:
column_db_type = db_type(df[column_name].dtype)
column_name = column_name.replace(" ", "_")
db_filename += ".%s_%s" % (column_name, column_db_type)
return db_filename + ".db"
def fetch_csv_db(
table_name,
download_url,
csv_filename=None,
db_filename=None,
subdir=None,
version=1,
**pandas_kwargs):
"""
Download a remote CSV file and create a local sqlite3 database
from its contents
"""
df = fetch_csv_dataframe(
download_url=download_url,
filename=csv_filename,
subdir=subdir,
**pandas_kwargs)
base_filename = splitext(csv_filename)[0]
if db_filename is None:
db_filename = _db_filename_from_dataframe(base_filename, df)
return db_from_dataframe(
db_filename,
table_name,
df,
subdir=subdir,
version=version)
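# Example (illustrative sketch): building a cached sqlite3 database from a
# DataFrame; the filenames, table name, and subdirectory are placeholders.
#
#   import pandas as pd
#   from datacache import db_from_dataframe
#
#   df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
#   connection = db_from_dataframe(
#       db_filename="example.db",
#       table_name="items",
#       df=df,
#       primary_key="id",
#       subdir="my_project")
#   rows = connection.execute("SELECT name FROM items WHERE id = 1").fetchall()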
datacache-1.1.5/datacache/database_table.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
from .database_types import db_type
class DatabaseTable(object):
"""Converts between a DataFrame and a sqlite3 database table"""
def __init__(
self,
name,
column_types,
make_rows,
indices=[],
nullable=set(),
primary_key=None):
self.name = name
self.column_types = column_types
self.make_rows = make_rows
self.indices = indices
self.nullable = nullable
self.primary_key = primary_key
@property
def rows(self):
"""Delay constructing list of row tuples"""
return self.make_rows()
@classmethod
def from_dataframe(cls, name, df, indices, primary_key=None):
"""Infer table metadata from a DataFrame"""
# ordered list (column_name, column_type) pairs
column_types = []
# which columns have nullable values
nullable = set()
# tag cached database by dataframe's number of rows and columns
for column_name in df.columns:
values = df[column_name]
if values.isnull().any():
nullable.add(column_name)
column_db_type = db_type(values.dtype)
column_types.append((column_name.replace(" ", "_"), column_db_type))
def make_rows():
return list(tuple(row) for row in df.values)
return cls(
name=name,
column_types=column_types,
make_rows=make_rows,
indices=indices,
nullable=nullable,
primary_key=primary_key)
@classmethod
def from_fasta_dict(cls, name, fasta_dict, key_column, value_column):
key_list = list(fasta_dict.keys())
key_set = set(key_list)
assert len(key_set) == len(key_list), \
"FASTA file from contains %d non-unique sequence identifiers" % \
(len(key_list) - len(key_set))
column_types = [(key_column, "TEXT"), (value_column, "TEXT")]
def make_rows():
return [
(idx, str(record.seq))
for (idx, record)
in fasta_dict.items()
]
return cls(
name=name,
column_types=column_types,
make_rows=make_rows,
primary_key=key_column)
datacache-1.1.5/datacache/database_types.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert from Python type names to sqlite3 column types"""
from __future__ import print_function, division, absolute_import
import numpy as np
_dtype_to_db_type_dict = {
'int': 'INT',
'int8': 'INT',
'int16': 'INT',
'int32': 'INT',
'int64': 'INT',
'uint8': 'INT',
'uint16': 'INT',
'uint32': 'INT',
'uint64': 'INT',
'bool': 'INT',
'float': 'FLOAT',
'float32': 'FLOAT',
'float64': 'FLOAT',
'object': 'TEXT',
'object_': 'TEXT',
'string_': 'TEXT',
'str': 'TEXT',
}
def _lookup_type_name(type_name):
if type_name in _dtype_to_db_type_dict:
return _dtype_to_db_type_dict[type_name]
else:
return None
def _candidate_type_names(python_type_representation):
"""Generator which yields possible type names to look up in the conversion
dictionary.
Parameters
----------
python_type_representation : object
Any Python object which represents a type, such as `int`,
`dtype('int8')`, `np.int8`, or `"int8"`.
"""
# if we get a single character code we should normalize to a NumPy type
# using np.typeDict, which maps string representations of types to NumPy
# type objects
if python_type_representation in np.typeDict:
python_type_representation = np.typeDict[python_type_representation]
yield python_type_representation.__name__
# if we get a dtype object i.e. dtype('int16'), then pull out its name
if hasattr(python_type_representation, 'name'):
yield python_type_representation.name
# convert Python types by adding their type's name
if hasattr(python_type_representation, '__name__'):
yield python_type_representation.__name__
# for a dtype like dtype('S3') need to access dtype.type.__name__
# to get 'string_'
if hasattr(python_type_representation, 'type'):
if hasattr(python_type_representation.type, '__name__'):
yield python_type_representation.type.__name__
yield str(python_type_representation)
def db_type(python_type_representation):
"""
Converts from any of:
(1) Python type
(2) NumPy/Pandas dtypes
(3) string names of types
...to a sqlite3 type name
"""
for type_name in _candidate_type_names(python_type_representation):
db_type_name = _lookup_type_name(type_name)
if db_type_name:
return db_type_name
raise ValueError("Failed to find sqlite3 column type for %s" % (
python_type_representation))
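# Example (illustrative): several spellings of the same type map to one
# sqlite3 column type.
#
#   db_type(int)                  # "INT"
#   db_type(np.dtype("float64"))  # "FLOAT"
#   db_type("int8")               # "INT"
#   db_type(str)                  # "TEXT"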
datacache-1.1.5/datacache/download.py
# Copyright (c) 2015-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function, division, absolute_import
import gzip
import logging
import errno
import os
import subprocess
from shutil import move
from tempfile import NamedTemporaryFile
import zipfile
import requests
import pandas as pd
from six.moves import urllib
from .common import build_path, build_local_filename
logger = logging.getLogger(__name__)
def _download(download_url, timeout=None):
if download_url.startswith("http"):
response = requests.get(download_url, timeout=timeout)
response.raise_for_status()
return response.content
else:
req = urllib.request.Request(download_url)
response = urllib.request.urlopen(req, data=None, timeout=timeout)
return response.read()
def _download_to_temp_file(
download_url,
timeout=None,
base_name="download",
ext="tmp",
use_wget_if_available=False):
if not download_url:
raise ValueError("URL not provided")
with NamedTemporaryFile(
suffix='.' + ext,
prefix=base_name,
delete=False) as tmp:
tmp_path = tmp.name
def download_using_python():
with open(tmp_path, mode="w+b") as tmp_file:
tmp_file.write(
_download(download_url, timeout=timeout))
if not use_wget_if_available:
download_using_python()
else:
try:
# first try using wget to download since this works on Travis
# even when FTP otherwise fails
wget_command_list = [
"wget",
download_url,
"-O", tmp_path,
"--no-verbose",
]
if download_url.startswith("ftp"):
wget_command_list.extend(["--passive-ftp"])
if timeout:
wget_command_list.extend(["-T", "%s" % timeout])
logger.info("Running: %s" % (" ".join(wget_command_list)))
subprocess.call(wget_command_list)
except OSError as e:
if e.errno == errno.ENOENT:
# wget not found
download_using_python()
else:
raise
return tmp_path
def _download_and_decompress_if_necessary(
full_path,
download_url,
timeout=None,
use_wget_if_available=False):
"""
Downloads remote file at `download_url` to local file at `full_path`
"""
logger.info("Downloading %s to %s", download_url, full_path)
filename = os.path.split(full_path)[1]
base_name, ext = os.path.splitext(filename)
tmp_path = _download_to_temp_file(
download_url=download_url,
timeout=timeout,
base_name=base_name,
ext=ext,
use_wget_if_available=use_wget_if_available)
if download_url.endswith("zip") and not filename.endswith("zip"):
logger.info("Decompressing zip into %s...", filename)
with zipfile.ZipFile(tmp_path) as z:
names = z.namelist()
assert len(names) > 0, "Empty zip archive"
if filename in names:
chosen_filename = filename
else:
# If zip archive contains multiple files, choose the biggest.
biggest_size = 0
chosen_filename = names[0]
for info in z.infolist():
if info.file_size > biggest_size:
chosen_filename = info.filename
biggest_size = info.file_size
extract_path = z.extract(chosen_filename)
move(extract_path, full_path)
os.remove(tmp_path)
elif download_url.endswith("gz") and not filename.endswith("gz"):
logger.info("Decompressing gzip into %s...", filename)
with gzip.GzipFile(tmp_path) as src:
contents = src.read()
os.remove(tmp_path)
with open(full_path, 'wb') as dst:
dst.write(contents)
elif download_url.endswith(("html", "htm")) and full_path.endswith(".csv"):
logger.info("Extracting HTML table into CSV %s...", filename)
df = pd.read_html(tmp_path, header=0)[0]
df.to_csv(full_path, sep=',', index=False, encoding='utf-8')
else:
move(tmp_path, full_path)
def file_exists(
download_url,
filename=None,
decompress=False,
subdir=None):
"""
Return True if a local file corresponding to these arguments
exists.
"""
filename = build_local_filename(download_url, filename, decompress)
full_path = build_path(filename, subdir)
return os.path.exists(full_path)
def fetch_file(
download_url,
filename=None,
decompress=False,
subdir=None,
force=False,
timeout=None,
use_wget_if_available=False):
"""
Download a remote file and store it locally in a cache directory. Don't
download it again if it's already present (unless `force` is True.)
Parameters
----------
download_url : str
Remote URL of file to download.
filename : str, optional
Local filename, used as cache key. If omitted, then determine the local
filename from the URL.
decompress : bool, optional
By default any file whose remote extension is one of (".zip", ".gz")
and whose local filename lacks this suffix is decompressed. If a local
filename wasn't provided but you still want to decompress the stored
data then set this option to True.
subdir : str, optional
Group downloads in a single subdirectory.
force : bool, optional
By default, a remote file is not downloaded if it's already present.
However, with this argument set to True, it will be overwritten.
timeout : float, optional
Timeout for download in seconds, default is None which uses
global timeout.
use_wget_if_available: bool, optional
If the `wget` command is available, use that for download instead
of Python libraries (default False).
Returns the full path of the local file.
"""
filename = build_local_filename(download_url, filename, decompress)
full_path = build_path(filename, subdir)
if not os.path.exists(full_path) or force:
logger.info("Fetching %s from URL %s", filename, download_url)
_download_and_decompress_if_necessary(
full_path=full_path,
download_url=download_url,
timeout=timeout,
use_wget_if_available=use_wget_if_available)
else:
logger.info("Cached file %s from URL %s", filename, download_url)
return full_path
def fetch_and_transform(
transformed_filename,
transformer,
loader,
source_filename,
source_url,
subdir=None):
"""
Fetch a remote file from `source_url`, save it locally as `source_filename` and then use
the `loader` and `transformer` function arguments to turn this saved data into an in-memory
object.
"""
transformed_path = build_path(transformed_filename, subdir)
if not os.path.exists(transformed_path):
source_path = fetch_file(source_url, source_filename, subdir=subdir)
logger.info("Generating data file %s from %s", transformed_path, source_path)
result = transformer(source_path, transformed_path)
else:
logger.info("Cached data file: %s", transformed_path)
result = loader(transformed_path)
assert os.path.exists(transformed_path)
return result
def fetch_csv_dataframe(
download_url,
filename=None,
subdir=None,
**pandas_kwargs):
"""
Download a remote file from `download_url` and save it locally as `filename`.
Load that local file as a CSV into Pandas using extra keyword arguments such as sep='\t'.
"""
path = fetch_file(
download_url=download_url,
filename=filename,
decompress=True,
subdir=subdir)
return pd.read_csv(path, **pandas_kwargs)
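# Example (illustrative sketch): caching both a downloaded source file and a
# derived file built from it with fetch_and_transform; the URL, filenames, and
# helper functions below are placeholders.
#
#   def load_csv(transformed_path):
#       return pd.read_csv(transformed_path)
#
#   def gtf_to_csv(source_path, transformed_path):
#       # read the downloaded source, write the derived CSV, return the result
#       df = parse_gtf(source_path)  # hypothetical parser, not part of datacache
#       df.to_csv(transformed_path, index=False)
#       return df
#
#   result = fetch_and_transform(
#       transformed_filename="genes.csv",
#       transformer=gtf_to_csv,
#       loader=load_csv,
#       source_filename="genes.gtf.gz",
#       source_url="https://example.com/genes.gtf.gz",
#       subdir="my_project")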
datacache-1.1.5/datacache.egg-info/PKG-INFO
Metadata-Version: 1.1
Name: datacache
Version: 1.1.5
Summary: Helpers for transparently downloading datasets
Home-page: https://github.com/openvax/datacache
Author: Alex Rubinsteyn
Author-email: alex.rubinsteyn@mssm.edu
License: http://www.apache.org/licenses/LICENSE-2.0.html
Description: DataCache
=========
Helpers for transparently downloading datasets
API
---
- **fetch_file**\ (download_url, filename = *None*, decompress =
*False*, subdir = *None*)
- **fetch_and_transform**\ (transformed_filename, transformer, loader,
source_filename, source_url, subdir = *None*)
- **fetch_fasta_dict**\ (download_url, filename = *None*, subdir =
*None*)
- **fetch_fasta_db**\ (table_name, download_url, fasta_filename =
*None*, key_column = *‘id’*, value_column = *‘seq’*, subdir = *None*)
- **fetch_csv_db**\ (table_name, download_url, csv_filename = *None*,
subdir = *None*, \**pandas_kwargs)
Platform: UNKNOWN
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Operating System :: OS Independent
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
datacache-1.1.5/datacache.egg-info/SOURCES.txt
README.md
setup.py
datacache/__init__.py
datacache/cache.py
datacache/common.py
datacache/database.py
datacache/database_helpers.py
datacache/database_table.py
datacache/database_types.py
datacache/download.py
datacache.egg-info/PKG-INFO
datacache.egg-info/SOURCES.txt
datacache.egg-info/dependency_links.txt
datacache.egg-info/requires.txt
datacache.egg-info/top_level.txt
test/test_cache_object.py
test/test_database_objects.py
test/test_database_types.py
test/test_db_from_dataframes.py
test/test_download.py
test/test_fasta_dict.py
test/test_names.py
datacache-1.1.5/datacache.egg-info/dependency_links.txt
datacache-1.1.5/datacache.egg-info/requires.txt
pandas>=0.15.2
appdirs>=1.4.0
progressbar33>=2.4
requests>=2.5.1
typechecks>=0.0.2
mock
datacache-1.1.5/datacache.egg-info/top_level.txt
datacache
datacache-1.1.5/test/test_cache_object.py
from os import remove
from os.path import exists
from mock import patch
from nose.tools import eq_
from datacache import Cache
CACHE_DIR = "datacache_test"
TEST_URL = "http://www.google.com"
TEST_FILENAME = "google"
def test_cache_object_path():
cache = Cache(CACHE_DIR)
assert cache.cache_directory_path.endswith(CACHE_DIR), \
"Expected directory path to end with %s but got %s" % (
CACHE_DIR, cache.cache_directory_path)
def test_cache_object_local_filename():
filename = Cache(CACHE_DIR).local_filename(filename="test")
assert filename.endswith("test")
def test_cache_fetch_google():
cache = Cache(CACHE_DIR)
path = cache.fetch(TEST_URL, filename=TEST_FILENAME)
assert path.endswith(TEST_FILENAME), \
"Expected local file to be named %s but got %s" % (
TEST_FILENAME, path)
assert exists(path), "File not found: %s" % path
eq_(path, cache.local_path(TEST_URL, filename=TEST_FILENAME))
@patch('datacache.cache.download._download_and_decompress_if_necessary')
def test_cache_fetch_force(mock_download):
cache = Cache("datacache_test")
cache.fetch("http://www.google.com", filename="google", force=True)
cache.fetch("http://www.google.com", filename="google", force=True)
assert len(mock_download.call_args_list) == 2, \
"Expected two separate calls to _download, given force=True"
def test_cache_delete_url():
cache = Cache(CACHE_DIR)
path = cache.fetch(TEST_URL, filename=TEST_FILENAME)
assert exists(path), "Expected %s to exist after download" % path
cache.delete_url(TEST_URL)
assert not exists(path), \
"Expected %s to be deleted after call to delete_url" % path
def test_cache_missing_file():
"""test_cache_missing_file : Files can be deleted from the file system,
Cache should be aware that these files no longer exist
"""
cache = Cache(CACHE_DIR)
path = cache.fetch(TEST_URL, filename=TEST_FILENAME)
# does the filename exist?
assert exists(path)
# does the cache know the URL has been downloaded?
assert cache.exists(TEST_URL, filename=TEST_FILENAME)
remove(path)
assert not cache.exists(TEST_URL, filename=TEST_FILENAME), \
"Local file for %s has been deleted from the file system" % TEST_URL
datacache-1.1.5/test/test_database_objects.py
"""
Test that datacache constructs databases correctly
(separately from downloading/caching them)
"""
import tempfile
import datacache
from nose.tools import eq_
TABLE_NAME = "test"
INT_COL_NAME = "int_col"
STR_COL_NAME = "str_col"
COL_TYPES = [(INT_COL_NAME, "INT"), (STR_COL_NAME, "STR")]
KEY_COLUMN_NAME = "int_col"
NULLABLE = {STR_COL_NAME}
ROWS = [(1, "darkness"), (2, "light"), (3, None)]
INDICES = [["str_col"]]
VERSION = 2
def make_table_object():
return datacache.database_table.DatabaseTable(
name=TABLE_NAME,
column_types=COL_TYPES,
make_rows=lambda: ROWS,
indices=INDICES,
nullable=NULLABLE,
primary_key=INT_COL_NAME)
def test_database_table_object():
table = make_table_object()
eq_(table.name, TABLE_NAME)
eq_(table.indices, INDICES)
eq_(table.nullable, NULLABLE)
eq_(table.rows, ROWS)
eq_(table.indices, INDICES)
def test_create_db():
with tempfile.NamedTemporaryFile(suffix="test.db") as f:
db = datacache.database.Database(f.name)
table = make_table_object()
db.create(tables=[table], version=VERSION)
assert db.has_table(TABLE_NAME)
assert db.has_version()
assert db.version() == VERSION
sql = """
SELECT %s from %s WHERE %s = '%s'
""" % (INT_COL_NAME, TABLE_NAME, STR_COL_NAME, "light")
cursor = db.connection.execute(sql)
int_result_tuple = cursor.fetchone()
int_result = int_result_tuple[0]
eq_(int_result, 2)
datacache-1.1.5/test/test_database_types.py
from nose.tools import eq_
import numpy as np
from datacache.database_types import db_type
def test_db_types():
for int_type in [
int,
np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64]:
eq_(db_type(int_type), "INT")
for float_type in [float, np.float32, np.float64]:
eq_(db_type(float), "FLOAT")
eq_(db_type(str), "TEXT")
datacache-1.1.5/test/test_db_from_dataframes.py
from nose.tools import eq_
import pandas as pd
from tempfile import NamedTemporaryFile
from datacache import db_from_dataframes, db_from_dataframe
dfA = pd.DataFrame({"numbers": [1, 2, 3], "strings": ["a", "b", "c"]})
dfB = pd.DataFrame({"wuzzles": ["nuzzle", "ruzzle"]})
def test_database_from_dataframes():
with NamedTemporaryFile(suffix="test.db") as f:
db = db_from_dataframes(
db_filename=f.name,
dataframes={"A": dfA, "B": dfB},
primary_keys={"A": "numbers"},
indices={"A": [("numbers", "strings")]},
subdir="test_datacache")
cursor_A = db.execute("SELECT * FROM A")
results_A = cursor_A.fetchall()
eq_(results_A, [(1, "a"), (2, "b"), (3, "c")])
cursor_B = db.execute("SELECT * FROM B")
results_B = cursor_B.fetchall()
eq_(results_B, [("nuzzle",), ("ruzzle",)])
def test_database_from_single_dataframe():
with NamedTemporaryFile(suffix="test.db") as f:
db = db_from_dataframe(
db_filename=f.name,
table_name="A",
df=dfA,
primary_key="numbers",
indices=[("numbers", "strings")],
subdir="test_datacache")
cursor = db.execute("SELECT * FROM A")
results = cursor.fetchall()
eq_(results, [(1, "a"), (2, "b"), (3, "c")])
datacache-1.1.5/test/test_download.py
# Copyright (c) 2014. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datacache import fetch_file
FASTA_FILENAME = 'Homo_sapiens.GRCh37.75.dna_rm.chromosome.MT.fa'
URL = \
'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna_rm.chromosome.MT.fa.gz'
def test_fetch_decompress():
for use_wget_if_available in [True, False]:
for timeout in [None, 10**6]:
path1 = fetch_file(
URL,
decompress=True,
subdir="datacache",
use_wget_if_available=use_wget_if_available,
timeout=timeout)
assert path1.endswith(FASTA_FILENAME)
with open(path1, 'r') as f1:
s1 = f1.read()
assert "TCAATTTCGTGCCAG" in s1
def test_fetch_subdirs():
path = fetch_file(URL, decompress=True, subdir="datacache")
assert path.endswith(FASTA_FILENAME)
# if we change the subdir then data should end up in
# something like /Users/me/Library/Caches/datacache_test/
other_path = fetch_file(URL, decompress=True, subdir="datacache_test")
assert other_path.endswith(FASTA_FILENAME)
assert other_path != path, other_path
datacache-1.1.5/test/test_fasta_dict.py
# Copyright (c) 2014. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datacache import fetch_file
import gzip
URL = "".join([
'ftp://ftp.ensembl.org/pub/release-75',
'/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75',
'.dna_rm.chromosome.MT.fa.gz',
])
def fetch_fasta_dict(path_or_url):
path = fetch_file(path_or_url)
d = {}
value_buffer = []
key = None
if path.endswith(".gz") or path.endswith(".gzip"):
f = gzip.open(path, "r")
else:
f = open(path, "r")
for line in f.readlines():
if type(line) is bytes:
line = line.decode("ascii")
if line.startswith(">"):
if key is not None:
d[key] = "".join(value_buffer)
value_buffer = []
key = line.split()[0][1:]
else:
value_buffer.append(line.strip())
if key and value_buffer:
d[key] = "".join(value_buffer)
f.close()
return d
def test_download_fasta_dict():
d = fetch_fasta_dict(URL)
assert len(d) > 0
datacache-1.1.5/test/test_names.py
from datacache import build_local_filename
def test_url_without_filename():
filename = build_local_filename(download_url="http://www.google.com/")
assert filename
assert "google" in filename
def test_multiple_domains_same_file():
filename_google = build_local_filename(
download_url="http://www.google.com/index.html")
filename_yahoo = build_local_filename(
download_url="http://www.yahoo.com/index.html")
assert "index" in filename_google
assert "index" in filename_yahoo
assert filename_yahoo != filename_google
datacache-1.1.5/README.md
DataCache
=========
Helpers for transparently downloading datasets
## API
* **fetch_file**(download_url, filename = *None*, decompress = *False*, subdir = *None*)
* **fetch_and_transform**(transformed_filename, transformer, loader,
source_filename, source_url, subdir = *None*)
* **fetch_csv_dataframe**(download_url, filename = *None*, subdir = *None*,
\*\*pandas_kwargs)
* **db_from_dataframe**(db_filename, table_name, df, primary_key = *None*, subdir = *None*)
* **fetch_csv_db**(table_name, download_url, csv_filename = *None*, subdir = *None*,
\*\*pandas_kwargs)
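## Example

A minimal sketch of typical usage (the URL and subdirectory below are placeholders, not a real dataset):

```python
from datacache import fetch_file, fetch_csv_dataframe

# Downloaded once, decompressed, and cached in the user cache directory;
# later calls return the cached path immediately.
path = fetch_file(
    "https://example.com/annotations.csv.gz",
    decompress=True,
    subdir="my_project")

# Download (or reuse) a remote CSV and load it into a pandas DataFrame.
df = fetch_csv_dataframe(
    "https://example.com/annotations.csv.gz",
    subdir="my_project")
```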
datacache-1.1.5/setup.py
# Copyright (c) 2014-2018. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import re
from setuptools import setup
readme_filename = "README.md"
current_directory = os.path.dirname(__file__)
readme_path = os.path.join(current_directory, readme_filename)
try:
with open(readme_path, "r") as f:
readme_markdown = f.read()
except:
logging.warn("Failed to load %s", readme_filename)
readme_markdown = ""
try:
import pypandoc
readme_restructured = pypandoc.convert(readme_markdown, to="rst", format="md")
except:
readme_restructured = readme_markdown
logging.warn("Failed to convert %s to reStructuredText", readme_filename)
with open('datacache/__init__.py', 'r') as f:
version = re.search(
r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
f.read(),
re.MULTILINE).group(1)
if __name__ == "__main__":
setup(
name="datacache",
version=version,
description="Helpers for transparently downloading datasets",
author="Alex Rubinsteyn",
author_email="alex.rubinsteyn@mssm.edu",
url="https://github.com/openvax/datacache",
license="http://www.apache.org/licenses/LICENSE-2.0.html",
classifiers=[
"Development Status :: 3 - Alpha",
"Environment :: Console",
"Operating System :: OS Independent",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Bio-Informatics",
],
install_requires=[
"pandas>=0.15.2",
"appdirs>=1.4.0",
"progressbar33>=2.4",
"requests>=2.5.1",
"typechecks>=0.0.2",
"mock",
],
long_description=readme_restructured,
packages=["datacache"],
)
datacache-1.1.5/PKG-INFO
Metadata-Version: 1.1
Name: datacache
Version: 1.1.5
Summary: Helpers for transparently downloading datasets
Home-page: https://github.com/openvax/datacache
Author: Alex Rubinsteyn
Author-email: alex.rubinsteyn@mssm.edu
License: http://www.apache.org/licenses/LICENSE-2.0.html
Description: DataCache
=========
Helpers for transparently downloading datasets
API
---
- **fetch_file**\ (download_url, filename = *None*, decompress =
*False*, subdir = *None*)
- **fetch_and_transform**\ (transformed_filename, transformer, loader,
source_filename, source_url, subdir = *None*)
- **fetch_fasta_dict**\ (download_url, filename = *None*, subdir =
*None*)
- **fetch_fasta_db**\ (table_name, download_url, fasta_filename =
*None*, key_column = *‘id’*, value_column = *‘seq’*, subdir = *None*)
- **fetch_csv_db**\ (table_name, download_url, csv_filename = *None*,
subdir = *None*, \**pandas_kwargs)
Platform: UNKNOWN
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Operating System :: OS Independent
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
datacache-1.1.5/setup.cfg
[egg_info]
tag_build =
tag_date = 0