pax_global_header00006660000000000000000000000064140603345020014507gustar00rootroot0000000000000052 comment=a31dfe1d7fbd6db87d6e3e04b706c16ce1e97032 pyrle-0.0.33/000077500000000000000000000000001406033450200127255ustar00rootroot00000000000000pyrle-0.0.33/.gitignore000066400000000000000000000000601406033450200147110ustar00rootroot00000000000000.hypothesis/ *.html *.bk *build/ *.so *.c *.pyc pyrle-0.0.33/.travis.yml000066400000000000000000000014011406033450200150320ustar00rootroot00000000000000language: python python: - "3.6" install: - pip install cython pytest hypothesis pybigwig - pip install sorted_nearest ncls pyranges - sudo sh -c 'echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list' - gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 - gpg -a --export E084DAB9 | sudo apt-key add - - sudo apt-get update - sudo apt-get install -y r-base - echo 'source("http://bioconductor.org/biocLite.R"); biocLite("S4Vectors"); biocLite("GenomicRanges"); biocLite("rtracklayer")' > install.R - cat install.R - sudo Rscript --vanilla install.R - python setup.py install - python setup.py build_ext --inplace - ls tests script: - python -c "import pyrle; print(pyrle.__version__)" - py.test -v pyrle-0.0.33/CHANGELOG.txt000066400000000000000000000021271406033450200147570ustar00rootroot00000000000000# 0.0.33 (10.06.21) - fix bug in setup.py # 0.0.32 (25.02.21) - fix bug in .stranded # 0.0.31 (unreleased) - helper method make_strands_same_length to rledicts # 0.0.30 (14.01.20) - add unary negation to rle/rledict - add apply, apply_values and apply_runs to rle/rledict - add iter to rledict to for loop through keys/values - subsetting an RLE with a dataframe now returns a dataframe - printing rles respects terminal width - add copy() to rledict - add shift to rle/rledict - add .length property to Rle which gives sum of runs - allow for empty constructors (Rle() and PyRles()) - add .chromosomes property to rledict - add len() to PyRles - rledicts now 
also have a to_csv method - add to_table to rledict # 0.0.29 (21.01.19) - try to fix bug when subsetting with pyranges # 0.0.28 (17.01.19) - improve speed when subsetting with pyranges # 0.0.27 (10.01.19) - better api for subsetting rles with dfs # 0.0.26 (07.01.19) - better api for subsetting with pyranges # 0.0.25 (24.10.19) - allow creating pyrle from values only (thanks Alistair Miles) # 0.0.23 (09.05.19) - remove pyranges dependency pyrle-0.0.33/LICENSE000066400000000000000000000020501406033450200137270ustar00rootroot00000000000000Copyright (c) 2019 Endre Bakken Stovner Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pyrle-0.0.33/README.md000066400000000000000000000010741406033450200142060ustar00rootroot00000000000000# pyrle [![Build Status](https://travis-ci.org/endrebak/pyrle.svg?branch=master)](https://travis-ci.org/endrebak/pyrle) [![hypothesis tested](graphs/hypothesis-tested-brightgreen.svg)](http://hypothesis.readthedocs.io/) [![PyPI version](https://badge.fury.io/py/pyrle.svg)](https://badge.fury.io/py/pyrle) Run length arithmetic in Python using Cython. Inspired by the Rle class in R's S4Vectors. As opposed to S4Vectors, pyrle does not rotate the shortest vector, but rather extends the shorter Rle with zeroes. This is likely the desired behavior in almost all cases. pyrle-0.0.33/graphs/000077500000000000000000000000001406033450200142115ustar00rootroot00000000000000pyrle-0.0.33/graphs/hypothesis-tested-brightgreen.svg000066400000000000000000000017011406033450200227140ustar00rootroot00000000000000hypothesishypothesistestedtested pyrle-0.0.33/pyrle/000077500000000000000000000000001406033450200140605ustar00rootroot00000000000000pyrle-0.0.33/pyrle/__init__.py000066400000000000000000000011041406033450200161650ustar00rootroot00000000000000from pyrle.rle import Rle from pyrle.version import __version__ from pyrle.rledict import RleDict from pyrle.methods import coverage import pandas as pd import numpy as np from collections import defaultdict, OrderedDict PyRles = RleDict def from_csv(f, sep="\t"): """Read PyRle from CSV. 
>>> """ d = {} df = pd.read_csv(f, sep=sep, index_col=None) if "Strand" in df: keys = "Chromosome Strand".split() else: keys = "Chromosome" for c, cdf in df.groupby(keys): d[c] = Rle(cdf.Runs, cdf.Values) return PyRles(d) pyrle-0.0.33/pyrle/methods.py000066400000000000000000000144541406033450200161050ustar00rootroot00000000000000import pandas as pd import numpy as np from pyrle import Rle from pyrle import rledict as rd from pyrle.src.coverage import _coverage from natsort import natsorted from sys import stderr from collections import defaultdict import os class suppress_stdout_stderr(object): ''' A context manager for doing a "deep suppression" of stdout and stderr in Python, i.e. will suppress all print, even if the print originates in a compiled C/Fortran sub-function. This will not suppress raised exceptions, since exceptions are printed to stderr just before a script exits, and after the context manager has exited (at least, I think that is why it lets exceptions through). ''' def __init__(self): # Open a pair of null files self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] # Save the actual stdout (1) and stderr (2) file descriptors. self.save_fds = (os.dup(1), os.dup(2)) def __enter__(self): # Assign the null pointers to stdout and stderr. 
def _merge_rles(rle):
    """Collapse a stranded mapping into one entry per chromosome.

    Parameters
    ----------
    rle : object with an ``rles`` dict keyed by ``(chromosome, strand)``.

    Returns
    -------
    dict
        Maps each chromosome to the sum (``+``) of the entries of all its
        strands. A chromosome with a single strand keeps that entry as is.
    """
    strands_by_chromosome = defaultdict(list)
    for chromosome, strand in rle.rles:
        strands_by_chromosome[chromosome].append(strand)

    merged = {}
    for chromosome, strands in strands_by_chromosome.items():
        # Sum whatever strands are present instead of assuming exactly
        # "+" and "-", which raised KeyError for any other pair.
        total = rle.rles[chromosome, strands[0]]
        for strand in strands[1:]:
            total = total + rle.rles[chromosome, strand]
        merged[chromosome] = total

    return merged


def ensure_both_or_none_stranded(self, other):
    """Make two collections comparable when only one of them is stranded.

    The caller is expected to invoke this only when
    ``self.stranded != other.stranded``. The stranded operand is replaced by
    a shallow copy whose ``rles`` holds one merged entry per chromosome; the
    unstranded operand is returned untouched.

    Neither input object is modified: the previous implementation assigned
    the merged dict to the caller's own object, silently destroying its
    strand information.

    Returns
    -------
    tuple
        ``(self, other)`` in the same order as the arguments.
    """
    import copy

    if self.stranded:
        # self is stranded, so other is the unstranded one
        merged = copy.copy(self)
        merged.rles = _merge_rles(self)
        return merged, other

    merged = copy.copy(other)
    merged.rles = _merge_rles(other)
    return self, merged
def coverage(df, **kwargs):
    """Build an Rle from the intervals in ``df``.

    Every interval contributes ``+value`` at its ``Start`` and ``-value`` at
    its ``End``. The events are stably sorted by position and handed to the
    compiled ``_coverage`` routine, whose ``(runs, values)`` result is wrapped
    in an ``Rle``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``Start`` and ``End`` columns.
    value_col : str, optional (keyword only, via ``**kwargs``)
        Column holding the weight of each interval. When missing or falsy
        every interval counts as 1.

    Returns
    -------
    Rle
    """
    value_col = kwargs.get("value_col", None)

    if value_col:
        values = df[value_col].astype(np.float64).values
    else:
        values = np.ones(len(df))

    columns = "Position Value".split()
    starts_df = pd.DataFrame({"Position": df.Start, "Value": values})[columns]
    ends_df = pd.DataFrame({"Position": df.End, "Value": -1 * values})[columns]

    events = pd.concat([starts_df, ends_df], ignore_index=True)
    # mergesort is stable: events at the same position keep starts before ends
    events = events.sort_values("Position", kind="mergesort")

    positions = events.Position
    # The original widened only int32; widen every other integer dtype too so
    # int16/uint32/... inputs reach the compiled routine as int64 as well.
    # NOTE(review): assumes _coverage wants int64 positions, as the int32
    # special case in the original suggests - confirm against its signature.
    if positions.dtype.kind in "iu" and positions.dtype != np.int64:
        positions = positions.astype(np.int64)

    runs, values = _coverage(positions.values, events.Value.values)

    return Rle(runs, values)
def find_runs(x):
    """Find runs of consecutive equal items in a 1D array.

    Author: Alistair Miles
    https://gist.github.com/alimanfoo/c5977e87111abe8127453b21204c1065

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of values.

    Returns
    -------
    run_values : numpy.ndarray of double
        The value of each run.
    run_lengths : numpy.ndarray of int
        The length of each run.

    Raises
    ------
    ValueError
        If ``x`` is not one-dimensional.

    Notes
    -----
    Items are compared with ``!=``, so consecutive NaNs each form their own
    run.
    """
    x = np.asanyarray(x)
    if x.ndim != 1:
        raise ValueError('only 1D array supported')
    n = x.shape[0]

    if n == 0:
        # Same dtypes as the non-empty branch; previously both arrays were
        # float64, which handed float run lengths to callers.
        return np.array([], dtype=np.double), np.array([], dtype=np.intp)

    # a run starts at index 0 and wherever an item differs from its predecessor
    loc_run_start = np.empty(n, dtype=bool)
    loc_run_start[0] = True
    np.not_equal(x[:-1], x[1:], out=loc_run_start[1:])
    run_starts = np.nonzero(loc_run_start)[0]

    run_values = np.array(x[loc_run_start], dtype=np.double)

    # distance between consecutive starts, with n closing the last run
    run_lengths = np.diff(np.append(run_starts, n))

    return run_values, run_lengths
def __init__(self, runs=None, values=None):
    """Create an Rle.

    Parameters
    ----------
    runs : array-like, optional
        Run lengths. When ``values`` is not given this argument is treated
        as a plain vector of values and is run-length encoded with
        ``find_runs``.
    values : array-like, optional
        Run values, same length as ``runs``.

    Notes
    -----
    With both arguments given, zero-length runs are dropped and neighbouring
    runs with (close to) equal values are merged. With no ``runs`` an empty
    Rle is created, whatever ``values`` is.
    """
    if values is not None and runs is not None:
        assert len(runs) == len(values)

        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy; the builtin gives the same dtype.
        # np.array copies, so the caller's data is never modified.
        runs = np.array(runs, dtype=int)
        values = np.array(values, dtype=np.double)

        keep = runs != 0
        if not keep.all():
            runs = runs[keep]
            values = values[keep]

        # Look for equal neighbours AFTER dropping zero-length runs: removing
        # a run can make two previously separated equal values adjacent.
        if len(values) > 1 and np.isclose(
                values[1:], values[:-1], equal_nan=True).any():
            runs, values = _remove_dupes(runs, values, len(values))

        self.runs = np.copy(runs)
        self.values = np.copy(values)

    elif runs is not None:
        # single argument: a vector of values to encode
        self.values, self.runs = find_runs(runs)

    else:
        self.runs = np.array([], dtype=int)
        self.values = np.array([], dtype=np.double)
length 1.0) >>> r1 * 10 +--------+-----+------+ | Runs | 1 | 2 | |--------+-----+------| | Values | 0.0 | 10.0 | +--------+-----+------+ Rle of length 3 containing 2 elements (avg. length 1.5) """ if isinstance(other, Number): return Rle(self.runs, self.values + other) else: self, other = _make_rles_equal_length(self, other) runs, values = add_rles(self.runs, self.values, other.runs, other.values) return Rle(runs, values) def __array_ufunc__(self, *args, **kwargs): """Apply unary numpy-function to the values. Notes ----- Function must produce a vector of length equal to self. Examples -------- >>> r = Rle([1, 2, 3, 4], [1, 4, 9, 16]) >>> r +--------+-----+-----+-----+------+ | Runs | 1 | 2 | 3 | 4 | |--------+-----+-----+-----+------| | Values | 1.0 | 4.0 | 9.0 | 16.0 | +--------+-----+-----+-----+------+ Rle of length 10 containing 4 elements (avg. length 2.5) >>> np.sqrt(r) +--------+-----+-----+-----+-----+ | Runs | 1 | 2 | 3 | 4 | |--------+-----+-----+-----+-----| | Values | 1.0 | 2.0 | 3.0 | 4.0 | +--------+-----+-----+-----+-----+ Rle of length 10 containing 4 elements (avg. length 2.5) >>> np.log10(np.sqrt(r)) +--------+-----+--------------------+---------------------+--------------------+ | Runs | 1 | 2 | 3 | 4 | |--------+-----+--------------------+---------------------+--------------------| | Values | 0.0 | 0.3010299956639812 | 0.47712125471966244 | 0.6020599913279624 | +--------+-----+--------------------+---------------------+--------------------+ Rle of length 10 containing 4 elements (avg. length 2.5) """ self = self.copy() func, call, gr = args self.values = getattr(func, call)(self.values, **kwargs) return self def __eq__(self, other): """Return where Rle equal. Examples -------- >>> r = Rle([1, 2, 1], [1, 2, 3]) >>> r2 = Rle([1, 1, 1], [1, 2, 1]) >>> r == r2 +--------+-----+-----+ | Runs | 2 | 2 | |--------+-----+-----| | Values | 1.0 | 0.0 | +--------+-----+-----+ Rle of length 4 containing 2 elements (avg. 
def __getitem__(self, val):
    """Look up values by position, slice, interval table or position list.

    Parameters
    ----------
    val : int, slice, pandas.DataFrame, PyRanges or array-like of int

        - int (builtin or NumPy integer): return the value at that position.
        - slice: return the sub-Rle from ``start`` to ``stop``; a missing
          bound defaults to 0 / the total length. The step is ignored.
        - DataFrame with ``Start`` and ``End`` columns: return a DataFrame
          with columns Start, End, ID, Run and Value as produced by
          ``getitems``; Start/End keep the dtype of the input ``Start``.
        - PyRanges: as for a DataFrame, but the result is a PyRanges that
          also carries the Chromosome (and Strand, if any) of the first row.
          An empty PyRanges gives an empty DataFrame.
        - anything else: converted to an integer array, sorted, and the
          values at those positions are returned.
    """
    # `np.long` was an alias of the builtin `int` and was removed from
    # NumPy 1.24; the builtin gives the same dtype.
    if isinstance(val, (int, np.integer)):
        # NumPy integer scalars used to fall through to the last branch and
        # fail in np.sort on a 0-d array.
        values = getlocs(self.runs, self.values, np.array([val], dtype=int))
        return values[0]

    elif isinstance(val, slice):
        # NOTE(review): `or` treats an explicit 0 like a missing bound, so
        # rle[:0] selects up to the end - kept as in the original.
        end = val.stop or np.sum(self.runs)
        start = val.start or 0
        runs, values = getitem(self.runs, self.values, start, end)
        return Rle(runs, values)

    elif isinstance(val, pd.DataFrame):
        intype = val.dtypes["Start"]
        val = val["Start End".split()].astype(int)
        ids, starts, ends, runs, values = getitems(
            self.runs, self.values, val.Start.values, val.End.values)

        df = pd.DataFrame({"Start": starts,
                           "End": ends,
                           "ID": ids,
                           "Run": runs,
                           "Value": values}).astype({"Start": intype, "End": intype})
        return df

    elif "PyRanges" in str(type(val)):
        # hack to avoid isinstance(key, pr.PyRanges) so that we
        # do not need a dep on PyRanges in this library
        import pyranges as pr

        val = val.drop().df
        if val.empty:
            return pd.DataFrame(columns="Chromosome Start End ID Run Value".split())

        chromosome = val.Chromosome.iloc[0]
        intype = val.dtypes["Start"]

        if "Strand" in val:
            strand = val.Strand.iloc[0]
        else:
            strand = None

        val = val["Start End".split()].astype(int)
        ids, starts, ends, runs, values = getitems(
            self.runs, self.values, val.Start.values, val.End.values)

        df = pd.DataFrame({"Chromosome": chromosome,
                           "Start": starts,
                           "End": ends,
                           "ID": ids,
                           "Run": runs,
                           "Value": values}).astype({"Start": intype, "End": intype})

        if strand:
            df.insert(3, "Strand", strand)

        return pr.PyRanges(df)

    else:
        locs = np.sort(np.array(val, dtype=int))
        values = getlocs(self.runs, self.values, locs)
        return values
def __mul__(self, other):
    """Multiply Rle with number or Rle.

    The shortest Rle is extended with zeros.

    Examples
    --------
    >>> r1 = Rle([1, 2], [0, 1])
    >>> r2 = Rle([2, 2], [2, 3])
    >>> r1 * r2
    +--------+-----+-----+-----+-----+
    | Runs   | 1   | 1   | 1   | 1   |
    |--------+-----+-----+-----+-----|
    | Values | 0.0 | 2.0 | 3.0 | 0.0 |
    +--------+-----+-----+-----+-----+
    Rle of length 4 containing 4 elements (avg. length 1.0)

    >>> r1 * 10
    +--------+-----+------+
    | Runs   | 1   | 2    |
    |--------+-----+------|
    | Values | 0.0 | 10.0 |
    +--------+-----+------+
    Rle of length 3 containing 2 elements (avg. length 1.5)
    """
    if isinstance(other, Number):
        # scalar: the runs are unchanged, only the values scale
        return Rle(self.runs, self.values * other)

    # pad the shorter operand with a trailing zero-valued run
    self, other = _make_rles_equal_length(self, other)
    runs, values = mul_rles(self.runs, self.values, other.runs, other.values)
    return Rle(runs, values)
Examples -------- >>> r = Rle([1, 2, 1], [1, 2, 3]) >>> r2 = Rle([1, 1, 1], [1, 2, 1]) >>> r != r2 +--------+-----+-----+ | Runs | 2 | 2 | |--------+-----+-----| | Values | 0.0 | 1.0 | +--------+-----+-----+ Rle of length 4 containing 2 elements (avg. length 2.0) """ self, other = _make_rles_equal_length(self, other, np.nan) r = self - other r.values = np.where(r.values != 0, 1.0, 0.0) return r.defragment() def __neg__(self): """Negate values. Examples -------- >>> r = Rle([1, 2, 3], [5, -20, 1]) >>> r +--------+-----+-------+-----+ | Runs | 1 | 2 | 3 | |--------+-----+-------+-----| | Values | 5.0 | -20.0 | 1.0 | +--------+-----+-------+-----+ Rle of length 6 containing 3 elements (avg. length 2.0) >>> -r +--------+------+------+------+ | Runs | 1 | 2 | 3 | |--------+------+------+------| | Values | -5.0 | 20.0 | -1.0 | +--------+------+------+------+ Rle of length 6 containing 3 elements (avg. length 2.0) """ self = self.copy() self.values = -self.values return self def __radd__(self, other): """Add scalar to Rle values. Examples -------- >>> 5 + Rle([1, 2], [3, 4]) +--------+-----+-----+ | Runs | 1 | 2 | |--------+-----+-----| | Values | 8.0 | 9.0 | +--------+-----+-----+ Rle of length 3 containing 2 elements (avg. length 1.5) """ return Rle(self.runs, self.values + other) def __repr__(self): """Return REPL string representation.""" return str(self) def __rmul__(self, other): """Multiply scalar with Rle-values. Examples -------- >>> 5 * Rle([1, 2], [0.5, 1]) +--------+-----+-----+ | Runs | 1 | 2 | |--------+-----+-----| | Values | 2.5 | 5.0 | +--------+-----+-----+ Rle of length 3 containing 2 elements (avg. length 1.5) """ return Rle(self.runs, self.values * other) def __rsub__(self, other): """Subtract Rle-values from scalar. Examples -------- >>> 5 - Rle([1, 2], [0.5, 1]) +--------+-----+-----+ | Runs | 1 | 2 | |--------+-----+-----| | Values | 4.5 | 4.0 | +--------+-----+-----+ Rle of length 3 containing 2 elements (avg. 
def __str__(self):
    """Return string representation of Rle.

    At most ten runs are shown (the first and last five, separated by
    "..."). If the table is still wider than the terminal, runs are dropped
    symmetrically around the "..." until it fits or only "..." is left.
    """
    terminal_width = shutil.get_terminal_size().columns

    entries = min(len(self.runs), 10)
    half_entries = entries // 2

    def last(seq, count):
        # seq[-count:] would return everything for count == 0
        return seq[len(seq) - count:]

    def render(runs, values):
        df = pd.Series(values).to_frame().T
        df.columns = list(runs)
        df.index = ["Values"]
        df.index.name = "Runs"
        return tabulate(df, tablefmt='psql', showindex=True,
                        headers="keys", disable_numparse=True)

    start_runs = [str(i) for i in self.runs[:half_entries]]
    end_runs = [str(i) for i in last(self.runs, half_entries)]
    start_values = [str(i) for i in self.values[:half_entries]]
    end_values = [str(i) for i in last(self.values, half_entries)]

    if entries < len(self.runs):
        runs = start_runs + ["..."] + end_runs
        values = start_values + ["..."] + end_values
    else:
        runs, values = self.runs, self.values

    outstr = render(runs, values)

    # Stop at zero shown runs: without the bound a terminal narrower than
    # the minimal table made this loop spin forever.
    while half_entries > 0 and len(outstr.split("\n", 1)[0]) > terminal_width:
        half_entries -= 1

        runs = start_runs[:half_entries] + ["..."] + last(end_runs, half_entries)
        values = start_values[:half_entries] + ["..."] + last(end_values, half_entries)

        outstr = render(runs, values)

    length = np.sum(self.runs)
    elements = len(self.runs)
    # an empty Rle has no average run length; avoid dividing 0 by 0
    average = np.round(length / elements, 3) if elements else np.nan
    info = "\nRle of length {} containing {} elements (avg. length {})".format(
        str(length), str(elements), str(average))

    return outstr + info
Examples -------- >>> r1 = Rle([1, 2], [0, 1]) >>> r2 = Rle([2, 2], [2, 3]) >>> r1 - r2 +--------+------+------+------+------+ | Runs | 1 | 1 | 1 | 1 | |--------+------+------+------+------| | Values | -2.0 | -1.0 | -2.0 | -3.0 | +--------+------+------+------+------+ Rle of length 4 containing 4 elements (avg. length 1.0) >>> r1 - 10 +--------+-------+------+ | Runs | 1 | 2 | |--------+-------+------| | Values | -10.0 | -9.0 | +--------+-------+------+ Rle of length 3 containing 2 elements (avg. length 1.5) """ if isinstance(other, Number): return Rle(self.runs, self.values - other) else: self, other = _make_rles_equal_length(self, other) runs, values = sub_rles(self.runs, self.values, other.runs, other.values) return Rle(runs, values) def __truediv__(self, other): """Divide Rle with number or Rle. The shortest Rle is extended with zeros. Examples -------- >>> r1 = Rle([1, 2], [0, 1]) >>> r2 = Rle([2, 2], [2, 3]) >>> r1 / r2 +--------+-----+-----+--------------------+-----+ | Runs | 1 | 1 | 1 | 1 | |--------+-----+-----+--------------------+-----| | Values | 0.0 | 0.5 | 0.3333333333333333 | 0.0 | +--------+-----+-----+--------------------+-----+ Rle of length 4 containing 4 elements (avg. length 1.0) >>> r1 / 10 +--------+-----+-----+ | Runs | 1 | 2 | |--------+-----+-----| | Values | 0.0 | 0.1 | +--------+-----+-----+ Rle of length 3 containing 2 elements (avg. length 1.5) """ if isinstance(other, Number): return Rle(self.runs, self.values / other) else: self, other = _make_rles_equal_length(self, other) if (other.values == 0).any() or np.sum(other.runs) < np.sum(self.runs): runs, values = div_rles_zeroes(self.runs, self.values, other.runs, other.values) else: runs, values = div_rles_nonzeroes(self.runs, self.values, other.runs, other.values) return Rle(runs, values) def apply_values(self, f, defragment=True): """Apply function to the values. Parameters ---------- f : function Must return vector of double with same length as Rle. 
defragment : bool, default True Whether to merge consecutive runs of same value after application. See Also -------- pyrle.__array_ufunc__ : apply numpy functions to pyrle. Examples -------- >>> r = Rle([1, 3, 5], [100, 200, -300]) >>> r.apply_values(lambda v: np.sqrt(v)) +--------+------+--------------------+-----+ | Runs | 1 | 3 | 5 | |--------+------+--------------------+-----| | Values | 10.0 | 14.142135620117188 | nan | +--------+------+--------------------+-----+ Rle of length 9 containing 3 elements (avg. length 3.0) >>> def gt0_to_1(v): ... v[v > 0] = 1 ... return v >>> r.apply_values(gt0_to_1, defragment=False) +--------+-----+-----+--------+ | Runs | 1 | 3 | 5 | |--------+-----+-----+--------| | Values | 1.0 | 1.0 | -300.0 | +--------+-----+-----+--------+ Rle of length 9 containing 3 elements (avg. length 3.0) >>> r.apply_values(gt0_to_1, defragment=True) +--------+-----+--------+ | Runs | 4 | 5 | |--------+-----+--------| | Values | 1.0 | -300.0 | +--------+-----+--------+ Rle of length 9 containing 2 elements (avg. length 4.5) """ self = self.copy() self.values = f(self.values) if defragment: self = self.defragment() return self def apply_runs(self, f, defragment=True): """Apply function to the runs. Parameters ---------- f : function Must return vector of ints with same length as Rle. defragment : bool, default True Whether to merge consecutive runs of same value after application. Examples -------- >>> r = Rle([1, 3, 5], [100, 200, -300]) >>> r.apply_runs(lambda v: (v ** 2).astype(int)) +--------+-------+-------+--------+ | Runs | 1 | 9 | 25 | |--------+-------+-------+--------| | Values | 100.0 | 200.0 | -300.0 | +--------+-------+-------+--------+ Rle of length 35 containing 3 elements (avg. length 11.667) """ self = self.copy() self.runs = f(self.runs) if defragment: self = self.defragment() return self def apply(self, f, defragment=True): """Apply function to the Rle. Parameters ---------- f : function Must return Rle. 
defragment : bool, default True Whether to merge consecutive runs of same value after application. Examples -------- >>> r = Rle([1, 3, 5], [100, 200, -300]) >>> def shuffle(rle): ... np.random.seed(0) ... np.random.shuffle(rle.values) ... np.random.shuffle(rle.runs) ... return rle >>> r.apply(shuffle) +--------+--------+-------+-------+ | Runs | 5 | 1 | 3 | |--------+--------+-------+-------| | Values | -300.0 | 200.0 | 100.0 | +--------+--------+-------+-------+ Rle of length 9 containing 3 elements (avg. length 3.0) """ self = self.copy() self = f(self) if defragment: self = self.defragment() return self def copy(self): """Return copy of Rle.""" return Rle(np.copy(self.runs), np.copy(self.values)) def defragment(self): """Merge consecutive values. Examples -------- >>> r = Rle([1, 2, 3], [1, 0, 1]) >>> r +--------+-----+-----+-----+ | Runs | 1 | 2 | 3 | |--------+-----+-----+-----| | Values | 1.0 | 0.0 | 1.0 | +--------+-----+-----+-----+ Rle of length 6 containing 3 elements (avg. length 2.0) >>> r.values[1] = 1 >>> r.values[2] = 2 >>> r +--------+-----+-----+-----+ | Runs | 1 | 2 | 3 | |--------+-----+-----+-----| | Values | 1.0 | 1.0 | 2.0 | +--------+-----+-----+-----+ Rle of length 6 containing 3 elements (avg. length 2.0) >>> r.defragment() +--------+-----+-----+ | Runs | 3 | 3 | |--------+-----+-----| | Values | 1.0 | 2.0 | +--------+-----+-----+ Rle of length 6 containing 2 elements (avg. length 3.0) """ runs, values = _remove_dupes(self.runs, self.values, len(self)) values[values == -0] = 0 return Rle(runs, values) @property def length(self): """Return sum of runs vector. See Also -------- pyrle.Rle.__len__ : return number of runs. Examples -------- >>> Rle([5], [0]).length 5 >>> gauss = Rle(np.arange(1, 101), [0, 1] * 50) >>> gauss +--------+-----+-----+-----+-----+-----+-------+------+------+------+------+-------+ | Runs | 1 | 2 | 3 | 4 | 5 | ... 
def mean(self):
    """Return the mean of the values, weighted by their run lengths.

    Every value counts once for each position it covers, so the result is
    the mean of the fully expanded vector.

    Returns
    -------
    float
        ``sum(runs * values) / sum(runs)``.

    Examples
    --------
    >>> Rle([1, 2, 1], [1, 2, 3]).mean()
    2.0

    >>> # ((1 * 1) + (2 * 2) + (1 * 3)) / (1 + 2 + 1)
    """

    length = self.length
    # Weight each value by the number of positions it spans. Summing the bare
    # values while dividing by the total length (as was done before) treats
    # every run as if it were one position long.
    weighted_sum = np.sum(np.asarray(self.values) * np.asarray(self.runs))
    return weighted_sum / length
preserve_length : bool, default True Fill end when shifting left, or truncate end when shifting right. fill : int, default 0 Fill for values shifted out of bounds. Examples -------- >>> r = Rle([3, 2, 1], [1, -1, 2]) >>> r +--------+-----+------+-----+ | Runs | 3 | 2 | 1 | |--------+-----+------+-----| | Values | 1.0 | -1.0 | 2.0 | +--------+-----+------+-----+ Rle of length 6 containing 3 elements (avg. length 2.0) >>> r.shift(2, preserve_length=False, fill=np.nan) +--------+-----+-----+------+-----+ | Runs | 2 | 3 | 2 | 1 | |--------+-----+-----+------+-----| | Values | nan | 1.0 | -1.0 | 2.0 | +--------+-----+-----+------+-----+ Rle of length 8 containing 4 elements (avg. length 2.0) >>> r.shift(2) +--------+-----+-----+------+ | Runs | 2 | 3 | 1 | |--------+-----+-----+------| | Values | 0.0 | 1.0 | -1.0 | +--------+-----+-----+------+ Rle of length 6 containing 3 elements (avg. length 2.0) >>> r.shift(-2, fill=np.nan) +--------+-----+------+-----+-----+ | Runs | 1 | 2 | 1 | 2 | |--------+-----+------+-----+-----| | Values | 1.0 | -1.0 | 2.0 | nan | +--------+-----+------+-----+-----+ Rle of length 6 containing 4 elements (avg. length 1.5) >>> r.shift(-4, preserve_length=False) +--------+------+-----+ | Runs | 1 | 1 | |--------+------+-----| | Values | -1.0 | 2.0 | +--------+------+-----+ Rle of length 2 containing 2 elements (avg. 
def std(self):
    """Return the sample standard deviation of the expanded values.

    Each value is weighted by its run length, and the sum of squared
    deviations is divided by ``length - 1``.

    Returns
    -------
    float
        Standard deviation of the vector the Rle encodes.

    Examples
    --------
    >>> Rle([1, 2, 1], [1, 2, 3]).std()
    0.816496580927726
    >>> # same as np.std([1, 2, 2, 3], ddof=1)
    """

    runs = np.asarray(self.runs)
    values = np.asarray(self.values)
    length = self.length

    # The run-weighted mean is computed here so the result does not depend on
    # how the mean method weights the values.
    weighted_mean = np.sum(values * runs) / length

    # Square each deviation before summing (squaring the summed deviations
    # gives a meaningless number) and count it once per covered position.
    squared_deviations = np.sum(runs * (values - weighted_mean) ** 2)
    return np.sqrt(squared_deviations / (length - 1))
def get_multithreaded_funcs(function, nb_cpu):
    """Return ``(function, get)`` for serial or ray-based execution.

    With ``nb_cpu > 1`` the function is wrapped with ``ray.remote`` and
    ``get`` is ``ray.get``. Otherwise the function is given a ``remote``
    attribute pointing at itself and ``get`` returns its argument unchanged,
    so callers can use ``function.remote(...)`` and ``get(...)`` either way.
    """

    if nb_cpu > 1:
        # ray is imported lazily so that it is only needed for parallel runs.
        import ray

        return ray.remote(function), ray.get

    def get(x):
        return x

    function.remote = function
    return function, get
| 25 | 2952580 | 25 | 1156833 | 25 | |--------+-----------+------+---------+------+-----------+-------+------+-----------+------+-----------+------| | Values | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | +--------+-----------+------+---------+------+-----------+-------+------+-----------+------+-----------+------+ Rle of length 247134924 containing 894 elements (avg. length 276437.275) ... chrY - +--------+-----------+------+----------+------+----------+-------+------+----------+------+----------+------+ | Runs | 7046809 | 25 | 358542 | 25 | 296582 | ... | 25 | 143271 | 25 | 156610 | 25 | |--------+-----------+------+----------+------+----------+-------+------+----------+------+----------+------| | Values | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | +--------+-----------+------+----------+------+----------+-------+------+----------+------+----------+------+ Rle of length 22210662 containing 32 elements (avg. length 694083.188) RleDict object with 48 chromosomes/strand pairs. >>> cs - (bg * 5) chr1 + +--------+-----------+------+----------+------+---------+-------+------+----------+------+-----------+------+ | Runs | 1041102 | 25 | 500471 | 25 | 57498 | ... | 25 | 363693 | 25 | 1156833 | 25 | |--------+-----------+------+----------+------+---------+-------+------+----------+------+-----------+------| | Values | 0.0 | -5.0 | 0.0 | 1.0 | 0.0 | ... | -5.0 | 0.0 | 1.0 | 0.0 | 1.0 | +--------+-----------+------+----------+------+---------+-------+------+----------+------+-----------+------+ Rle of length 247134924 containing 1618 elements (avg. length 152740.991) ... chrY - +--------+-----------+------+----------+------+----------+-------+------+----------+------+------------+------+ | Runs | 7046809 | 25 | 358542 | 25 | 296582 | ... | 25 | 156610 | 25 | 35191552 | 25 | |--------+-----------+------+----------+------+----------+-------+------+----------+------+------------+------| | Values | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... 
def __init__(self, ranges=None, stranded=False, value_col=None, nb_cpu=1):
    """Create an RleDict from a dict of Rles, a DataFrame or a PyRanges.

    Parameters
    ----------
    ranges : dict of Rles, DataFrame or PyRanges, default None
        A dict is stored as is. ``None`` gives an empty RleDict. Anything
        else is grouped by chromosome (and strand) and passed to the
        coverage function, one group at a time.

    stranded : bool, default False
        Whether to group on both Chromosome and Strand. Ignored for dicts.

    value_col : str, default None
        Passed on to the coverage function as ``value_col``.

    nb_cpu : int, default 1
        With more than one CPU, ray is initialised and the per-group
        coverage computations are dispatched through it.
    """

    # Construct RleDict from dict of rles
    if isinstance(ranges, dict):
        self.rles = ranges
        # An empty dict has no first key to inspect; indexing into its keys
        # used to raise IndexError.
        first_key = next(iter(ranges), None)
        self.__dict__["stranded"] = (first_key is not None
                                     and len(first_key) == 2)

    elif ranges is None:
        self.rles = {}

    # Construct RleDict from ranges
    else:
        grpby_keys = "Chromosome Strand".split() if stranded else "Chromosome"

        # Use the .df attribute when the object has one, otherwise use the
        # object itself. Only a missing attribute triggers the fallback;
        # other errors are no longer swallowed.
        df = getattr(ranges, "df", ranges)

        grpby = list(natsorted(df.groupby(grpby_keys)))

        if nb_cpu > 1:
            import ray
            with m.suppress_stdout_stderr():
                ray.init(num_cpus=nb_cpu)

        try:
            m_coverage, get = get_multithreaded_funcs(m.coverage, nb_cpu)

            kwargs = {"value_col": value_col}
            # Group keys are (chromosome, strand) tuples when grouping on
            # two columns and plain chromosome names otherwise.
            pending = {k: m_coverage.remote(cdf, **kwargs) for k, cdf in grpby}

            _rles = dict(zip(pending.keys(), get(list(pending.values()))))
        finally:
            # Shut ray down even if computing the coverage failed.
            if nb_cpu > 1:
                ray.shutdown()

        self.rles = _rles

        self.__dict__["stranded"] = stranded
def __getitem__(self, key):
    """Look up Rles by chromosome, strand, (chromosome, strand) or PyRanges.

    Parameters
    ----------
    key : str, tuple or PyRanges
        * chromosome name on a stranded RleDict: RleDict holding both
          strands of that chromosome (missing strands become empty Rles).
        * ``"+"`` or ``"-"`` on a stranded RleDict: RleDict with that strand
          of every chromosome; a bare Rle if exactly one chromosome matches;
          an empty Rle if none does.
        * chromosome name on an unstranded RleDict: the Rle, or an empty Rle.
        * PyRanges: PyRanges with the runs and values overlapping each
          interval (an empty DataFrame if the PyRanges is empty).
        * any other key of length two: the Rle stored under it, or
          ``Rle([1], [0])``.

    Raises
    ------
    Exception
        If ``key`` is an int.
    IndexError
        If ``key`` matches none of the forms above.
    """

    if isinstance(key, int):
        raise Exception("Integer indexing not allowed!")

    key_is_string = isinstance(key, str)

    if key_is_string and self.stranded and key not in ["+", "-"]:
        plus = self.rles.get((key, "+"), Rle())
        rev = self.rles.get((key, "-"), Rle())

        return RleDict({(key, "+"): plus, (key, "-"): rev})

    # only return particular strand, but from all chromos
    elif key_is_string and self.stranded and key in ["+", "-"]:
        to_return = {(c, s): rle for (c, s), rle in self.items() if s == key}

        if len(to_return) > 1:
            return RleDict(to_return)
        if not to_return:
            # No chromosome has this strand. Return an empty Rle, like the
            # other lookups, instead of failing on an empty list.
            return Rle()
        # return just the rle
        return next(iter(to_return.values()))

    elif key_is_string:

        return self.rles.get(key, Rle())

    elif "PyRanges" in str(type(key)):
        # hack to avoid isinstance(key, pr.PyRanges) so that we
        # do not need a dep on PyRanges in this library
        import pyranges as pr
        import pandas as pd

        if not len(key):
            return pd.DataFrame(columns="Chromosome Start End ID Run Value".split())

        # getitems takes C-long buffers; "l" names that dtype on every
        # NumPy release, whereas np.long does not exist in all of them.
        c_long = np.dtype("l")

        result = {}
        for k, v in key.dfs.items():

            if k not in self.rles:
                continue

            v = v["Start End".split()].astype(c_long)
            ids, starts, ends, runs, values = getitems(self.rles[k].runs, self.rles[k].values,
                                                       v.Start.values, v.End.values)

            df = pd.DataFrame({"Start": starts,
                               "End": ends,
                               "ID": ids,
                               "Run": runs,
                               "Value": values})

            if isinstance(k, tuple):
                df.insert(0, "Chromosome", k[0])
                df.insert(df.shape[1], "Strand", k[1])
            else:
                df.insert(0, "Chromosome", k)

            result[k] = df

        return pr.PyRanges(result)

    elif len(key) == 2:

        return self.rles.get(key, Rle([1], [0]))

    else:
        raise IndexError(
            "Must use chromosome, strand or (chromosome, strand) to get items from RleDict."
        )
def add_pseudocounts(self, pseudo=0.01):
    """Replace every zero value with ``pseudo``, modifying the Rles in place.

    Parameters
    ----------
    pseudo : float, default 0.01
        Value written wherever an Rle currently holds 0.

    Returns
    -------
    None
    """

    for _, rle in self.items():
        # Boolean-mask assignment works on NumPy arrays; the previous
        # ``.loc`` accessor exists only on pandas objects.
        rle.values[rle.values == 0] = pseudo
def apply_runs(self, f, defragment=True):
    """Apply a function to the runs of RleDict.

    Parameters
    ----------
    f : callable
        Takes the runs and returns an array of type int64 with same length as the input.

    defragment : bool, default True
        Merge consecutive runs of equal values afterwards.

    Returns
    -------
    RleDict
        New RleDict; the Rles of this one are copied before ``f`` is applied.

    See Also
    --------
    pyrle.RleDict.apply_values : apply function to values of RleDict

    Examples
    --------

    >>> r = RleDict({("chr1", "+"): Rle([1, 4], [1, 2]),
    ...              ("chr1", "-"): Rle([2, 1], [3, 2.0])})
    >>> def even_times_hundred(runs):
    ...     runs[runs % 2 == 0] *= 100
    ...     return runs
    >>> r.apply_runs(even_times_hundred)
    chr1 +
    --
    +--------+-----+-------+
    | Runs   | 1   | 400   |
    |--------+-----+-------|
    | Values | 1.0 | 2.0   |
    +--------+-----+-------+
    Rle of length 401 containing 2 elements (avg. length 200.5)
    <BLANKLINE>
    chr1 -
    --
    +--------+-------+-----+
    | Runs   | 200   | 1   |
    |--------+-------+-----|
    | Values | 3.0   | 2.0 |
    +--------+-------+-----+
    Rle of length 201 containing 2 elements (avg. length 100.5)
    RleDict object with 2 chromosomes/strand pairs.
    """

    new_rles = {}
    for k, r in self:
        new_rle = r.copy()
        # np.int was removed from NumPy; int64 is the dtype promised above.
        new_rle.runs = np.asarray(f(new_rle.runs)).astype(np.int64)
        # Honour the flag: merging used to happen unconditionally.
        if defragment:
            new_rle = new_rle.defragment()
        new_rles[k] = new_rle

    return RleDict(new_rles)
@property
def chromosomes(self):
    """Return the distinct chromosome names in natural sort order.

    For tuple keys the first element is taken as the chromosome; any other
    key is used as the chromosome name itself.
    """

    names = {k[0] if isinstance(k, tuple) else k for k in self.rles}
    return natsorted(names)
@property
def stranded(self):
    """Whether the keys are tuples, judged by the first key of ``keys()``.

    An RleDict without any keys is reported as stranded.
    """

    if not len(self):
        return True

    first_key = self.keys()[0]
    return isinstance(first_key, tuple)
def to_table(self):
    """Return all Rles stacked into one DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Chromosome``, (``Strand`` if stranded,) ``Runs`` and
        ``Values``, with one row per run. The per-Rle frames are
        concatenated as they are, so index labels restart for every key.
        An RleDict without Rles gives an empty frame with the same columns.
    """

    import pandas as pd

    is_stranded = self.stranded

    dfs = []
    for k, r in self.rles.items():
        df = pd.DataFrame(data={"Runs": r.runs, "Values": r.values})
        if is_stranded:
            df.insert(0, "Chromosome", k[0])
            df.insert(1, "Strand", k[1])
        else:
            df.insert(0, "Chromosome", k)

        dfs.append(df)

    if not dfs:
        # pd.concat refuses an empty list, so build the empty table by hand.
        id_columns = ["Chromosome", "Strand"] if is_stranded else ["Chromosome"]
        return pd.DataFrame(columns=id_columns + ["Runs", "Values"])

    return pd.concat(dfs)
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def _remove_dupes(long [::1] runs, double [::1] values, int length):
    """Merge neighbouring runs whose values are equal.

    Two neighbouring values count as equal when both are nan, both are the
    same infinity, or they differ by less than 1e-5. The merged run keeps
    the value of its first member and the summed run length.

    Parameters
    ----------
    runs : long[::1]
        Run lengths.
    values : double[::1]
        Run values; indexed in step with ``runs``.
    length : int
        Not used by the function; kept so existing callers keep working.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Merged run lengths (C long) and values (float64). Empty input gives
        two empty arrays.
    """

    cdef Py_ssize_t n = values.shape[0]
    cdef Py_ssize_t i
    cdef Py_ssize_t counter = 0
    # Runs are accumulated in a long and values kept as double: a C int can
    # overflow when long runs are summed, and a C float silently rounds the
    # values to single precision.
    cdef long old_run
    cdef long run
    cdef double old_val
    cdef double value
    cdef long[::1] nrs
    cdef double[::1] nvs

    # "l" is the platform C long, which is what the long[::1] views require.
    nrs_arr = np.zeros(n, dtype=np.dtype("l"))
    nvs_arr = np.zeros(n, dtype=np.float64)

    # Bounds checks are switched off, so an empty buffer must not be indexed.
    if n == 0:
        return nrs_arr, nvs_arr

    nrs = nrs_arr
    nvs = nvs_arr

    old_run = runs[0]
    old_val = values[0]

    for i in range(1, n):
        run = runs[i]
        value = values[i]

        if isnan(value) and isnan(old_val):
            old_run += run
        elif (value == INFINITY and old_val == INFINITY) or (value == -INFINITY and old_val == -INFINITY):
            # inf - inf is nan, so equal infinities need their own test.
            old_run += run
        elif abs(value - old_val) < 1e-5:
            old_run += run
        else:
            # The value changed: store the finished run and start a new one.
            nrs[counter] = old_run
            nvs[counter] = old_val
            counter += 1
            old_run = run
            old_val = value

    # Whatever run is still open at the end is always the last output entry.
    nrs[counter] = old_run
    nvs[counter] = old_val
    counter += 1

    return nrs_arr[:counter], nvs_arr[:counter]
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
def getlocs(const long [::1] runs, const double [::1] values, const long [::1] locs):
    """Return the value found at each position in ``locs``.

    The runs are walked once from the start while ``locs`` is consumed front
    to back, so ``locs`` has to be sorted in ascending order for every
    position to be matched with its run.

    Parameters
    ----------
    runs : const long[::1]
        Run lengths.
    values : const double[::1]
        Run values; indexed in step with ``runs``.
    locs : const long[::1]
        Zero-based positions to look up.

    Returns
    -------
    numpy.ndarray
        One float64 per entry of ``locs``. Entries that are not reached
        before the runs end keep their initial 0.0.
    """

    cdef:
        Py_ssize_t i = 0
        Py_ssize_t j = 0
        Py_ssize_t n_runs = runs.shape[0]
        Py_ssize_t loc_len = locs.shape[0]
        # long rather than int, so the running total has the same width as
        # the run lengths it accumulates.
        long cumsum = 0
        double[::1] vs

    values_arr = np.zeros(loc_len)

    # Bounds checks are switched off: with no locations, locs[0] below would
    # read past the end of the buffer.
    if loc_len == 0:
        return values_arr

    vs = values_arr

    for i in range(n_runs):
        cumsum += runs[i]

        # Every pending location below the running total lies in run i.
        while locs[j] < cumsum:
            vs[j] = values[i]
            j += 1
            if j == loc_len:
                return values_arr

    return values_arr
@cython.wraparound(False) @cython.initializedcheck(False) cpdef _getitem(const long [::1] runs, const double [::1] values, const long [::1] run_cumsum, int start, int end): cdef: int i = 0 int arr_length = 100 int nfound = 0 # int foundsum = 0 int rsum = 0 int r = 0 int l = 0 long search_start = np.searchsorted(run_cumsum, start) int started = 0 cdef double[::1] vs cdef long[::1] rs values_arr = np.ones(arr_length) * -1 vs = values_arr runs_arr = np.ones(arr_length, dtype=np.long) * -1 rs = runs_arr for i in range(search_start, len(runs)): # print("i", i) r = runs[i] # print("r", r) rsum = run_cumsum[i] # print("rsum", rsum) if started == 0: # print("not started") if rsum > start: # print("rsum > start") if not rsum > end: l = rsum - start # this is always the first entry, no need to check size # print("l1", l) rs[nfound] = l # foundsum += l # print("v1", values[i]) vs[nfound] = values[i] nfound += 1 else: return [end - start], [values[i]] started = 1 else: if nfound >= arr_length: arr_length = arr_length * 2 values_arr = np.resize(values_arr, arr_length) runs_arr = np.resize(runs_arr, arr_length) rs = runs_arr vs = values_arr if rsum < end: l = runs[i] # print("l2", l) # print("v2", values[i]) rs[nfound] = l vs[nfound] = values[i] nfound += 1 else: l = runs[i] - (rsum - end) # print("l3", l) if l == 0: break rs[nfound] = l vs[nfound] = values[i] # print("v3", values[i]) nfound += 1 break return runs_arr[:nfound], values_arr[:nfound] @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef getitems(const long [::1] runs, const double [::1] values, const long [::1] starts, const long [::1] ends): cdef: long i = 0 long counter = 0 long start = 0 long end = 0 long old_start = -1 long old_end = -1 cdef long[::1] run_cumsum int x = 0 int arr_length int nfound = 0 # int foundsum = 0 int rsum = 0 int r = 0 int l = 0 int started = 0 cdef double[::1] vs cdef long[::1] rs cdef long[::1] ids cdef long[::1] new_starts cdef long[::1] new_ends 
cdef long[::1] search_starts run_cumsum_arr = np.cumsum(runs) run_cumsum = run_cumsum_arr search_starts_arr = np.searchsorted(run_cumsum, starts) search_starts = search_starts_arr arr_length = len(run_cumsum_arr) ids_arr = np.ones(arr_length, dtype=np.long) * -1 starts_arr = np.ones(arr_length, dtype=np.long) * -1 ends_arr = np.ones(arr_length, dtype=np.long) * -1 new_starts = starts_arr new_ends = ends_arr ids = ids_arr values_arr = np.ones(arr_length) * -1 vs = values_arr runs_arr = np.ones(arr_length, dtype=np.long) * -1 rs = runs_arr for x in range(len(search_starts)): search_start = search_starts_arr[x] start = starts[x] end = ends[x] started = 0 # print("-------") # print("x", x) for i in range(search_start, len(runs)): r = runs[i] rsum = run_cumsum[i] if nfound >= arr_length: arr_length = arr_length * 2 values_arr = np.resize(values_arr, arr_length) starts_arr = np.resize(starts_arr, arr_length) ends_arr = np.resize(ends_arr, arr_length) runs_arr = np.resize(runs_arr, arr_length) ids_arr = np.resize(ids_arr, arr_length) ids = ids_arr rs = runs_arr vs = values_arr new_starts = starts_arr new_ends = ends_arr if started == 0: if rsum > start: if not rsum > end: l = rsum - start rs[nfound] = l new_starts[nfound] = start new_ends[nfound] = end vs[nfound] = values[i] ids[nfound] = x nfound += 1 else: new_starts[nfound] = start new_ends[nfound] = end rs[nfound] = end - start vs[nfound] = values[i] ids[nfound] = x nfound += 1 # print("first_break", started) break started = 1 else: if rsum < end: l = runs[i] rs[nfound] = l new_starts[nfound] = start new_ends[nfound] = end vs[nfound] = values[i] ids[nfound] = x nfound += 1 else: l = runs[i] - (rsum - end) if l == 0: # print("second_break") break rs[nfound] = l new_starts[nfound] = start new_ends[nfound] = end ids[nfound] = x vs[nfound] = values[i] nfound += 1 # print("third_break") break return ids_arr[:nfound], starts_arr[:nfound], ends_arr[:nfound], runs_arr[:nfound], values_arr[:nfound] 
pyrle-0.0.33/pyrle/src/rle.pyx000066400000000000000000000207431406033450200162010ustar00rootroot00000000000000# cython: infer_types=True import numpy as np cimport cython from numpy import nan from libc.math cimport copysign, INFINITY, NAN cdef float inf = INFINITY # s/boundscheck(True/boundscheck(False # s/boundscheck(False/boundscheck(True @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef add_rles(const long [::1] runs1, const double [::1] values1, const long [::1] runs2, const double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] + values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr # s/boundscheck(True/boundscheck(False # s/boundscheck(False/boundscheck(True @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef sub_rles(const long [::1] runs1, const double [::1] values1, const long [::1] runs2, const double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef 
long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] - values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr @cython.cdivision(True) @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef div_rles_nonzeroes(const long [::1] runs1, const double [::1] values1, const long [::1] runs2, const double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef double sign = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] / values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array 
was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr @cython.cdivision(True) @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef div_rles_zeroes(const long [::1] runs1, const double [::1] values1, const long [::1] runs2, const double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef double sign = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 if values2[x2] != 0: nv = values1[x1] / values2[x2] elif values1[x1] != 0: sign = copysign(1, values1[x1]) * copysign(1, values2[x2]) nv = inf * sign else: nv = NAN if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef mul_rles(const long [::1] runs1, const double [::1] values1, const long [::1] runs2, const double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = 
np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] * values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 return nrs_arr, nvs_arr pyrle-0.0.33/pyrle/version.py000066400000000000000000000000271406033450200161160ustar00rootroot00000000000000__version__ = "0.0.33" pyrle-0.0.33/rle_arithmetic_template.j2000066400000000000000000000125161406033450200200550ustar00rootroot00000000000000# cython: infer_types=True import numpy as np cimport cython from numpy import nan {# cdef float NAN = float("NaN") #} from libc.math cimport copysign, isfinite, INFINITY, NAN cdef float inf = INFINITY {# cdef float = INFINITY #} {% for config_add_sub in configs_add_sub %} # s/boundscheck(True/boundscheck(False # s/boundscheck(False/boundscheck(True @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef {{config_add_sub.operation}}_rles(long [::1] runs1, {{config_add_sub.value_dtype}} [::1] values1, long [::1] runs2, {{config_add_sub.value_dtype}} [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef {{config_add_sub.value_dtype}} nv = 0 cdef {{config_add_sub.value_dtype}} diff = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef {{config_add_sub.run_dtype}} r1 = runs1[x1] cdef {{config_add_sub.run_dtype}} r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.{{config_add_sub.run_dtype}}) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.{{config_add_sub.value_dtype}}) cdef {{config_add_sub.run_dtype}}[::1] nrs cdef {{config_add_sub.value_dtype}}[::1] 
nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] {{config_add_sub.op}} values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr {% endfor %} {% for config_div in configs_div %} @cython.cdivision(True) @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) cpdef div_rles_{{config_div.name}}(long [::1] runs1, double [::1] values1, long [::1] runs2, double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef double sign = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 {{config_div.nv}} if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 # Must use resize because initial guess for array was likely way too large nrs_arr.resize(xn, refcheck=False) nvs_arr.resize(xn, refcheck=False) return nrs_arr, nvs_arr {% endfor %} @cython.boundscheck(False) 
@cython.wraparound(False) @cython.initializedcheck(False) cpdef mul_rles(long [::1] runs1, double [::1] values1, long [::1] runs2, double [::1] values2): cdef int x1 = 0 cdef int x2 = 0 cdef int xn = 0 cdef int nr = 0 cdef double nv = 0 cdef double diff = 0 cdef int l1 = len(runs1) cdef int l2 = len(runs2) cdef long r1 = runs1[x1] cdef long r2 = runs2[x2] nrs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.long) nvs_arr = np.zeros(len(runs1) + len(runs2), dtype=np.double) cdef long[::1] nrs cdef double[::1] nvs nrs = nrs_arr nvs = nvs_arr while(x1 < l1 and x2 < l2): diff = r1 - r2 nv = values1[x1] * values2[x2] if diff < 0: nr = r1 r2 = r2 - r1 x1 += 1 if x1 < l1: r1 = runs1[x1] elif diff > 0: nr = r2 r1 = r1 - r2 x2 += 1 if x2 < l2: r2 = runs2[x2] else: nr = r2 x1 += 1 x2 += 1 if x1 < l1: r1 = runs1[x1] if x2 < l2: r2 = runs2[x2] # if the new value is the same as the old, merge the runs if xn > 0 and nv == nvs[xn - 1]: nrs[xn - 1] += nr else: nrs[xn] = nr nvs[xn] = nv xn += 1 return nrs_arr, nvs_arr pyrle-0.0.33/setup.py000066400000000000000000000061541406033450200144450ustar00rootroot00000000000000 # setup.py from distutils.core import setup # from distutils.extension import Extension from setuptools import find_packages, Extension, Command from Cython.Build import cythonize __version__ = open("pyrle/version.py").readline().split(" = ")[1].replace('"', '').strip() # example_module = Extension('convolve', sources=['convolve.c']) macros = [("CYTHON_TRACE", "1")] macros = None if macros: from Cython.Compiler.Options import get_directive_defaults directive_defaults = get_directive_defaults() directive_defaults['linetrace'] = True directive_defaults['binding'] = True e1 = Extension("pyrle.src.rle", ["pyrle/src/rle.pyx"], define_macros = macros) e2 = Extension("pyrle.src.coverage", ["pyrle/src/coverage.pyx"], define_macros = macros) e3 = Extension("pyrle.src.getitem", ["pyrle/src/getitem.pyx"], define_macros = macros) extensions = [e1, e2, e3] install_requires = 
["cython", "pandas", "tabulate", "numpy", "natsort"] setup(name='pyrle', version=__version__, packages=find_packages(), ext_modules=cythonize(extensions, language_level=3), install_requires=install_requires, author="Endre Bakken Stovner", author_email="endrebak85@gmail.com", url="https://github.com/endrebak/pyrle", license="MIT", classifiers=[ "Programming Language :: Python :: 3", "Development Status :: 4 - Beta", "Environment :: Other Environment", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Topic :: Scientific/Engineering" ], package_data={'': ['*.pyx', '*.pxd', '*.h', '*.c']}, include_dirs=["."]) # import os # import sys # from setuptools import setup, find_packages # # from Cython.Build import cythonize # from pyranges.version import __version__ # install_requires = ["pandas", "tabulate"] # # try: # # os.getenv("TRAVIS") # # install_requires.append("coveralls") # # except: # # pass # # if sys.version_info[0] == 2: # # install_requires.append("functools32") # setup( # name="pyranges", # packages=find_packages(), # # scripts=["bin/featurefetch"], # version=__version__, # description="PyRanges for Python.", # author="Endre Bakken Stovner", # author_email="endrebak85@gmail.com", # url="http://github.com/endrebak/pyranges", # keywords=["Bioinformatics"], # license=["MIT"], # install_requires=install_requires, # classifiers=[ # "Programming Language :: Python :: 2.7", # "Programming Language :: Python :: 3", # "Development Status :: 4 - Beta", # "Environment :: Other Environment", "Intended Audience :: Developers", # "Intended Audience :: Science/Research", # "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", # "Operating System :: POSIX :: Linux", # "Operating System :: MacOS :: MacOS X", # "Topic :: Scientific/Engineering" # ], # long_description=("Pythonic Genomic Ranges.")) 
pyrle-0.0.33/template_filler_add_sub.yaml000066400000000000000000000002431406033450200204410ustar00rootroot00000000000000add_double: operation: add op: + value_dtype: double run_dtype: long subtract_double: operation: sub op: "-" value_dtype: double run_dtype: long pyrle-0.0.33/template_filler_div.yaml000066400000000000000000000005721406033450200176270ustar00rootroot00000000000000 div_nonzeroes: name: nonzeroes nv: "nv = values1[x1] / values2[x2]" div_zeroes: name: zeroes nv: "\n\ \ if values2[x2] != 0:\n\ \ nv = values1[x1] / values2[x2]\n\ \ elif values1[x1] != 0:\n\ \ sign = copysign(1, values1[x1]) * copysign(1, values2[x2])\n\ \ nv = inf * sign\n\ \ else:\n\ \ nv = NAN\n" pyrle-0.0.33/test.py000066400000000000000000000031511406033450200142560ustar00rootroot00000000000000import pyranges as pr gr = pr.load_dataset("epigenome_roadmap") rle = gr["chr1"].coverage() print(list(rle.runs)[:20]) print(list(rle.values)[:20]) raise import numpy as np from pyrle import Rle import pandas as pd r = pd.Series([1, 2, 3, 4], dtype=np.int16) # v = pd.Series([-1, 2.3, 3, 4.976], dtype=np.float) r1 = Rle(r, r) r2 = Rle(r * 2, r * 2) # > r2 # numeric-Rle of length 20 with 4 runs # Lengths: 2 4 6 8 # Values : 2 4 6 8 # > r4 # numeric-Rle of length 20 with 5 runs # Lengths: 1 2 3 4 10 # Values : 1 2 3 4 0 # > r2 + r4 # numeric-Rle of length 20 with 7 runs # Lengths: 1 1 1 3 4 2 8 # Values : 3 4 6 7 10 6 8 r3 = r1 + r2 print(r3.runs) print(r3.values) print(r3.runs.dtype) print(r3.values.dtype) print(r3.runs.shape) print(r3.values.shape) r4 = r2 + r1 print(r4.runs) print(r4.values) print(r4.runs.dtype) print(r4.values.dtype) print(r4.runs.shape) print(r4.values.shape) def resize_test(): """ test of workign with a numpy array that needs to be re-sized. """ # create an ndarray and a memview to work with it. 
# cdef cnp.ndarray[double, ndim=1, mode="c"] arr # cdef double[:] memview # ## allocate the array: # arr = np.zeros( (1,) ) # ## Assign the memview to it: # memview = arr # ## manipulate it # memview[0] = 3.14 # ## resize the array # arr.resize((4,), refcheck = False) # ## re-assign the memview -- so you get the new post-resize pointer # memview = arr # ## now use it # memview[1] = 5.6 # memview[2] = 7.1 # memview[3] = 4.3 # ## return the numpy array # return arr pyrle-0.0.33/tests/000077500000000000000000000000001406033450200140675ustar00rootroot00000000000000pyrle-0.0.33/tests/__init__.py000066400000000000000000000000001406033450200161660ustar00rootroot00000000000000pyrle-0.0.33/tests/compute_Rle.R000066400000000000000000000022011406033450200164630ustar00rootroot00000000000000#!/usr/bin/env Rscript args = commandArgs(trailingOnly=TRUE) f1 = args[1] f2 = args[2] op = args[3] rf = args[4] print(args) print("We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! 
") library(S4Vectors) ## suppressMessages(library(S4Vectors)) df1 = read.table(f1, sep="\t", header=TRUE) print("read table 1") df2 = read.table(f2, sep="\t", header=TRUE) sum1 = sum(df1$Runs) print("found sum 1") sum2 = sum(df2$Runs) df1$Values = df1$Values * 1.0 df2$Values = df2$Values * 1.0 print(sum1) print(sum2) print(sum2 > sum1) if (sum1 > sum2){ row = data.frame(sum1 - sum2, 0) colnames(row) = c("Runs", "Values") df2 = rbind(df2, row) } else if (sum2 > sum1){ row = data.frame(sum2 - sum1, 0) colnames(row) = c("Runs", "Values") df1 = rbind(df1, row) } print(df1) print(df2) r1 = Rle(df1$Values, df1$Runs) r2 = Rle(df2$Values, df2$Runs) print(r1) print(r2) print(op) f = match.fun(op) ## f = * result = f(r1, r2) print(result) df = data.frame(Runs=runLength(result), Values=runValue(result)) write.table(df, rf, sep="\t") pyrle-0.0.33/tests/compute_coverage.R000066400000000000000000000006301406033450200175400ustar00rootroot00000000000000#!/usr/bin/env Rscript args = commandArgs(trailingOnly=TRUE) options(warn=-1) f1 = args[1] rf = args[2] df1 = read.table(f1, sep="\t", header=TRUE) suppressMessages(library(GenomicRanges)) result = coverage(GRanges(seqnames = df1$Chromosome, ranges = IRanges(start = df1$Start, end = df1$End))) df = data.frame(Runs=runLength(result), Values=runValue(result)) print(df) write.table(df, rf, sep="\t") pyrle-0.0.33/tests/helpers.py000066400000000000000000000027531406033450200161120ustar00rootroot00000000000000 import pandas as pd def assert_df_equal(df1, df2): pd.options.mode.chained_assignment = None if "Strand" in df1 and "Strand" in df2: sort_on = "Chromosome Start End Strand".split() df1.Strand = df1.Strand.astype("object") df2.Strand = df2.Strand.astype("object") else: sort_on = "Chromosome Start End".split() if "Strand_b" in df1: sort_on += "Start_b End_b Strand_b".split() df1.Strand_b = df1.Strand_b.astype("object") df2.Strand_b = df2.Strand_b.astype("object") elif "Start_b" in df2: sort_on += "Start_b End_b".split() df1 = 
df1.sort_values(sort_on) df2 = df2.sort_values(sort_on) df1 = df1.reset_index(drop=True) df2 = df2.reset_index(drop=True) df1.Chromosome = df1.Chromosome.astype("object") df2.Chromosome = df2.Chromosome.astype("object") print("Actual") print(df1.to_csv(sep=" ")) print("Expected") print(df2.to_csv(sep=" ")) print("Actual dtypes") print(df1.dtypes) print("Expected dtypes") print(df2.dtypes) # print("dtypes Strand\n", "1", df1.Strand.dtype, "2", df2.Strand.dtype) # print("dtypes Strand\n", df1.Strand.dtype == df2.Strand.dtype) # print("dtypes equal\n", df1.dtypes == df2.dtypes) print("Actual index") print(df1.index) print("Expected index") print(df2.index) print("index equal", df1.index == df2.index) pd.testing.assert_frame_equal(df1, df2) pd.options.mode.chained_assignment = "warn" pyrle-0.0.33/tests/hypothesis_helper.py000066400000000000000000000072371406033450200202100ustar00rootroot00000000000000 import pytest from hypothesis import given, settings, reproduce_failure, unlimited, HealthCheck, seed from hypothesis.extra.pandas import data_frames, columns, range_indexes, column, indexes from hypothesis.extra.numpy import arrays import hypothesis.strategies as st from pyranges import PyRanges import numpy as np lengths = st.integers(min_value=1, max_value=int(1e7)) small_lengths = st.integers(min_value=1, max_value=int(1e4)) strands = st.sampled_from("+ -".split()) names = st.text("abcdefghijklmnopqrstuvxyz", min_size=1) scores = st.integers(min_value=0, max_value=256) chromosomes = st.sampled_from(["chr{}".format(str(e)) for e in list(range(1, 23)) + "X Y M".split()]) chromosomes_small = st.sampled_from(["chr1"]) cs = st.one_of(chromosomes, chromosomes_small) runlengths = data_frames(index=indexes(dtype=np.int64, min_size=1, unique=True), columns=[column("Runs", st.integers(min_value=1, max_value=int(1e7))), # must have a min/max on floats because R S4vectors translates too big ones into inf. 
# which is unequal to eg -1.79769e+308 so the tests fail column("Values", st.integers(min_value=-int(1e7), max_value=int(1e7)))]) better_dfs_min = data_frames(index=indexes(dtype=np.int64, min_size=1, unique=True, elements=lengths), columns=[column("Chromosome", cs), column("Start", elements=lengths), column("End", elements=small_lengths), # column("Name", elements=names), # column("Score", elements=scores), column("Strand", strands)]) better_dfs_min_single_chromosome = data_frames(index=indexes(dtype=np.int64, min_size=1, unique=True, elements=lengths), columns=[column("Chromosome", chromosomes_small), column("Start", elements=lengths), column("End", elements=small_lengths), # column("Name", elements=names), # column("Score", elements=scores), column("Strand", strands)]) runlengths_same_length_integers = data_frames(index=indexes(dtype=np.int64, min_size=1, unique=True), columns=[column("Runs", st.integers(min_value=1, max_value=int(1e4))), column("Values", st.integers(min_value=1, max_value=int(1e4))), column("Values2", st.integers(min_value=1, max_value=int(1e4)))]) @st.composite def _slice(draw): start = draw(lengths) - 1 diff = draw(lengths) return start, start + diff @st.composite def dfs_min(draw): df = draw(better_dfs_min) # strand = draw(use_strand) df.loc[:, "End"] += df.Start df.insert(3, "Name", "a") df.insert(4, "Score", 0) # if not strand: # df = df.drop("Strand", axis=1) gr = PyRanges(df) # gr = PyRanges(df) # do not sort like this, use pyranges sort # np.random.seed(draw(st.integers(min_value=0, max_value=int(1e6)))) # gr.df = df.reindex(np.random.permutation(df.index.values)) return gr @st.composite def dfs_min_single_chromosome(draw): df = draw(better_dfs_min_single_chromosome) df.loc[:, "End"] += df.Start df.insert(3, "Name", "a") df.insert(4, "Score", 0) return df pyrle-0.0.33/tests/subset_coverage.R000066400000000000000000000020201406033450200173640ustar00rootroot00000000000000## library(S4Vectors) ## library(GenomicRanges) ## 
library(rtracklayer) ## f = "/home/endrebak/code/pyranges/pyranges/example_data/chipseq.bed" ## gr = import(f) #!/usr/bin/env Rscript args = commandArgs(trailingOnly=TRUE) f1 = args[1] start = strtoi(args[2]) end = strtoi(args[3]) rf = args[4] print(args) print(f1) print(start) print(end) print(rf) print("We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! We are starting in R! ") ## library(S4Vectors) suppressMessages(library(S4Vectors)) df1 = read.table(f1, sep="\t", header=TRUE) print("read table 1") sum1 = sum(df1$Runs) print("found sum 1") df1$Values = df1$Values * 1.0 print(sum1) print(df1) r1 = Rle(df1$Values, df1$Runs) print(r1) if (start > sum1){ start = sum1 } if (end > sum1){ end = sum1 } print("start end") print(start) print(end) result = r1[start:end] print(result) df = data.frame(Runs=runLength(result), Values=runValue(result)) write.table(df, rf, sep="\t") pyrle-0.0.33/tests/test_hypothesis.py000066400000000000000000000111221406033450200176740ustar00rootroot00000000000000import pytest from hypothesis import given, settings, reproduce_failure, unlimited, HealthCheck, seed from tests.hypothesis_helper import runlengths, dfs_min, runlengths_same_length_integers from itertools import product import tempfile import subprocess from io import StringIO from pyrle import Rle import pandas as pd import numpy as np # using assert df equal, because we want to consider output from bedtools and # pyranges equal even if they have different sort order from tests.helpers import assert_df_equal import numpy as np from os import environ if environ.get("TRAVIS"): max_examples = 100 deadline = None else: max_examples = 100 deadline = None rle_operations = "+ - / *".split() rle_operation_cmd = "Rscript --vanilla tests/compute_Rle.R {} {} '{}' {}" @pytest.mark.r @given(runlengths=runlengths, runlengths2=runlengths) @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, 
suppress_health_check=HealthCheck.all()) @pytest.mark.parametrize("operation", rle_operations) def test_rle(runlengths, runlengths2, operation): # Only compared against bioc with integers because float equality is hard, # for both libraries, sometimes end up with slightly different runlengths # when consecutive values are almost equal pyop = {"+": "__add__", "-": "__sub__", "*": "__mul__", "/": "__truediv__"}[operation] print("runlengths", runlengths) print("runlengths2", runlengths2) r = Rle(runlengths.Runs, runlengths.Values) r2 = Rle(runlengths2.Runs, runlengths2.Values) print("r\n", r) print("r2\n", r2) m = getattr(r, pyop) result_pyranges = m(r2) print("pyranges result\n", result_pyranges) result_df = None with tempfile.TemporaryDirectory() as temp_dir: f1 = "{}/f1.txt".format(temp_dir) f2 = "{}/f2.txt".format(temp_dir) outfile = "{}/result.txt".format(temp_dir) runlengths.to_csv(f1, sep="\t", index=False) runlengths2.to_csv(f2, sep="\t", index=False) cmd = rle_operation_cmd.format(f1, f2, operation, outfile) # + " 2>/dev/null" subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode() result = pd.read_csv(outfile, sep="\t") s4vectors_result = Rle(result.Runs, result.Values) print("pyranges result\n", result_pyranges) print("s4vectors result\n", s4vectors_result) assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False) assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True) rle_commute_how = ["__add__", "__mul__"] @pytest.mark.parametrize("how", rle_commute_how) @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) @given(gr=dfs_min(), gr2=dfs_min()) def test_commutative_rles(gr, gr2, how): cv = gr.to_rle(strand=True) cv2 = gr2.to_rle(strand=True) method = getattr(cv, how) method2 = getattr(cv2, how) result = method(cv2) result2 = method2(cv) assert result == result2, "\n".join([str(e) for e in [cv, cv2, result, result2, "---" * 
10]]) @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) @given(df=runlengths_same_length_integers) def test_inverse_div_mul_rles(df): """Testing with small integers, since small value floating points might lead to mul then div not being equal to identity function because of float equality.""" print(df) runlength = df.Runs.sum() cv = Rle(df.Runs.values, df.Values.values) newruns = np.random.permutation(df.Runs.values) print("newruns", newruns) cv2 = Rle(newruns, df.Values2.values) print("cv\n", cv) print("cv2\n", cv2) assert runlength == np.sum(cv.runs) and runlength == np.sum(cv2.runs) result = cv / cv2 result2 = result * cv2 print("result\n", result) print("result2\n", result2) assert np.all(np.equal(result2.runs, cv.runs)) assert np.allclose(result2.values, cv.values) @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) @given(df=runlengths_same_length_integers) def test_inverse_add_sub_rles(df): """Testing with small integers, since small value floating points might lead to mul then div not being equal to identity function because of float equality.""" cv = Rle(df.Runs.values, df.Values.values) cv2 = Rle(np.random.permutation(df.Runs.values), df.Values2.values) result = cv + cv2 result2 = result - cv2 assert np.all(np.equal(result2.runs, cv.runs)) assert np.allclose(result2.values, cv.values) pyrle-0.0.33/tests/test_hypothesis_coverage.py000066400000000000000000000052271406033450200215600ustar00rootroot00000000000000 import pytest from hypothesis import given, settings, reproduce_failure, unlimited, HealthCheck, seed from hypothesis.extra.pandas import data_frames, columns, range_indexes, column, indexes from hypothesis.extra.numpy import arrays import hypothesis.strategies as st from itertools import product import tempfile import subprocess from io import StringIO from pyrle import Rle import pyranges as pr import pandas as pd import 
numpy as np # runlengths = data_frames(index=indexes(dtype=np.int64, min_size=1, unique=True), # columns=[column("Runs", st.integers(min_value=1, max_value=int(1e7))), # # must have a min/max on floats because R S4vectors translates too big ones into inf. # # which is unequal to eg -1.79769e+308 so the tests fail # column("Values", st.integers(min_value=-int(1e7), max_value=int(1e7)))]) from tests.hypothesis_helper import dfs_min_single_chromosome # using assert df equal, because we want to consider output from bedtools and # pyranges equal even if they have different sort order from tests.helpers import assert_df_equal import numpy as np from os import environ if environ.get("TRAVIS"): max_examples = 100 deadline = None else: max_examples = 100 deadline = None coverage_cmd = "Rscript --vanilla tests/compute_coverage.R {} {}" @pytest.mark.r @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) @given(df=dfs_min_single_chromosome()) def test_coverage(df): print("---" * 10) p = pr.PyRanges(df) print("pyranges\n", p) c = p.to_rle(strand=False)["chr1"] result_df = None with tempfile.TemporaryDirectory() as temp_dir: f1 = "{}/f1.txt".format(temp_dir) outfile = "{}/result.txt".format(temp_dir) R_df = df R_df.End = R_df.End - 1 R_df.to_csv(f1, sep="\t", index=False) cmd = coverage_cmd.format(f1, outfile) + " 2>/dev/null" subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode() result = pd.read_table(outfile)[["Runs.value", "Values.value"]] result.columns = "Runs Values".split() result = pd.concat([pd.DataFrame(index=[0], data={"Runs": 1, "Values": 0}), result], ignore_index=True) s4vectors_result = Rle(result.Runs, result.Values) print("pyranges result\n", c) print("s4vectors result\n", s4vectors_result) print(str(c == s4vectors_result) + " " * 10, c == s4vectors_result) assert np.all(np.equal(c.runs, s4vectors_result.runs)) assert np.all(np.equal(c.values, s4vectors_result.values)) 
pyrle-0.0.33/tests/test_subset_coverage.py000066400000000000000000000066111406033450200206640ustar00rootroot00000000000000import pytest from hypothesis import given, settings, reproduce_failure, unlimited, HealthCheck, seed from tests.hypothesis_helper import runlengths, dfs_min, runlengths_same_length_integers, _slice from itertools import product import tempfile import subprocess from io import StringIO from pyrle import Rle import pandas as pd import numpy as np # using assert df equal, because we want to consider output from bedtools and # pyranges equal even if they have different sort order from tests.helpers import assert_df_equal import numpy as np from os import environ if environ.get("TRAVIS"): max_examples = 100 deadline = None else: max_examples = 100 deadline = None rle_operation_cmd = "Rscript --vanilla tests/subset_coverage.R {} {} {} {}" @pytest.mark.r @given(runlengths=runlengths, interval=_slice()) @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) def test_subset_coverage(runlengths, interval): start, end = interval print("runlengths\n", runlengths) r = Rle(runlengths.Runs, runlengths.Values) result_pyranges = r[start:end] result_df = None with tempfile.TemporaryDirectory() as temp_dir: # temp_dir = "." 
f1 = "{}/f1.txt".format(temp_dir) outfile = "{}/result.txt".format(temp_dir) runlengths.to_csv(f1, sep="\t", index=False) cmd = rle_operation_cmd.format(f1, start + 1, end, outfile) # + " 2>/dev/null" print(cmd) subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode() result = pd.read_csv(outfile, sep="\t") s4vectors_result = Rle(result.Runs, result.Values) print("pyranges result\n", result_pyranges) print("s4vectors result\n", s4vectors_result) assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False) assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True) rle_operation_cmd = "Rscript --vanilla tests/subset_coverage.R {} {} {} {}" # @pytest.mark.r # @given(runlengths=runlengths, interval=_slice()) # @settings(max_examples=max_examples, deadline=deadline, timeout=unlimited, suppress_health_check=HealthCheck.all()) # def test_getloc_coverage(runlengths, interval): # start, end = interval # # Only compared against bioc with integers because float equality is hard, # # for both libraries, sometimes end up with slightly different runlengths # # when consecutive values are almost equal # print("runlengths\n", runlengths) # r = Rle(runlengths.Runs, runlengths.Values) # result_pyranges = r[start:end] # result_df = None # with tempfile.TemporaryDirectory() as temp_dir: # f1 = "{}/f1.txt".format(temp_dir) # outfile = "{}/result.txt".format(temp_dir) # runlengths.to_csv(f1, sep="\t", index=False) # cmd = rle_operation_cmd.format(f1, start, end, outfile) # + " 2>/dev/null" # print(cmd) # subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode() # result = pd.read_csv(outfile, sep="\t") # s4vectors_result = Rle(result.Runs, result.Values) # print("pyranges result\n", result_pyranges) # print("s4vectors result\n", s4vectors_result) # assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False) # assert np.allclose(result_pyranges.values, s4vectors_result.values, 
equal_nan=True)