pyBigWig-0.3.2/0000750000201600010240000000000013016027005013630 5ustar ryanbioinfo00000000000000pyBigWig-0.3.2/libBigWig/0000750000201600010240000000000013016027005015467 5ustar ryanbioinfo00000000000000pyBigWig-0.3.2/libBigWig/LICENSE0000640000201600010240000000206612622361435016513 0ustar ryanbioinfo00000000000000The MIT License (MIT) Copyright (c) 2015 Devon Ryan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. pyBigWig-0.3.2/libBigWig/README.md0000640000201600010240000003041613015055353016761 0ustar ryanbioinfo00000000000000![Master build status](https://travis-ci.org/dpryan79/libBigWig.svg?branch=master) [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.45278.svg)](http://dx.doi.org/10.5281/zenodo.45278) A C library for reading/parsing local and remote bigWig and bigBed files. While Kent's source code is free to use for these purposes, it's really inappropriate as library code since it has the unfortunate habit of calling `exit()` whenever there's an error. 
If that's then used inside of something like python then the python interpreter gets killed. This library is aimed at resolving these sorts of issues and should also use more standard things like curl and has a friendlier license to boot. Documentation is automatically generated by doxygen and can be found under `docs/html` or online [here](https://cdn.rawgit.com/dpryan79/libBigWig/master/docs/html/index.html). # Example The only functions and structures that end users need to care about are in "bigWig.h". Below is a commented example. You can see the files under `test/` for further examples. #include "bigWig.h" int main(int argc, char *argv[]) { bigWigFile_t *fp = NULL; bwOverlappingIntervals_t *intervals = NULL; double *stats = NULL; if(argc != 2) { fprintf(stderr, "Usage: %s {file.bw|URL://path/file.bw}\n", argv[0]); return 1; } //Initialize enough space to hold 128KiB (1<<17) of data at a time if(bwInit(1<<17) != 0) { fprintf(stderr, "Received an error in bwInit\n"); return 1; } //Open the local/remote file fp = bwOpen(argv[1], NULL, "r"); if(!fp) { fprintf(stderr, "An error occured while opening %s\n", argv[1]); return 1; } //Get values in a range (0-based, half open) without NAs intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 0); bwDestroyOverlappingIntervals(intervals); //Free allocated memory //Get values in a range (0-based, half open) with NAs intervals = bwGetValues(fp, "chr1", 10000000, 10000100, 1); bwDestroyOverlappingIntervals(intervals); //Free allocated memory //Get the full intervals that overlap intervals = bwGetOverlappingIntervals(fp, "chr1", 10000000, 10000100); bwDestroyOverlappingIntervals(intervals); //Get an example statistic - standard deviation //We want ~4 bins in the range stats = bwStats(fp, "chr1", 10000000, 10000100, 4, dev); if(stats) { printf("chr1:10000000-10000100 std. 
dev.: %f %f %f %f\n", stats[0], stats[1], stats[2], stats[3]); free(stats); } bwClose(fp); bwCleanup(); return 0; } ##Writing example N.B., creation of bigBed files is not supported (there are no plans to change this). Below is an example of how to write bigWig files. You can also find this file under `test/exampleWrite.c`. Unlike with Kent's tools, you can create bigWig files entry by entry without needing an intermediate wiggle or bedGraph file. Entries in bigWig files are stored in blocks with each entry in a block referring to the same chromosome and having the same type, of which there are three (see the [wiggle specification](http://genome.ucsc.edu/goldenpath/help/wiggle.html) for more information on this). #include "bigWig.h" int main(int argc, char *argv[]) { bigWigFile_t *fp = NULL; char *chroms[] = {"1", "2"}; char *chromsUse[] = {"1", "1", "1"}; uint32_t chrLens[] = {1000000, 1500000}; uint32_t starts[] = {0, 100, 125, 200, 220, 230, 500, 600, 625, 700, 800, 850}; uint32_t ends[] = {5, 120, 126, 205, 226, 231}; float values[] = {0.0f, 1.0f, 200.0f, -2.0f, 150.0f, 25.0f, 0.0f, 1.0f, 200.0f, -2.0f, 150.0f, 25.0f, -5.0f, -20.0f, 25.0f, -5.0f, -20.0f, 25.0f}; if(bwInit(1<<17) != 0) { fprintf(stderr, "Received an error in bwInit\n"); return 1; } fp = bwOpen("example_output.bw", NULL, "w"); if(!fp) { fprintf(stderr, "An error occurred while opening example_output.bw for writingn\n"); return 1; } //Allow up to 10 zoom levels, though fewer will be used in practice if(bwCreateHdr(fp, 10)) goto error; //Create the chromosome lists fp->cl = bwCreateChromList(chroms, chrLens, 2); if(!fp->cl) goto error; //Write the header if(bwWriteHdr(fp)) goto error; //Some example bedGraph-like entries if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; //We can continue appending similarly formatted entries //N.B. 
you can't append a different chromosome (those always go into different if(bwAppendIntervals(fp, starts+3, ends+3, values+3, 3)) goto error; //Add a new block of entries with a span. Since bwAdd/AppendIntervals was just used we MUST create a new block if(bwAddIntervalSpans(fp, "1", starts+6, 20, values+6, 3)) goto error; //We can continue appending similarly formatted entries if(bwAppendIntervalSpans(fp, starts+9, values+9, 3)) goto error; //Add a new block of fixed-step entries if(bwAddIntervalSpanSteps(fp, "1", 900, 20, 30, values+12, 3)) goto error; //The start is then 760, since that's where the previous step ended if(bwAppendIntervalSpanSteps(fp, values+15, 3)) goto error; //Add a new chromosome chromsUse[0] = "2"; chromsUse[1] = "2"; chromsUse[2] = "2"; if(bwAddIntervals(fp, chromsUse, starts, ends, values, 3)) goto error; //Closing the file causes the zoom levels to be created bwClose(fp); bwCleanup(); return 0; error: fprintf(stderr, "Received an error somewhere!\n"); bwClose(fp); bwCleanup(); return 1; } # Testing file types As of version 0.3.0, this library supports accessing bigBed files, which are related to bigWig files. Applications that need to support both bigWig and bigBed input can use the `bwIsBigWig` and `bbIsBigBed` functions to determine if their inputs are bigWig/bigBed files: ...code... if(bwIsBigWig(input_file_name, NULL)) { //do something } else if(bbIsBigBed(input_file_name, NULL)) { //do something else } else { //handle unknown input } Note that these two functions rely on the "magic number" at the beginning of each file, which differs between bigWig and bigBed files. # bigBed support Support for accessing bigBed files was added in version 0.3.0. The function names used for accessing bigBed files are similar to those used for bigWig files. 
Function | Use --- | --- bbOpen | Opens a bigBed file bbGetSQL | Returns the SQL string (if it exists) in a bigBed file bbGetOverlappingEntries | Returns all entries overlapping an interval (either with or without their associated strings) bbDestroyOverlappingEntries | Free memory allocated by the above command Other functions, such as `bwClose` and `bwInit`, are shared between bigWig and bigBed files. See `test/testBigBed.c` for a full example. # A note on bigBed entries Inside bigBed files, entries are stored as chromosome, start, and end coordinates with an (optional) associated string. For example, a "bedRNAElements" file from Encode has name, score, strand, "level", "significance", and "score2" values associated with each entry. These are stored inside the bigBed files as a single tab-separated character vector (char \*), which makes parsing difficult. The names of the various fields inside of bigBed files are stored as an SQL string, for example: table RnaElements "BED6 + 3 scores for RNA Elements data " ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Normalized score from 0-1000" char[1] strand; "+ or - or . for unknown" float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." float signif; "Statistical significance such as IDR. Set to -1 for no data." uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data." ) Entries will then be of the form (one per line): 59426 115 - 0.021 0.48 218 51 209 + 0.071 0.74 130 52 170 + 0.045 0.61 171 59433 178 - 0.049 0.34 296 53 156 + 0.038 0.19 593 59436 186 - 0.054 0.15 1010 59437 506 - 1.560 0.00 430611 Note that chromosome and start/end intervals are stored separately, so there's no need to parse them out of the string. libBigWig can return these entries, either with or without the above associated strings. 
Parsing these strings is left to the application requiring them and is currently outside the scope of this library. # Interval/Entry iterators Sometimes it is desirable to request a large number of intervals from a bigWig file or entries from a bigBed file, but not hold them all in memory at once (e.g., to save memory). To support this, libBigWig (since version 0.3.0) supports two kinds of iterators. The general process of using iterators is: (1) iterator creation, (2) traversal, and finally (3) iterator destruction. Only iterator creation differs between bigWig and bigBed files. Importantly, iterators return results one or more blocks at a time. This is for convenience, since bigWig intervals and bigBed entries are stored together in fixed-size groups, called blocks. The number of blocks of entries returned, therefore, is an option that can be specified to balance performance and memory usage. ## Iterator creation For bigWig files, iterators are created with `bwOverlappingIntervalsIterator()`. This function takes chromosomal bounds (chromosome name, start, and end position) as well as a number of blocks. The equivalent function for bigBed files is `bbOverlappingEntriesIterator()`, which additionally takes a `withString` argument, which dictates whether the returned entries include the associated string values or not. Each of the aforementioned functions returns a pointer to a `bwOverlapIterator_t` object. The only important parts of this structure for end users are the following members: `entries`, `intervals`, and `data`. `entries` is a pointer to a `bbOverlappingEntries_t` object, or `NULL` if a bigWig file is being used. Likewise, `intervals` is a pointer to a `bwOverlappingIntervals_t` object, or `NULL` if a bigBed file is being used. `data` is a special pointer, used to signify the end of iteration. Thus, when `data` is a `NULL` pointer, iteration has ended. 
## Iterator traversal Regardless of whether a bigWig or bigBed file is being used, the `bwIteratorNext()` function will free currently used memory and load the appropriate intervals or entries for the next block(s). On error, this will return a NULL pointer (memory is already internally freed in this case). ## Iterator destruction `bwOverlapIterator_t` objects MUST be destroyed after use. This can be done with the `bwIteratorDestroy()` function. ## Example A full example is provided in `test/testIterator.c`, but a small example of iterating over all bigWig intervals in `chr1:0-10000000` in chunks of 5 blocks follows: iter = bwOverlappingIntervalsIterator(fp, "chr1", 0, 10000000, 5); while(iter->data) { //Do stuff with iter->intervals iter = bwIteratorNext(iter); } bwIteratorDestroy(iter); # A note on bigWig statistics The results of `min`, `max`, and `mean` should be the same as those from `BigWigSummary`. `stdev` and `coverage`, however, may differ due to Kent's tools producing incorrect results (at least for `coverage`, though the same appears to be the case for `stdev`). # Python interface There are currently two python interfaces that make use of libBigWig: [pyBigWig](https://github.com/dpryan79/pyBigWig) by me and [bw-python](https://github.com/brentp/bw-python) by Brent Pedersen. Those interested are encouraged to give both a try! pyBigWig-0.3.2/libBigWig/bigWig.h0000640000201600010240000007112413015055353017064 0ustar ryanbioinfo00000000000000#include "io.h" #include "bwValues.h" #include #include #ifdef __cplusplus extern "C" { #endif /*! \mainpage libBigWig * * \section Introduction * * libBigWig is a C library for parsing local/remote bigWig and bigBed files. This is similar to Kent's library from UCSC, except * * The license is much more liberal * * This code doesn't call `exit()` on error, which would kill the calling application. * * External files are accessed using [curl](http://curl.haxx.se/). 
* * Please submit issues and pull requests [here](https://github.com/dpryan79/libBigWig). * * \section Compilation * * Assuming you already have the curl libraries installed (not just the curl binary!): * * make install prefix=/some/path * * \section Writing bigWig files * * There are three methods for storing values in a bigWig file, further described in the [wiggle format](http://genome.ucsc.edu/goldenpath/help/wiggle.html). The entries within the file are grouped into "blocks" and each such block is limited to storing entries of a single type. So, it is unwise to use a single bedGraph-like entry followed by a single fixed-step entry followed by a variable-step entry, as that would require three separate blocks, with additional space required for each. * * \section Testing file types * * As of version 0.3.0, libBigWig supports reading bigBed files. If an application needs to support both bigBed and bigWig input, then the `bwIsBigWig` and `bbIsBigBed` functions can be used to determine the file type. These both use the "magic" number at the beginning of the file to determine the file type. * * \section Interval and entry iterators * * As of version 0.3.0, libBigWig supports iterating over intervals in bigWig files and entries in bigBed files. The number of intervals/entries returned with each iteration can be controlled by setting the number of blocks processed in each iteration (intervals and entries are grouped inside of bigWig and bigBed files into blocks of entries). See `test/testIterator.c` for an example. * * \section Examples * * Please see [README.md](README.md) and the files under `test/` for examples. */ /*! \file bigWig.h * * These are the functions and structures that should be used by external users. While I don't particularly recommend dealing with some of the structures (e.g., a bigWigHdr_t), they're described here in case you need them. 
* * BTW, this library doesn't switch endianness as appropriate, since I kind of assume that there's only one type produced these days. */ /*! * The library version number */ #define LIBBIGWIG_VERSION 0.3.0 /*! * The magic number of a bigWig file. */ #define BIGWIG_MAGIC 0x888FFC26 /*! * The magic number of a bigBed file. */ #define BIGBED_MAGIC 0x8789F2EB /*! * The magic number of a "cirTree" block in a file. */ #define CIRTREE_MAGIC 0x78ca8c91 /*! * The magic number of an index block in a file. */ #define IDX_MAGIC 0x2468ace0 /*! * The default number of children per block. */ #define DEFAULT_nCHILDREN 64 /*! * The default decompression buffer size in bytes. This is used to determin */ #define DEFAULT_BLOCKSIZE 32768 /*! * An enum that dictates the type of statistic to fetch for a given interval */ enum bwStatsType { doesNotExist = -1, mean = 0, average = 0, stdev = 1, dev = 1, max = 2, min = 3, cov = 4, coverage = 4 }; //Should hide this from end users /*! * @brief BigWig files have multiple "zoom" levels, each of which has its own header. This hold those headers * * N.B., there's 4 bytes of padding in the on disk representation of level and dataOffset. */ typedef struct { uint32_t *level; /**bufSize*/ bwLL *firstIndexNode; /** 65535 will result in a maximum of 10. * @return 0 on success. */ int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms); /*! * @brief Take a list of chromosome names and lengths and return a pointer to a chromList_t * This MUST be run before `bwWriteHdr()`. Note that the input is NOT free()d! * @param chroms A list of chromosomes. * @param lengths The length of each chromosome. * @param n The number of chromosomes (thus, the length of `chroms` and `lengths`) * @return A pointer to a chromList_t or NULL on error. */ chromList_t *bwCreateChromList(char **chroms, uint32_t *lengths, int64_t n); /*! * @brief Write a the header to a bigWig file. * You must have already opened the output file, created a header and a chromosome list. 
* @param bw The output bigWigFile_t pointer. * @see bwCreateHdr * @see bwCreateChromList */ int bwWriteHdr(bigWigFile_t *bw); /*! * @brief Write a new block of bedGraph-like intervals to a bigWig file * Adds entries of the form: * chromosome start end value * to the file. These will always be added in a new block, so you may have previously used a different storage type. * * In general it's more efficient to use the bwAppend* functions, but then you MUST know that the previously written block is of the same type. In other words, you can only use bwAppendIntervals() after bwAddIntervals() or a previous bwAppendIntervals(). * @param fp The output file pointer. * @param chrom A list of chromosomes, of length `n`. * @param start A list of start positions of length `n`. * @param end A list of end positions of length `n`. * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @see bwAppendIntervals */ int bwAddIntervals(bigWigFile_t *fp, char **chrom, uint32_t *start, uint32_t *end, float *values, uint32_t n); /*! * @brief Append bedGraph-like intervals to a previous block of bedGraph-like intervals in a bigWig file. * If you have previously used bwAddIntervals() then this will append additional entries into the previous block (or start a new one if needed). * @param fp The output file pointer. * @param start A list of start positions of length `n`. * @param end A list of end positions of length `n`. * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @warning Do NOT use this after `bwAddIntervalSpans()`, `bwAppendIntervalSpans()`, `bwAddIntervalSpanSteps()`, or `bwAppendIntervalSpanSteps()`. * @see bwAddIntervals */ int bwAppendIntervals(bigWigFile_t *fp, uint32_t *start, uint32_t *end, float *values, uint32_t n); /*! 
* @brief Add a new block of variable-step entries to a bigWig file * Adds entries of the form * chromosome start value * to the file. Each block of such entries has an associated "span", so each value describes the region chromosome:start-(start+span) * * This will always start a new block of values. * @param fp The output file pointer. * @param chrom The chromosome that the entries describe. * @param start A list of start positions of length `n`. * @param span The span of each entry (they must all be the same). * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @see bwAppendIntervalSpans */ int bwAddIntervalSpans(bigWigFile_t *fp, char *chrom, uint32_t *start, uint32_t span, float *values, uint32_t n); /*! * @brief Append to a previous block of variable-step entries. * If you previously used `bwAddIntervalSpans()`, this will continue appending more values to the block(s) it created. * @param fp The output file pointer. * @param start A list of start positions of length `n`. * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpanSteps()` or `bwAppendIntervalSpanSteps()` * @see bwAddIntervalSpans */ int bwAppendIntervalSpans(bigWigFile_t *fp, uint32_t *start, float *values, uint32_t n); /*! * @brief Add a new block of fixed-step entries to a bigWig file * Adds entries of the form * value * to the file. Each block of such entries has an associated "span", "step", chromosome and start position. See the wiggle format for more details. * * This will always start a new block of values. * @param fp The output file pointer. * @param chrom The chromosome that the entries describe. * @param start The starting position of the block of entries. 
* @param span The span of each entry (i.e., the number of bases it describes). * @param step The step between entry start positions. * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @see bwAppendIntervalSpanSteps */ int bwAddIntervalSpanSteps(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t span, uint32_t step, float *values, uint32_t n); /*! * @brief Append to a previous block of fixed-step entries. * If you previously used `bwAddIntervalSpanSteps()`, this will continue appending more values to the block(s) it created. * @param fp The output file pointer. * @param values A list of values of length `n`. * @param n The length of the aforementioned lists. * @return 0 on success and another value on error. * @warning Do NOT use this after `bwAddIntervals()`, `bwAppendIntervals()`, `bwAddIntervalSpans()` or `bwAppendIntervalSpans()` * @see bwAddIntervalSpanSteps */ int bwAppendIntervalSpanSteps(bigWigFile_t *fp, float *values, uint32_t n); #ifdef __cplusplus } #endif pyBigWig-0.3.2/libBigWig/bwCommon.h0000640000201600010240000000553612715313173017444 0ustar ryanbioinfo00000000000000/*! \file bwCommon.h * * You have no reason to use these functions. They may change without warning because there's no reason for them to be used outside of libBigWig's internals. * * These are structures and functions from a variety of files that are used across files internally but don't need to be seen by libBigWig users. */ /*! * @brief Like fsetpos, but for local or remote bigWig files. * This will set the file position indicator to the specified point. For local files this literally is `fsetpos`, while for remote files it fills a memory buffer with data starting at the desired position. * @param fp A valid opened bigWigFile_t. * @param pos The position within the file to seek to. * @return 0 on success and -1 on error. */ int bwSetPos(bigWigFile_t *fp, size_t pos); /*! 
* @brief A local/remote version of `fread`. * Reads data from either local or remote bigWig files. * @param data An allocated memory block big enough to hold the data. * @param sz The size of each member that should be copied. * @param nmemb The number of members to copy. * @param fp The bigWigFile_t * from which to copy the data. * @see bwSetPos * @return The number of members fully copied: `nmemb` on success and something less on error (this is equivalent to `fread`). */ size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp); /*! * @brief Determine what the file position indicator says. * This is equivalent to `ftell` for local or remote files. * @param fp The file. * @return The position in the file. */ long bwTell(bigWigFile_t *fp); /*! * @brief Reads a data index (either full data or a zoom level) from a bigWig file. * There is little reason for end users to use this function. This must be freed with `bwDestroyIndex` * @param fp A valid bigWigFile_t pointer * @param offset The file offset where the index begins * @return A bwRTree_t pointer or NULL on error. */ bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset); /*! * @brief Destroy a bwRTreeNode_t and all of its children. * @param node The node to destroy. */ void bwDestroyIndexNode(bwRTreeNode_t *node); /*! * @brief Frees space allocated by `bwReadIndex` * There is generally little reason to use this, since end users should typically not need to run `bwReadIndex` themselves. * @param idx A bwRTree_t pointer allocated by `bwReadIndex`. */ void bwDestroyIndex(bwRTree_t *idx); /// @cond SKIP bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end); void destroyBWOverlapBlock(bwOverlapBlock_t *b); /// @endcond /*! 
* @brief Finishes what's needed to write a bigWigFile * Flushes the buffer, converts the index linked list to a tree, writes that to disk, handles zoom level stuff, writes magic at the end * @param fp A valid bigWigFile_t pointer * @return 0 on success */ int bwFinalize(bigWigFile_t *fp); pyBigWig-0.3.2/libBigWig/bwRead.c0000640000201600010240000003121213015055353017045 0ustar ryanbioinfo00000000000000#include "bigWig.h" #include "bwCommon.h" #include #include #include #include static uint64_t readChromBlock(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize); //Return the position in the file long bwTell(bigWigFile_t *fp) { if(fp->URL->type == BWG_FILE) return ftell(fp->URL->x.fp); return (long) (fp->URL->filePos + fp->URL->bufPos); } //Seek to a given position, always from the beginning of the file //Return 0 on success and -1 on error //To do, use the return code of urlSeek() in a more useful way. int bwSetPos(bigWigFile_t *fp, size_t pos) { CURLcode rv = urlSeek(fp->URL, pos); if(rv == CURLE_OK) return 0; return -1; } //returns the number of full members read (nmemb on success, something less on error) size_t bwRead(void *data, size_t sz, size_t nmemb, bigWigFile_t *fp) { size_t i, rv; for(i=0; iURL, data+i*sz, sz); if(rv != sz) return i; } return nmemb; } //Initializes curl and sets global variables //Returns 0 on success and 1 on error //This should be called only once and bwCleanup() must be called when finished. int bwInit(size_t defaultBufSize) { //set the buffer size, number of iterations, sleep time between iterations, etc. 
GLOBAL_DEFAULTBUFFERSIZE = defaultBufSize; //call curl_global_init() CURLcode rv; rv = curl_global_init(CURL_GLOBAL_ALL); if(rv != CURLE_OK) return 1; return 0; } //This should be called before quiting, to release memory acquired by curl void bwCleanup() { curl_global_cleanup(); } static bwZoomHdr_t *bwReadZoomHdrs(bigWigFile_t *bw) { if(bw->isWrite) return NULL; uint16_t i; bwZoomHdr_t *zhdr = malloc(sizeof(bwZoomHdr_t)); if(!zhdr) return NULL; uint32_t *level = malloc(bw->hdr->nLevels * sizeof(uint64_t)); if(!level) { free(zhdr); return NULL; } uint32_t padding = 0; uint64_t *dataOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); if(!dataOffset) { free(zhdr); free(level); return NULL; } uint64_t *indexOffset = malloc(sizeof(uint64_t) * bw->hdr->nLevels); if(!dataOffset) { free(zhdr); free(level); free(dataOffset); return NULL; } for(i=0; ihdr->nLevels; i++) { if(bwRead((void*) &(level[i]), sizeof(uint32_t), 1, bw) != 1) goto error; if(bwRead((void*) &padding, sizeof(uint32_t), 1, bw) != 1) goto error; if(bwRead((void*) &(dataOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; if(bwRead((void*) &(indexOffset[i]), sizeof(uint64_t), 1, bw) != 1) goto error; } zhdr->level = level; zhdr->dataOffset = dataOffset; zhdr->indexOffset = indexOffset; zhdr->idx = calloc(bw->hdr->nLevels, sizeof(bwRTree_t*)); if(!zhdr->idx) goto error; return zhdr; error: for(i=0; ihdr->nLevels; i++) { if(zhdr->idx[i]) bwDestroyIndex(zhdr->idx[i]); } free(zhdr); free(level); free(dataOffset); free(indexOffset); return NULL; } static void bwHdrDestroy(bigWigHdr_t *hdr) { int i; if(hdr->zoomHdrs) { free(hdr->zoomHdrs->level); free(hdr->zoomHdrs->dataOffset); free(hdr->zoomHdrs->indexOffset); for(i=0; inLevels; i++) { if(hdr->zoomHdrs->idx[i]) bwDestroyIndex(hdr->zoomHdrs->idx[i]); } free(hdr->zoomHdrs->idx); free(hdr->zoomHdrs); } free(hdr); } static void bwHdrRead(bigWigFile_t *bw) { uint32_t magic; if(bw->isWrite) return; bw->hdr = calloc(1, sizeof(bigWigHdr_t)); if(!bw->hdr) return; 
if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //0x0 if(magic != BIGWIG_MAGIC && magic != BIGBED_MAGIC) goto error; if(bwRead((void*) &(bw->hdr->version), sizeof(uint16_t), 1, bw) != 1) goto error; //0x4 if(bwRead((void*) &(bw->hdr->nLevels), sizeof(uint16_t), 1, bw) != 1) goto error; //0x6 if(bwRead((void*) &(bw->hdr->ctOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x8 if(bwRead((void*) &(bw->hdr->dataOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x10 if(bwRead((void*) &(bw->hdr->indexOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x18 if(bwRead((void*) &(bw->hdr->fieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x20 if(bwRead((void*) &(bw->hdr->definedFieldCount), sizeof(uint16_t), 1, bw) != 1) goto error; //0x22 if(bwRead((void*) &(bw->hdr->sqlOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x24 if(bwRead((void*) &(bw->hdr->summaryOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x2c if(bwRead((void*) &(bw->hdr->bufSize), sizeof(uint32_t), 1, bw) != 1) goto error; //0x34 if(bwRead((void*) &(bw->hdr->extensionOffset), sizeof(uint64_t), 1, bw) != 1) goto error; //0x38 //zoom headers if(bw->hdr->nLevels) { if(!(bw->hdr->zoomHdrs = bwReadZoomHdrs(bw))) goto error; } //File summary information if(bw->hdr->summaryOffset) { if(urlSeek(bw->URL, bw->hdr->summaryOffset) != CURLE_OK) goto error; if(bwRead((void*) &(bw->hdr->nBasesCovered), sizeof(uint64_t), 1, bw) != 1) goto error; if(bwRead((void*) &(bw->hdr->minVal), sizeof(uint64_t), 1, bw) != 1) goto error; if(bwRead((void*) &(bw->hdr->maxVal), sizeof(uint64_t), 1, bw) != 1) goto error; if(bwRead((void*) &(bw->hdr->sumData), sizeof(uint64_t), 1, bw) != 1) goto error; if(bwRead((void*) &(bw->hdr->sumSquared), sizeof(uint64_t), 1, bw) != 1) goto error; } return; error: bwHdrDestroy(bw->hdr); fprintf(stderr, "[bwHdrRead] There was an error while reading in the header!\n"); bw->hdr = NULL; } static void destroyChromList(chromList_t *cl) { uint32_t i; if(!cl) 
return; if(cl->nKeys && cl->chrom) { for(i=0; inKeys; i++) { if(cl->chrom[i]) free(cl->chrom[i]); } } if(cl->chrom) free(cl->chrom); if(cl->len) free(cl->len); free(cl); } static uint64_t readChromLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t valueSize) { uint16_t nVals, i; uint32_t idx; char *chrom = NULL; if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1; chrom = calloc(valueSize+1, sizeof(char)); if(!chrom) return -1; for(i=0; ilen[idx]), sizeof(uint32_t), 1, bw) != 1) goto error; cl->chrom[idx] = strdup(chrom); if(!(cl->chrom[idx])) goto error; } free(chrom); return nVals; error: free(chrom); return -1; } static uint64_t readChromNonLeaf(bigWigFile_t *bw, chromList_t *cl, uint32_t keySize) { uint64_t offset , rv = 0, previous; uint16_t nVals, i; if(bwRead((void*) &nVals, sizeof(uint16_t), 1, bw) != 1) return -1; previous = bwTell(bw) + keySize; for(i=0; iisWrite) return NULL; if(bwSetPos(bw, bw->hdr->ctOffset)) return NULL; cl = calloc(1, sizeof(chromList_t)); if(!cl) return NULL; if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; if(magic != CIRTREE_MAGIC) goto error; if(bwRead((void*) &itemsPerBlock, sizeof(uint32_t), 1, bw) != 1) goto error; if(bwRead((void*) &keySize, sizeof(uint32_t), 1, bw) != 1) goto error; if(bwRead((void*) &valueSize, sizeof(uint32_t), 1, bw) != 1) goto error; //Unused if(bwRead((void*) &itemCount, sizeof(uint64_t), 1, bw) != 1) goto error; cl->nKeys = itemCount; cl->chrom = calloc(itemCount, sizeof(char*)); cl->len = calloc(itemCount, sizeof(uint32_t)); if(!cl->chrom) goto error; if(!cl->len) goto error; if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //padding if(bwRead((void*) &magic, sizeof(uint32_t), 1, bw) != 1) goto error; //padding //Read in the blocks i = 0; while(ip) free(wb->p); if(wb->compressP) free(wb->compressP); if(wb->firstZoomBuffer) free(wb->firstZoomBuffer); if(wb->lastZoomBuffer) free(wb->lastZoomBuffer); if(wb->nNodes) free(wb->nNodes); free(wb); } void 
bwClose(bigWigFile_t *fp) { if(!fp) return; if(bwFinalize(fp)) { fprintf(stderr, "[bwClose] There was an error while finishing writing a bigWig file! The output is likely truncated.\n"); } if(fp->URL) urlClose(fp->URL); if(fp->hdr) bwHdrDestroy(fp->hdr); if(fp->cl) destroyChromList(fp->cl); if(fp->idx) bwDestroyIndex(fp->idx); if(fp->writeBuffer) bwDestroyWriteBuffer(fp->writeBuffer); free(fp); } int bwIsBigWig(char *fname, CURLcode (*callBack) (CURL*)) { uint32_t magic = 0; URL_t *URL = NULL; URL = urlOpen(fname, *callBack, NULL); if(!URL) return 0; if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; urlClose(URL); if(magic == BIGWIG_MAGIC) return 1; return 0; } char *bbGetSQL(bigWigFile_t *bw) { char *o = NULL; uint64_t len; if(!bw->hdr->sqlOffset) return NULL; len = bw->hdr->summaryOffset - bw->hdr->sqlOffset; //This includes the NULL terminator o = malloc(sizeof(char) * len); if(!o) goto error; if(bwSetPos(bw, bw->hdr->sqlOffset)) goto error; if(bwRead((void*) o, len, 1, bw) != 1) goto error; return o; error: if(o) free(o); printf("Got an error in bbGetSQL!\n"); return NULL; } int bbIsBigBed(char *fname, CURLcode (*callBack) (CURL*)) { uint32_t magic = 0; URL_t *URL = NULL; URL = urlOpen(fname, *callBack, NULL); if(!URL) return 0; if(urlRead(URL, (void*) &magic, sizeof(uint32_t)) != sizeof(uint32_t)) magic = 0; urlClose(URL); if(magic == BIGBED_MAGIC) return 1; return 0; } bigWigFile_t *bwOpen(char *fname, CURLcode (*callBack) (CURL*), const char *mode) { bigWigFile_t *bwg = calloc(1, sizeof(bigWigFile_t)); if(!bwg) { fprintf(stderr, "[bwOpen] Couldn't allocate space to create the output object!\n"); return NULL; } if((!mode) || (strchr(mode, 'w') == NULL)) { bwg->isWrite = 0; bwg->URL = urlOpen(fname, *callBack, NULL); if(!bwg->URL) goto error; //Attempt to read in the fixed header bwHdrRead(bwg); if(!bwg->hdr) goto error; //Read in the chromosome list bwg->cl = bwReadChromList(bwg); if(!bwg->cl) goto error; //Read in the index 
bwg->idx = bwReadIndex(bwg, 0); if(!bwg->idx) goto error; } else { bwg->isWrite = 1; bwg->URL = urlOpen(fname, NULL, "w+"); if(!bwg->URL) goto error; bwg->writeBuffer = calloc(1,sizeof(bwWriteBuffer_t)); if(!bwg->writeBuffer) goto error; bwg->writeBuffer->l = 24; } return bwg; error: bwClose(bwg); return NULL; } bigWigFile_t *bbOpen(char *fname, CURLcode (*callBack) (CURL*)) { bigWigFile_t *bb = calloc(1, sizeof(bigWigFile_t)); if(!bb) { fprintf(stderr, "[bbOpen] Couldn't allocate space to create the output object!\n"); return NULL; } //Set the type to 1 for bigBed bb->type = 1; bb->URL = urlOpen(fname, *callBack, NULL); if(!bb->URL) goto error; //Attempt to read in the fixed header bwHdrRead(bb); if(!bb->hdr) goto error; //Read in the chromosome list bb->cl = bwReadChromList(bb); if(!bb->cl) goto error; //Read in the index bb->idx = bwReadIndex(bb, 0); if(!bb->idx) goto error; return bb; error: bwClose(bb); return NULL; } pyBigWig-0.3.2/libBigWig/bwStats.c0000640000201600010240000003226313015055353017277 0ustar ryanbioinfo00000000000000#include "bigWig.h" #include "bwCommon.h" #include #include #include #include #include //Returns -1 if there are no applicable levels, otherwise an integer indicating the most appropriate level. 
//Like Kent's library, this divides the desired bin size by 2 to minimize the effect of blocks overlapping multiple bins static int32_t determineZoomLevel(bigWigFile_t *fp, int basesPerBin) { int32_t out = -1; int64_t diff; uint32_t bestDiff = -1; uint16_t i; basesPerBin/=2; for(i=0; ihdr->nLevels; i++) { diff = basesPerBin - (int64_t) fp->hdr->zoomHdrs->level[i]; if(diff >= 0 && diff < bestDiff) { bestDiff = diff; out = i; } } return out; } /// @cond SKIP struct val_t { uint32_t nBases; float min, max, sum, sumsq; double scalar; }; struct vals_t { uint32_t n; struct val_t **vals; }; /// @endcond void destroyVals_t(struct vals_t *v) { uint32_t i; if(!v) return; for(i=0; in; i++) free(v->vals[i]); if(v->vals) free(v->vals); free(v); } //Determine the base-pair overlap between an interval and a block double getScalar(uint32_t i_start, uint32_t i_end, uint32_t b_start, uint32_t b_end) { double rv = 0.0; if(b_start <= i_start) { if(b_end > i_start) rv = ((double)(b_end - i_start))/(b_end-b_start); } else if(b_start < i_end) { if(b_end < i_end) rv = ((double)(b_end - b_start))/(b_end-b_start); else rv = ((double)(i_end - b_start))/(b_end-b_start); } return rv; } //Returns NULL on error static struct vals_t *getVals(bigWigFile_t *fp, bwOverlapBlock_t *o, int i, uint32_t tid, uint32_t start, uint32_t end) { void *buf = NULL, *compBuf = NULL; uLongf sz = fp->hdr->bufSize; int compressed = 0, rv; uint32_t *p, vtid, vstart, vend; struct vals_t *vals = NULL; struct val_t *v = NULL; if(sz) { compressed = 1; buf = malloc(sz); } sz = 0; //This is now the size of the compressed buffer if(bwSetPos(fp, o->offset[i])) goto error; vals = calloc(1,sizeof(struct vals_t)); if(!vals) goto error; v = malloc(sizeof(struct val_t)); if(!v) goto error; if(sz < o->size[i]) compBuf = malloc(o->size[i]); if(!compBuf) goto error; if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; if(compressed) { sz = fp->hdr->bufSize; rv = uncompress(buf, &sz, compBuf, o->size[i]); if(rv != Z_OK) goto 
error; } else { buf = compBuf; } p = buf; while(((uLongf) ((void*)p-buf)) < sz) { vtid = p[0]; vstart = p[1]; vend = p[2]; v->nBases = p[3]; v->min = ((float*) p)[4]; v->max = ((float*) p)[5]; v->sum = ((float*) p)[6]; v->sumsq = ((float*) p)[7]; v->scalar = getScalar(start, end, vstart, vend); if(tid == vtid) { if((start <= vstart && end > vstart) || (start < vend && start >= vstart)) { vals->vals = realloc(vals->vals, sizeof(struct val_t*)*(vals->n+1)); if(!vals->vals) goto error; vals->vals[vals->n++] = v; v = malloc(sizeof(struct val_t)); if(!v) goto error; } if(vstart > end) break; } else if(vtid > tid) { break; } p+=8; } free(v); free(buf); free(compBuf); return vals; error: if(buf) free(buf); if(compBuf) free(compBuf); if(v) free(v); destroyVals_t(vals); return NULL; } //On error, errno is set to ENOMEM and NaN is returned (though NaN can be returned normally) static double blockMean(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { uint32_t i, j; double output = 0.0, coverage = 0.0; struct vals_t *v = NULL; if(!blocks->n) return strtod("NaN", NULL); //Iterate over the blocks for(i=0; in; i++) { v = getVals(fp, blocks, i, tid, start, end); if(!v) goto error; for(j=0; jn; j++) { output += v->vals[j]->sum * v->vals[j]->scalar; coverage += v->vals[j]->nBases * v->vals[j]->scalar; } destroyVals_t(v); } if(!coverage) return strtod("NaN", NULL); return output/coverage; error: if(v) free(v); errno = ENOMEM; return strtod("NaN", NULL); } static double intMean(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { double sum = 0.0; uint32_t nBases = 0, i, start_use, end_use; if(!ints->l) return strtod("NaN", NULL); for(i=0; il; i++) { start_use = ints->start[i]; end_use = ints->end[i]; if(ints->start[i] < start) start_use = start; if(ints->end[i] > end) end_use = end; nBases += end_use-start_use; sum += (end_use-start_use)*((double) ints->value[i]); } return sum/nBases; } //Does UCSC compensate for partial block/range 
overlap? static double blockDev(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { uint32_t i, j; double mean = 0.0, ssq = 0.0, coverage = 0.0, diff; struct vals_t *v = NULL; if(!blocks->n) return strtod("NaN", NULL); //Iterate over the blocks for(i=0; in; i++) { v = getVals(fp, blocks, i, tid, start, end); if(!v) goto error; for(j=0; jn; j++) { coverage += v->vals[j]->nBases * v->vals[j]->scalar; mean += v->vals[j]->sum * v->vals[j]->scalar; ssq += v->vals[j]->sumsq * v->vals[j]->scalar; } destroyVals_t(v); v = NULL; } if(coverage<=1.0) return strtod("NaN", NULL); diff = ssq-mean*mean/coverage; if(coverage > 1.0) diff /= coverage-1; if(fabs(diff) > 1e-8) { //Ignore floating point differences return sqrt(diff); } else { return 0.0; } error: if(v) destroyVals_t(v); errno = ENOMEM; return strtod("NaN", NULL); } //This uses compensated summation to account for finite precision math static double intDev(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { double v1 = 0.0, mean, rv; uint32_t nBases = 0, i, start_use, end_use; if(!ints->l) return strtod("NaN", NULL); mean = intMean(ints, start, end); for(i=0; il; i++) { start_use = ints->start[i]; end_use = ints->end[i]; if(ints->start[i] < start) start_use = start; if(ints->end[i] > end) end_use = end; nBases += end_use-start_use; v1 += (end_use-start_use) * pow(ints->value[i]-mean, 2.0); //running sum of squared difference } if(nBases>=2) rv = sqrt(v1/(nBases-1)); else if(nBases==1) rv = sqrt(v1); else rv = strtod("NaN", NULL); return rv; } static double blockMax(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { uint32_t i, j, isNA = 1; double o = strtod("NaN", NULL); struct vals_t *v = NULL; if(!blocks->n) return o; //Iterate the blocks for(i=0; in; i++) { v = getVals(fp, blocks, i, tid, start, end); if(!v) goto error; for(j=0; jn; j++) { if(isNA) { o = v->vals[j]->max; isNA = 0; } else if(v->vals[j]->max > o) { o = 
v->vals[j]->max; } } destroyVals_t(v); } return o; error: destroyVals_t(v); errno = ENOMEM; return strtod("NaN", NULL); } static double intMax(bwOverlappingIntervals_t* ints) { uint32_t i; double o; if(ints->l < 1) return strtod("NaN", NULL); o = ints->value[0]; for(i=1; il; i++) { if(ints->value[i] > o) o = ints->value[i]; } return o; } static double blockMin(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { uint32_t i, j, isNA = 1; double o = strtod("NaN", NULL); struct vals_t *v = NULL; if(!blocks->n) return o; //Iterate the blocks for(i=0; in; i++) { v = getVals(fp, blocks, i, tid, start, end); if(!v) goto error; for(j=0; jn; j++) { if(isNA) { o = v->vals[j]->min; isNA = 0; } else if(v->vals[j]->min < o) o = v->vals[j]->min; } destroyVals_t(v); } return o; error: destroyVals_t(v); errno = ENOMEM; return strtod("NaN", NULL); } static double intMin(bwOverlappingIntervals_t* ints) { uint32_t i; double o; if(ints->l < 1) return strtod("NaN", NULL); o = ints->value[0]; for(i=1; il; i++) { if(ints->value[i] < o) o = ints->value[i]; } return o; } //Does UCSC compensate for only partial block/interval overlap? 
static double blockCoverage(bigWigFile_t *fp, bwOverlapBlock_t *blocks, uint32_t tid, uint32_t start, uint32_t end) { uint32_t i, j; double o = 0.0; struct vals_t *v = NULL; if(!blocks->n) return strtod("NaN", NULL); //Iterate over the blocks for(i=0; in; i++) { v = getVals(fp, blocks, i, tid, start, end); if(!v) goto error; for(j=0; jn; j++) { o+= v->vals[j]->nBases * v->vals[j]->scalar; } destroyVals_t(v); } if(o == 0.0) return strtod("NaN", NULL); return o; error: destroyVals_t(v); errno = ENOMEM; return strtod("NaN", NULL); } static double intCoverage(bwOverlappingIntervals_t* ints, uint32_t start, uint32_t end) { uint32_t i, start_use, end_use; double o = 0.0; if(!ints->l) return strtod("NaN", NULL); for(i=0; il; i++) { start_use = ints->start[i]; end_use = ints->end[i]; if(start_use < start) start_use = start; if(end_use > end) end_use = end; o += end_use - start_use; } return o/(end-start); } //Returns NULL on error, otherwise a double* that needs to be free()d double *bwStatsFromZoom(bigWigFile_t *fp, int32_t level, uint32_t tid, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { bwOverlapBlock_t *blocks = NULL; double *output = NULL; uint32_t pos = start, i, end2; if(!fp->hdr->zoomHdrs->idx[level]) { fp->hdr->zoomHdrs->idx[level] = bwReadIndex(fp, fp->hdr->zoomHdrs->indexOffset[level]); if(!fp->hdr->zoomHdrs->idx[level]) return NULL; } output = malloc(sizeof(double)*nBins); if(!output) return NULL; for(i=0, pos=start; ihdr->zoomHdrs->idx[level]->root, tid, pos, end2); if(!blocks) goto error; switch(type) { case 0: //mean output[i] = blockMean(fp, blocks, tid, pos, end2); break; case 1: //stdev output[i] = blockDev(fp, blocks, tid, pos, end2); break; case 2: //max output[i] = blockMax(fp, blocks, tid, pos, end2); break; case 3: //min output[i] = blockMin(fp, blocks, tid, pos, end2); break; case 4: //cov output[i] = blockCoverage(fp, blocks, tid, pos, end2)/(end2-pos); break; default: goto error; break; } if(errno) goto error; 
destroyBWOverlapBlock(blocks); pos = end2; } return output; error: fprintf(stderr, "got an error in bwStatsFromZoom in the range %"PRIu32"-%"PRIu32": %s\n", pos, end2, strerror(errno)); if(blocks) destroyBWOverlapBlock(blocks); if(output) free(output); return NULL; } double *bwStatsFromFull(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, uint32_t nBins, enum bwStatsType type) { bwOverlappingIntervals_t *ints = NULL; double *output = malloc(sizeof(double)*nBins); uint32_t i, pos = start, end2; if(!output) return NULL; for(i=0; i #include #include static uint32_t roundup(uint32_t v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } //Returns the root node on success and NULL on error static bwRTree_t *readRTreeIdx(bigWigFile_t *fp, uint64_t offset) { uint32_t magic; bwRTree_t *node; if(!offset) { if(bwSetPos(fp, fp->hdr->indexOffset)) return NULL; } else { if(bwSetPos(fp, offset)) return NULL; } if(bwRead(&magic, sizeof(uint32_t), 1, fp) != 1) return NULL; if(magic != IDX_MAGIC) { fprintf(stderr, "[readRTreeIdx] Mismatch in the magic number!\n"); return NULL; } node = malloc(sizeof(bwRTree_t)); if(!node) return NULL; if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->nItems), sizeof(uint64_t), 1, fp) != 1) goto error; if(bwRead(&(node->chrIdxStart), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->baseStart), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->chrIdxEnd), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->baseEnd), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->idxSize), sizeof(uint64_t), 1, fp) != 1) goto error; if(bwRead(&(node->nItemsPerSlot), sizeof(uint32_t), 1, fp) != 1) goto error; //Padding if(bwRead(&(node->blockSize), sizeof(uint32_t), 1, fp) != 1) goto error; node->rootOffset = bwTell(fp); return node; error: free(node); return NULL; } //Returns a bwRTreeNode_t on success and NULL on an error //For the 
root node, set offset to 0 static bwRTreeNode_t *bwGetRTreeNode(bigWigFile_t *fp, uint64_t offset) { bwRTreeNode_t *node = NULL; uint8_t padding; uint16_t i; if(offset) { if(bwSetPos(fp, offset)) return NULL; } else { //seek if(bwSetPos(fp, fp->idx->rootOffset)) return NULL; } node = calloc(1, sizeof(bwRTreeNode_t)); if(!node) return NULL; if(bwRead(&(node->isLeaf), sizeof(uint8_t), 1, fp) != 1) goto error; if(bwRead(&padding, sizeof(uint8_t), 1, fp) != 1) goto error; if(bwRead(&(node->nChildren), sizeof(uint16_t), 1, fp) != 1) goto error; node->chrIdxStart = malloc(sizeof(uint32_t)*(node->nChildren)); if(!node->chrIdxStart) goto error; node->baseStart = malloc(sizeof(uint32_t)*(node->nChildren)); if(!node->baseStart) goto error; node->chrIdxEnd = malloc(sizeof(uint32_t)*(node->nChildren)); if(!node->chrIdxEnd) goto error; node->baseEnd = malloc(sizeof(uint32_t)*(node->nChildren)); if(!node->baseEnd) goto error; node->dataOffset = malloc(sizeof(uint64_t)*(node->nChildren)); if(!node->dataOffset) goto error; if(node->isLeaf) { node->x.size = malloc(node->nChildren * sizeof(uint64_t)); if(!node->x.size) goto error; } else { node->x.child = calloc(node->nChildren, sizeof(struct bwRTreeNode_t *)); if(!node->x.child) goto error; } for(i=0; inChildren; i++) { if(bwRead(&(node->chrIdxStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->baseStart[i]), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->chrIdxEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->baseEnd[i]), sizeof(uint32_t), 1, fp) != 1) goto error; if(bwRead(&(node->dataOffset[i]), sizeof(uint64_t), 1, fp) != 1) goto error; if(node->isLeaf) { if(bwRead(&(node->x.size[i]), sizeof(uint64_t), 1, fp) != 1) goto error; } } return node; error: if(node->chrIdxStart) free(node->chrIdxStart); if(node->baseStart) free(node->baseStart); if(node->chrIdxEnd) free(node->chrIdxEnd); if(node->baseEnd) free(node->baseEnd); if(node->dataOffset) free(node->dataOffset); 
if(node->isLeaf && node->x.size) free(node->x.size); else if((!node->isLeaf) && node->x.child) free(node->x.child); free(node); return NULL; } void destroyBWOverlapBlock(bwOverlapBlock_t *b) { if(!b) return; if(b->size) free(b->size); if(b->offset) free(b->offset); free(b); } //Returns a bwOverlapBlock_t * object or NULL on error. static bwOverlapBlock_t *overlapsLeaf(bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { uint16_t i, idx = 0; bwOverlapBlock_t *o = calloc(1, sizeof(bwOverlapBlock_t)); if(!o) return NULL; for(i=0; inChildren; i++) { if(tid < node->chrIdxStart[i]) break; if(tid > node->chrIdxEnd[i]) continue; /* The individual blocks can theoretically span multiple contigs. So if we treat the first/last contig in the range as special but anything in the middle is a guaranteed match */ if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { if(tid == node->chrIdxStart[i]) { if(node->baseStart[i] >= end) break; } else if(tid == node->chrIdxEnd[i]) { if(node->baseEnd[i] <= start) continue; } } else { if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; } o->n++; } if(o->n) { o->offset = malloc(sizeof(uint64_t) * (o->n)); if(!o->offset) goto error; o->size = malloc(sizeof(uint64_t) * (o->n)); if(!o->size) goto error; for(i=0; inChildren; i++) { if(tid < node->chrIdxStart[i]) break; if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { if(tid == node->chrIdxStart[i]) { if(node->baseStart[i] >= end) continue; } else if(tid == node->chrIdxEnd[i]) { if(node->baseEnd[i] <= start) continue; } } else { if(node->baseStart[i] >= end || node->baseEnd[i] <= start) continue; } o->offset[idx] = node->dataOffset[i]; o->size[idx++] = node->x.size[i]; if(idx >= o->n) break; } } if(idx != o->n) { //This should never happen fprintf(stderr, "[overlapsLeaf] Mismatch between number of overlaps calculated and found!\n"); goto error; } return o; error: if(o) destroyBWOverlapBlock(o); return 
NULL; } //This will free l2 unless there's an error! //Returns NULL on error, otherwise the merged lists static bwOverlapBlock_t *mergeOverlapBlocks(bwOverlapBlock_t *b1, bwOverlapBlock_t *b2) { uint64_t i,j; if(!b2) return b1; if(!b2->n) { destroyBWOverlapBlock(b2); return b1; } if(!b1->n) { destroyBWOverlapBlock(b1); return b2; } j = b1->n; b1->n += b2->n; b1->offset = realloc(b1->offset, sizeof(uint64_t) * (b1->n+b2->n)); if(!b1->offset) goto error; b1->size = realloc(b1->size, sizeof(uint64_t) * (b1->n+b2->n)); if(!b1->size) goto error; for(i=0; in; i++) { b1->offset[j+i] = b2->offset[i]; b1->size[j+i] = b2->size[i]; } destroyBWOverlapBlock(b2); return b1; error: destroyBWOverlapBlock(b1); return NULL; } //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned //The output needs to be free()d if not NULL (likewise with *sizes) static bwOverlapBlock_t *overlapsNonLeaf(bigWigFile_t *fp, bwRTreeNode_t *node, uint32_t tid, uint32_t start, uint32_t end) { uint16_t i; bwOverlapBlock_t *nodeBlocks, *output = calloc(1, sizeof(bwOverlapBlock_t)); if(!output) return NULL; for(i=0; inChildren; i++) { if(tid < node->chrIdxStart[i]) break; if(tid < node->chrIdxStart[i] || tid > node->chrIdxEnd[i]) continue; if(node->chrIdxStart[i] != node->chrIdxEnd[i]) { //child spans contigs if(tid == node->chrIdxStart[i]) { if(node->baseStart[i] >= end) continue; } else if(tid == node->chrIdxEnd[i]) { if(node->baseEnd[i] <= start) continue; } } else { if(end <= node->baseStart[i] || start >= node->baseEnd[i]) continue; } //We have an overlap! 
if(!node->x.child[i]) node->x.child[i] = bwGetRTreeNode(fp, node->dataOffset[i]); if(!node->x.child[i]) goto error; if(node->x.child[i]->isLeaf) { //leaf nodeBlocks = overlapsLeaf(node->x.child[i], tid, start, end); } else { //non-leaf nodeBlocks = overlapsNonLeaf(fp, node->x.child[i], tid, start, end); } //The output is processed the same regardless of leaf/non-leaf if(!nodeBlocks) goto error; else { output = mergeOverlapBlocks(output, nodeBlocks); if(!output) { destroyBWOverlapBlock(nodeBlocks); goto error; } } } return output; error: destroyBWOverlapBlock(output); return NULL; } //Returns NULL and sets nOverlaps to >0 on error, otherwise nOverlaps is the number of file offsets returned //The output must be free()d bwOverlapBlock_t *walkRTreeNodes(bigWigFile_t *bw, bwRTreeNode_t *root, uint32_t tid, uint32_t start, uint32_t end) { if(root->isLeaf) return overlapsLeaf(root, tid, start, end); return overlapsNonLeaf(bw, root, tid, start, end); } //In reality, a hash or some sort of tree structure is probably faster... //Return -1 (AKA 0xFFFFFFFF...) on "not there", so we can hold (2^32)-1 items. 
uint32_t bwGetTid(bigWigFile_t *fp, char *chrom) { uint32_t i; if(!chrom) return -1; for(i=0; icl->nKeys; i++) { if(strcmp(chrom, fp->cl->chrom[i]) == 0) return i; } return -1; } static bwOverlapBlock_t *bwGetOverlappingBlocks(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end) { uint32_t tid = bwGetTid(fp, chrom); if(tid == (uint32_t) -1) { fprintf(stderr, "[bwGetOverlappingBlocks] Non-existent contig: %s\n", chrom); return NULL; } //Get the info if needed if(!fp->idx) { fp->idx = readRTreeIdx(fp, fp->hdr->indexOffset); if(!fp->idx) { return NULL; } } if(!fp->idx->root) fp->idx->root = bwGetRTreeNode(fp, 0); if(!fp->idx->root) return NULL; return walkRTreeNodes(fp, fp->idx->root, tid, start, end); } void bwFillDataHdr(bwDataHeader_t *hdr, void *b) { hdr->tid = ((uint32_t*)b)[0]; hdr->start = ((uint32_t*)b)[1]; hdr->end = ((uint32_t*)b)[2]; hdr->step = ((uint32_t*)b)[3]; hdr->span = ((uint32_t*)b)[4]; hdr->type = ((uint8_t*)b)[20]; hdr->nItems = ((uint16_t*)b)[11]; } void bwDestroyOverlappingIntervals(bwOverlappingIntervals_t *o) { if(!o) return; if(o->start) free(o->start); if(o->end) free(o->end); if(o->value) free(o->value); free(o); } void bbDestroyOverlappingEntries(bbOverlappingEntries_t *o) { uint32_t i; if(!o) return; if(o->start) free(o->start); if(o->end) free(o->end); if(o->str) { for(i=0; il; i++) { if(o->str[i]) free(o->str[i]); } free(o->str); } free(o); } //Returns NULL on error, in which case o has been free()d static bwOverlappingIntervals_t *pushIntervals(bwOverlappingIntervals_t *o, uint32_t start, uint32_t end, float value) { if(o->l+1 >= o->m) { o->m = roundup(o->l+1); o->start = realloc(o->start, o->m * sizeof(uint32_t)); if(!o->start) goto error; o->end = realloc(o->end, o->m * sizeof(uint32_t)); if(!o->end) goto error; o->value = realloc(o->value, o->m * sizeof(float)); if(!o->value) goto error; } o->start[o->l] = start; o->end[o->l] = end; o->value[o->l++] = value; return o; error: bwDestroyOverlappingIntervals(o); return NULL; } 
static bbOverlappingEntries_t *pushBBIntervals(bbOverlappingEntries_t *o, uint32_t start, uint32_t end, char *str, int withString) { if(o->l+1 >= o->m) { o->m = roundup(o->l+1); o->start = realloc(o->start, o->m * sizeof(uint32_t)); if(!o->start) goto error; o->end = realloc(o->end, o->m * sizeof(uint32_t)); if(!o->end) goto error; if(withString) { o->str = realloc(o->str, o->m * sizeof(char**)); if(!o->str) goto error; } } o->start[o->l] = start; o->end[o->l] = end; if(withString) o->str[o->l] = strdup(str); o->l++; return o; error: bbDestroyOverlappingEntries(o); return NULL; } //Returns NULL on error bwOverlappingIntervals_t *bwGetOverlappingIntervalsCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend) { uint64_t i; uint16_t j; int compressed = 0, rv; uLongf sz = fp->hdr->bufSize, tmp; void *buf = NULL, *compBuf = NULL; uint32_t start = 0, end , *p; float value; bwDataHeader_t hdr; bwOverlappingIntervals_t *output = calloc(1, sizeof(bwOverlappingIntervals_t)); if(!output) goto error; if(!o) return output; if(!o->n) return output; if(sz) { compressed = 1; buf = malloc(sz); } sz = 0; //This is now the size of the compressed buffer for(i=0; in; i++) { if(bwSetPos(fp, o->offset[i])) goto error; if(sz < o->size[i]) { compBuf = realloc(compBuf, o->size[i]); sz = o->size[i]; } if(!compBuf) goto error; if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; if(compressed) { tmp = fp->hdr->bufSize; //This gets over-written by uncompress rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); if(rv != Z_OK) goto error; } else { buf = compBuf; } //TODO: ensure that tmp is large enough! 
bwFillDataHdr(&hdr, buf); p = ((uint32_t*) buf); p += 6; if(hdr.tid != tid) continue; if(hdr.type == 3) start = hdr.start - hdr.step; //FIXME: We should ensure that sz is large enough to hold nItems of the given type for(j=0; j= oend) continue; //Push the overlap if(!pushIntervals(output, start, end, value)) goto error; } } if(compressed && buf) free(buf); if(compBuf) free(compBuf); return output; error: fprintf(stderr, "[bwGetOverlappingIntervalsCore] Got an error\n"); if(output) bwDestroyOverlappingIntervals(output); if(compressed && buf) free(buf); if(compBuf) free(compBuf); return NULL; } bbOverlappingEntries_t *bbGetOverlappingEntriesCore(bigWigFile_t *fp, bwOverlapBlock_t *o, uint32_t tid, uint32_t ostart, uint32_t oend, int withString) { uint64_t i; int compressed = 0, rv, slen; uLongf sz = fp->hdr->bufSize, tmp = 0; void *buf = NULL, *bufEnd = NULL, *compBuf = NULL; uint32_t entryTid = 0, start = 0, end; char *str; bbOverlappingEntries_t *output = calloc(1, sizeof(bbOverlappingEntries_t)); if(!output) goto error; if(!o) return output; if(!o->n) return output; if(sz) { compressed = 1; buf = malloc(sz); } sz = 0; //This is now the size of the compressed buffer for(i=0; in; i++) { if(bwSetPos(fp, o->offset[i])) goto error; if(sz < o->size[i]) { compBuf = realloc(compBuf, o->size[i]); sz = o->size[i]; } if(!compBuf) goto error; if(bwRead(compBuf, o->size[i], 1, fp) != 1) goto error; if(compressed) { tmp = fp->hdr->bufSize; //This gets over-written by uncompress rv = uncompress(buf, (uLongf *) &tmp, compBuf, o->size[i]); if(rv != Z_OK) goto error; } else { buf = compBuf; tmp = o->size[i]; //TODO: Is this correct? Do non-gzipped bigBeds exist? 
} bufEnd = buf + tmp; while(buf < bufEnd) { entryTid = ((uint32_t*)buf)[0]; start = ((uint32_t*)buf)[1]; end = ((uint32_t*)buf)[2]; buf += 12; str = (char*)buf; slen = strlen(str) + 1; buf += slen; if(entryTid < tid) continue; if(entryTid > tid) break; if(end <= ostart) continue; if(start >= oend) break; //Push the overlap if(!pushBBIntervals(output, start, end, str, withString)) goto error; } buf = bufEnd - tmp; //reset the buffer pointer } if(compressed && buf) free(buf); if(compBuf) free(compBuf); return output; error: fprintf(stderr, "[bbGetOverlappingEntriesCore] Got an error\n"); buf = bufEnd - tmp; if(output) bbDestroyOverlappingEntries(output); if(compressed && buf) free(buf); if(compBuf) free(compBuf); return NULL; } //Returns NULL on error OR no intervals, which is a bad design... bwOverlappingIntervals_t *bwGetOverlappingIntervals(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end) { bwOverlappingIntervals_t *output; uint32_t tid = bwGetTid(fp, chrom); if(tid == (uint32_t) -1) return NULL; bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); if(!blocks) return NULL; output = bwGetOverlappingIntervalsCore(fp, blocks, tid, start, end); destroyBWOverlapBlock(blocks); return output; } //Like above, but for bigBed files bbOverlappingEntries_t *bbGetOverlappingEntries(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int withString) { bbOverlappingEntries_t *output; uint32_t tid = bwGetTid(fp, chrom); if(tid == (uint32_t) -1) return NULL; bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(fp, chrom, start, end); if(!blocks) return NULL; output = bbGetOverlappingEntriesCore(fp, blocks, tid, start, end, withString); destroyBWOverlapBlock(blocks); return output; } //Returns NULL on error bwOverlapIterator_t *bwOverlappingIntervalsIterator(bigWigFile_t *bw, char *chrom, uint32_t start, uint32_t end, uint32_t blocksPerIteration) { bwOverlapIterator_t *output = NULL; uint64_t n; uint32_t tid = bwGetTid(bw, chrom); if(tid == 
(uint32_t) -1) return output; output = calloc(1, sizeof(bwOverlapIterator_t)); if(!output) return output; bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(bw, chrom, start, end); output->bw = bw; output->tid = tid; output->start = start; output->end = end; output->blocks = blocks; output->blocksPerIteration = blocksPerIteration; if(blocks) { n = blocks->n; if(n>blocksPerIteration) blocks->n = blocksPerIteration; output->intervals = bwGetOverlappingIntervalsCore(bw, blocks,tid, start, end); blocks->n = n; output->offset = blocksPerIteration; } output->data = output->intervals; return output; } //Returns NULL on error bwOverlapIterator_t *bbOverlappingEntriesIterator(bigWigFile_t *bw, char *chrom, uint32_t start, uint32_t end, int withString, uint32_t blocksPerIteration) { bwOverlapIterator_t *output = NULL; uint64_t n; uint32_t tid = bwGetTid(bw, chrom); if(tid == (uint32_t) -1) return output; output = calloc(1, sizeof(bwOverlapIterator_t)); if(!output) return output; bwOverlapBlock_t *blocks = bwGetOverlappingBlocks(bw, chrom, start, end); output->bw = bw; output->tid = tid; output->start = start; output->end = end; output->blocks = blocks; output->blocksPerIteration = blocksPerIteration; output->withString = withString; if(blocks) { n = blocks->n; if(n>blocksPerIteration) blocks->n = blocksPerIteration; output->entries = bbGetOverlappingEntriesCore(bw, blocks,tid, start, end, withString); blocks->n = n; output->offset = blocksPerIteration; } output->data = output->entries; return output; } void bwIteratorDestroy(bwOverlapIterator_t *iter) { if(!iter) return; if(iter->blocks) destroyBWOverlapBlock((bwOverlapBlock_t*) iter->blocks); if(iter->intervals) bwDestroyOverlappingIntervals(iter->intervals); if(iter->entries) bbDestroyOverlappingEntries(iter->entries); free(iter); } //On error, points to NULL and destroys the input bwOverlapIterator_t *bwIteratorNext(bwOverlapIterator_t *iter) { uint64_t n, *offset, *size; bwOverlapBlock_t *blocks = iter->blocks; 
if(iter->intervals) { bwDestroyOverlappingIntervals(iter->intervals); iter->intervals = NULL; } if(iter->entries) { bbDestroyOverlappingEntries(iter->entries); iter->entries = NULL; } iter->data = NULL; if(iter->offset < blocks->n) { //store the previous values n = blocks->n; offset = blocks->offset; size = blocks->size; //Move the start of the blocks blocks->offset += iter->offset; blocks->size += iter->offset; if(iter->offset + iter->blocksPerIteration > n) { blocks->n = blocks->n - iter->offset; } else { blocks->n = iter->blocksPerIteration; } //Get the intervals or entries, as appropriate if(iter->bw->type == 0) { //bigWig iter->intervals = bwGetOverlappingIntervalsCore(iter->bw, blocks, iter->tid, iter->start, iter->end); iter->data = iter->intervals; } else { //bigBed iter->entries = bbGetOverlappingEntriesCore(iter->bw, blocks, iter->tid, iter->start, iter->end, iter->withString); iter->data = iter->entries; } iter->offset += iter->blocksPerIteration; //reset the values in iter->blocks blocks->n = n; blocks->offset = offset; blocks->size = size; //Check for error if(!iter->intervals && !iter->entries) goto error; } return iter; error: bwIteratorDestroy(iter); return NULL; } //This is like bwGetOverlappingIntervals, except it returns 1 base windows. If includeNA is not 0, then a value will be returned for every position in the range (defaulting to NAN). 
//The ->end member is NULL //If includeNA is not 0 then ->start is also NULL, since it's implied //Note that bwDestroyOverlappingIntervals() will work in either case bwOverlappingIntervals_t *bwGetValues(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t end, int includeNA) { uint32_t i, j, n; bwOverlappingIntervals_t *output = NULL; bwOverlappingIntervals_t *intermediate = bwGetOverlappingIntervals(fp, chrom, start, end); if(!intermediate) return NULL; output = calloc(1, sizeof(bwOverlappingIntervals_t)); if(!output) goto error; if(includeNA) { output->l = end-start; output->value = malloc((end-start)*sizeof(float)); if(!output->value) goto error; for(i=0; ivalue[i] = strtod("NaN", NULL); for(i=0; il; i++) { for(j=intermediate->start[i]; jend[i]; j++) { if(j < start || j >= end) continue; output->value[j-start] = intermediate->value[i]; } } } else { n = 0; for(i=0; il; i++) { if(intermediate->start[i] < start) intermediate->start[i] = start; if(intermediate->end[i] > end) intermediate->end[i] = end; n += intermediate->end[i]-intermediate->start[i]; } output->l = n; output->start = malloc(sizeof(uint32_t)*n); if(!output->start) goto error; output->value = malloc(sizeof(float)*n); if(!output->value) goto error; n = 0; //this is now the index for(i=0; il; i++) { for(j=intermediate->start[i]; jend[i]; j++) { if(j < start || j >= end) continue; output->start[n] = j; output->value[n++] = intermediate->value[i]; } } } bwDestroyOverlappingIntervals(intermediate); return output; error: if(intermediate) bwDestroyOverlappingIntervals(intermediate); if(output) bwDestroyOverlappingIntervals(output); return NULL; } void bwDestroyIndexNode(bwRTreeNode_t *node) { uint16_t i; if(!node) return; free(node->chrIdxStart); free(node->baseStart); free(node->chrIdxEnd); free(node->baseEnd); free(node->dataOffset); if(!node->isLeaf) { for(i=0; inChildren; i++) { bwDestroyIndexNode(node->x.child[i]); } free(node->x.child); } else { free(node->x.size); } free(node); } void 
bwDestroyIndex(bwRTree_t *idx) { bwDestroyIndexNode(idx->root); free(idx); } //Returns a pointer to the requested index (@offset, unless it's 0, in which case the index for the values is returned //Returns NULL on error bwRTree_t *bwReadIndex(bigWigFile_t *fp, uint64_t offset) { bwRTree_t *idx = readRTreeIdx(fp, offset); if(!idx) return NULL; //Read in the root node idx->root = bwGetRTreeNode(fp, idx->rootOffset); if(!idx->root) { bwDestroyIndex(idx); return NULL; } return idx; } pyBigWig-0.3.2/libBigWig/bwValues.h0000640000201600010240000000737412715313174017456 0ustar ryanbioinfo00000000000000#include /*! \file bwValues.h * * You should not directly use functions and structures defined here. They're really meant for internal use only. * * All of the structures here need to be destroyed or you'll leak memory! There are methods available to destroy anything that you need to take care of yourself. */ //N.B., coordinates are still 0-based half open! /*! * @brief A node within an R-tree holding the index for data. * * Note that there are two types of nodes: leaf and twig. Leaf nodes point to where data actually is. Twig nodes point to additional index nodes, which may or may not be leaves. Each of these nodes has additional children, which may span multiple chromosomes/contigs. * * With the start/end position, these positions refer specifically to the chromosomes specified in chrIdxStart/chrIdxEnd. Any chromosomes between these are completely spanned by a given child. */ typedef struct bwRTreeNode_t { uint8_t isLeaf; /** #include #include #include #include #include "bigWig.h" #include "bwCommon.h" /// @cond SKIP struct val_t { uint32_t tid; uint32_t start; uint32_t nBases; float min, max, sum, sumsq; double scalar; struct val_t *next; }; /// @endcond //Create a chromList_t and attach it to a bigWigFile_t *. 
Returns NULL on error //Note that chroms and lengths are duplicated, so you MUST free the input chromList_t *bwCreateChromList(char **chroms, uint32_t *lengths, int64_t n) { int64_t i = 0; chromList_t *cl = calloc(1, sizeof(chromList_t)); if(!cl) return NULL; cl->nKeys = n; cl->chrom = malloc(sizeof(char*)*n); cl->len = malloc(sizeof(uint32_t)*n); if(!cl->chrom) goto error; if(!cl->len) goto error; for(i=0; ilen[i] = lengths[i]; cl->chrom[i] = strdup(chroms[i]); if(!cl->chrom[i]) goto error; } return cl; error: if(i) { int64_t j; for(j=0; jchrom[j]); } if(cl) { if(cl->chrom) free(cl->chrom); if(cl->len) free(cl->len); free(cl); } return NULL; } //If maxZooms == 0, then 0 is used (i.e., there are no zoom levels). If maxZooms < 0 or > 65535 then 10 is used. //TODO allow changing bufSize and blockSize int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms) { if(!fp->isWrite) return 1; bigWigHdr_t *hdr = calloc(1, sizeof(bigWigHdr_t)); if(!hdr) return 2; hdr->version = 4; if(maxZooms < 0 || maxZooms > 65535) { hdr->nLevels = 10; } else { hdr->nLevels = maxZooms; } hdr->bufSize = 32768; //When the file is finalized this is reset if fp->writeBuffer->compressPsz is 0! hdr->minVal = DBL_MAX; hdr->maxVal = DBL_MIN; fp->hdr = hdr; fp->writeBuffer->blockSize = 64; //Allocate the writeBuffer buffers fp->writeBuffer->compressPsz = compressBound(hdr->bufSize); fp->writeBuffer->compressP = malloc(fp->writeBuffer->compressPsz); if(!fp->writeBuffer->compressP) return 3; fp->writeBuffer->p = calloc(1,hdr->bufSize); if(!fp->writeBuffer->p) return 4; return 0; } //return 0 on success static int writeAtPos(void *ptr, size_t sz, size_t nmemb, size_t pos, FILE *fp) { size_t curpos = ftell(fp); if(fseek(fp, pos, SEEK_SET)) return 1; if(fwrite(ptr, sz, nmemb, fp) != nmemb) return 2; if(fseek(fp, curpos, SEEK_SET)) return 3; return 0; } //Are nblocks and nperblock correct? 
//We lose keySize bytes on error static int writeChromList(FILE *fp, chromList_t *cl) { uint16_t k; uint32_t j, magic = CIRTREE_MAGIC; uint32_t nperblock = (cl->nKeys>0xFFFF)?-1:cl->nKeys; //Items per leaf/non-leaf uint32_t nblocks = (cl->nKeys>>16)+1, keySize = 0, valSize = 8; //does the valSize even matter? I ignore it... uint64_t i, written = 0; uint8_t eight; int64_t i64; char *chrom; size_t l; for(i64=0; i64nKeys; i64++) { l = strlen(cl->chrom[i64]); if(l>keySize) keySize = l; } l--; //We don't null terminate strings, because schiess mich tot chrom = calloc(keySize, sizeof(char)); //Write the root node of a largely pointless tree if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 1; if(fwrite(&nperblock, sizeof(uint32_t), 1, fp) != 1) return 2; if(fwrite(&keySize, sizeof(uint32_t), 1, fp) != 1) return 3; if(fwrite(&valSize, sizeof(uint32_t), 1, fp) != 1) return 4; if(fwrite(&(cl->nKeys), sizeof(uint64_t), 1, fp) != 1) return 5; //Padding? i=0; if(fwrite(&i, sizeof(uint64_t), 1, fp) != 1) return 6; //Do we need a non-leaf node? if(nblocks>1) { eight = 0; if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 7; if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 8; //padding j = 0; for(i=0; inKeys - written < nperblock) nperblock = cl->nKeys - written; if(fwrite(&nperblock, sizeof(uint16_t), 1, fp) != 1) return 13; for(k=0; k=cl->nKeys) return 14; chrom = strncpy(chrom, cl->chrom[j], keySize); if(fwrite(chrom, keySize, 1, fp) != 1) return 15; if(fwrite(&j, sizeof(uint32_t), 1, fp) != 1) return 16; if(fwrite(&(cl->len[j++]), sizeof(uint32_t), 1, fp) != 1) return 17; written++; } } free(chrom); return 0; } //returns 0 on success //Still need to fill in indexOffset int bwWriteHdr(bigWigFile_t *bw) { uint32_t magic = BIGWIG_MAGIC; uint16_t two = 4; FILE *fp; void *p = calloc(58, sizeof(uint8_t)); //58 bytes of nothing if(!bw->isWrite) return 1; //The header itself, largely just reserving space... 
fp = bw->URL->x.fp; if(!fp) return 2; if(fseek(fp, 0, SEEK_SET)) return 3; if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 4; if(fwrite(&two, sizeof(uint16_t), 1, fp) != 1) return 5; if(fwrite(p, sizeof(uint8_t), 58, fp) != 58) return 6; //Empty zoom headers if(bw->hdr->nLevels) { for(two=0; twohdr->nLevels; two++) { if(fwrite(p, sizeof(uint8_t), 24, fp) != 24) return 9; } } //Update summaryOffset and write an empty summary block bw->hdr->summaryOffset = ftell(fp); if(fwrite(p, sizeof(uint8_t), 40, fp) != 40) return 10; if(writeAtPos(&(bw->hdr->summaryOffset), sizeof(uint64_t), 1, 0x2c, fp)) return 11; //Write the chromosome list as a stupid freaking tree (because let's TREE ALL THE THINGS!!!) bw->hdr->ctOffset = ftell(fp); if(writeChromList(fp, bw->cl)) return 7; if(writeAtPos(&(bw->hdr->ctOffset), sizeof(uint64_t), 1, 0x8, fp)) return 8; //Update the dataOffset bw->hdr->dataOffset = ftell(fp); if(writeAtPos(&bw->hdr->dataOffset, sizeof(uint64_t), 1, 0x10, fp)) return 12; //Save space for the number of blocks if(fwrite(p, sizeof(uint8_t), 8, fp) != 8) return 13; free(p); return 0; } static int insertIndexNode(bigWigFile_t *fp, bwRTreeNode_t *leaf) { bwLL *l = malloc(sizeof(bwLL)); if(!l) return 1; l->node = leaf; l->next = NULL; if(!fp->writeBuffer->firstIndexNode) { fp->writeBuffer->firstIndexNode = l; } else { fp->writeBuffer->currentIndexNode->next = l; } fp->writeBuffer->currentIndexNode = l; return 0; } //0 on success static int appendIndexNodeEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) { bwLL *n = fp->writeBuffer->currentIndexNode; if(!n) return 1; if(n->node->nChildren >= fp->writeBuffer->blockSize) return 2; n->node->chrIdxStart[n->node->nChildren] = tid0; n->node->baseStart[n->node->nChildren] = start; n->node->chrIdxEnd[n->node->nChildren] = tid1; n->node->baseEnd[n->node->nChildren] = end; n->node->dataOffset[n->node->nChildren] = offset; n->node->x.size[n->node->nChildren] 
= size; n->node->nChildren++; return 0; } //Returns 0 on success static int addIndexEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) { bwRTreeNode_t *node; if(appendIndexNodeEntry(fp, tid0, tid1, start, end, offset, size)) { //The last index node is full, we need to add a new one node = calloc(1, sizeof(bwRTreeNode_t)); if(!node) return 1; //Allocate and set the fields node->isLeaf = 1; node->nChildren = 1; node->chrIdxStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); if(!node->chrIdxStart) goto error; node->baseStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); if(!node->baseStart) goto error; node->chrIdxEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); if(!node->chrIdxEnd) goto error; node->baseEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize); if(!node->baseEnd) goto error; node->dataOffset = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize); if(!node->dataOffset) goto error; node->x.size = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize); if(!node->x.size) goto error; node->chrIdxStart[0] = tid0; node->baseStart[0] = start; node->chrIdxEnd[0] = tid1; node->baseEnd[0] = end; node->dataOffset[0] = offset; node->x.size[0] = size; if(insertIndexNode(fp, node)) goto error; } return 0; error: if(node->chrIdxStart) free(node->chrIdxStart); if(node->baseStart) free(node->baseStart); if(node->chrIdxEnd) free(node->chrIdxEnd); if(node->baseEnd) free(node->baseEnd); if(node->dataOffset) free(node->dataOffset); if(node->x.size) free(node->x.size); return 2; } /* * TODO: * The buffer size and compression sz need to be determined elsewhere (and p and compressP filled in!) 
*/ static int flushBuffer(bigWigFile_t *fp) { bwWriteBuffer_t *wb = fp->writeBuffer; uLongf sz = wb->compressPsz; uint16_t nItems; if(!fp->writeBuffer->l) return 0; if(!wb->ltype) return 0; //Fill in the header if(!memcpy(wb->p, &(wb->tid), sizeof(uint32_t))) return 1; if(!memcpy(wb->p+4, &(wb->start), sizeof(uint32_t))) return 2; if(!memcpy(wb->p+8, &(wb->end), sizeof(uint32_t))) return 3; if(!memcpy(wb->p+12, &(wb->step), sizeof(uint32_t))) return 4; if(!memcpy(wb->p+16, &(wb->span), sizeof(uint32_t))) return 5; if(!memcpy(wb->p+20, &(wb->ltype), sizeof(uint8_t))) return 6; //1 byte padding //Determine the number of items switch(wb->ltype) { case 1: nItems = (wb->l-24)/12; break; case 2: nItems = (wb->l-24)/8; break; case 3: nItems = (wb->l-24)/4; break; default: return 7; } if(!memcpy(wb->p+22, &nItems, sizeof(uint16_t))) return 8; if(sz) { //compress if(compress(wb->compressP, &sz, wb->p, wb->l) != Z_OK) return 9; //write the data to disk if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 10; } else { sz = wb->l; if(fwrite(wb->p, sizeof(uint8_t), wb->l, fp->URL->x.fp) != wb->l) return 10; } //Add an entry into the index if(addIndexEntry(fp, wb->tid, wb->tid, wb->start, wb->end, bwTell(fp)-sz, sz)) return 11; wb->nBlocks++; wb->l = 24; return 0; } static void updateStats(bigWigFile_t *fp, uint32_t span, float val) { if(val < fp->hdr->minVal) fp->hdr->minVal = val; else if(val > fp->hdr->maxVal) fp->hdr->maxVal = val; fp->hdr->nBasesCovered += span; fp->hdr->sumData += span*val; fp->hdr->sumSquared += span*pow(val,2); fp->writeBuffer->nEntries++; fp->writeBuffer->runningWidthSum += span; } //12 bytes per entry int bwAddIntervals(bigWigFile_t *fp, char **chrom, uint32_t *start, uint32_t *end, float *values, uint32_t n) { uint32_t tid = 0, i; char *lastChrom = NULL; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; //Not an error per se if(!fp->isWrite) return 1; if(!wb) return 2; //Flush if needed if(wb->ltype != 1) 
if(flushBuffer(fp)) return 3; if(wb->l+36 > fp->hdr->bufSize) if(flushBuffer(fp)) return 4; lastChrom = chrom[0]; tid = bwGetTid(fp, chrom[0]); if(tid == (uint32_t) -1) return 5; if(tid != wb->tid) { if(flushBuffer(fp)) return 6; wb->tid = tid; wb->start = start[0]; wb->end = end[0]; } //Ensure that everything is set correctly wb->ltype = 1; if(wb->l <= 24) { wb->start = start[0]; wb->span = 0; wb->step = 0; } if(!memcpy(wb->p+wb->l, start, sizeof(uint32_t))) return 7; if(!memcpy(wb->p+wb->l+4, end, sizeof(uint32_t))) return 8; if(!memcpy(wb->p+wb->l+8, values, sizeof(float))) return 9; updateStats(fp, end[0]-start[0], values[0]); wb->l += 12; for(i=1; iend = end[i-1]; flushBuffer(fp); lastChrom = chrom[i]; tid = bwGetTid(fp, chrom[i]); if(tid == (uint32_t) -1) return 10; wb->tid = tid; wb->start = start[i]; } if(wb->l+12 > fp->hdr->bufSize) { //12 bytes/entry wb->end = end[i-1]; flushBuffer(fp); wb->start = start[i]; } if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 11; if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 12; if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 13; updateStats(fp, end[i]-start[i], values[i]); wb->l += 12; } wb->end = end[i-1]; return 0; } int bwAppendIntervals(bigWigFile_t *fp, uint32_t *start, uint32_t *end, float *values, uint32_t n) { uint32_t i; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; if(!fp->isWrite) return 1; if(!wb) return 2; if(wb->ltype != 1) return 3; for(i=0; il+12 > fp->hdr->bufSize) { if(i>0) { //otherwise it's already set wb->end = end[i-1]; } flushBuffer(fp); wb->start = start[i]; } if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4; if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 5; if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 6; updateStats(fp, end[i]-start[i], values[i]); wb->l += 12; } wb->end = end[i-1]; return 0; } //8 bytes per entry int bwAddIntervalSpans(bigWigFile_t *fp, char *chrom, uint32_t *start, uint32_t 
span, float *values, uint32_t n) { uint32_t i, tid; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; if(!fp->isWrite) return 1; if(!wb) return 2; if(wb->ltype != 2) if(flushBuffer(fp)) return 3; if(flushBuffer(fp)) return 4; tid = bwGetTid(fp, chrom); if(tid == (uint32_t) -1) return 5; wb->tid = tid; wb->start = start[0]; wb->step = 0; wb->span = span; wb->ltype = 2; for(i=0; il + 8 >= fp->hdr->bufSize) { //8 bytes/entry if(i) wb->end = start[i-1]+span; flushBuffer(fp); wb->start = start[i]; } if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 5; if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 6; updateStats(fp, span, values[i]); wb->l += 8; } wb->end = start[n-1] + span; return 0; } int bwAppendIntervalSpans(bigWigFile_t *fp, uint32_t *start, float *values, uint32_t n) { uint32_t i; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; if(!fp->isWrite) return 1; if(!wb) return 2; if(wb->ltype != 2) return 3; for(i=0; il + 8 >= fp->hdr->bufSize) { if(i) wb->end = start[i-1]+wb->span; flushBuffer(fp); wb->start = start[i]; } if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4; if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 5; updateStats(fp, wb->span, values[i]); wb->l += 8; } wb->end = start[n-1] + wb->span; return 0; } //4 bytes per entry int bwAddIntervalSpanSteps(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t span, uint32_t step, float *values, uint32_t n) { uint32_t i, tid; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; if(!fp->isWrite) return 1; if(!wb) return 2; if(wb->ltype != 3) flushBuffer(fp); if(flushBuffer(fp)) return 3; tid = bwGetTid(fp, chrom); if(tid == (uint32_t) -1) return 4; wb->tid = tid; wb->start = start; wb->step = step; wb->span = span; wb->ltype = 3; for(i=0; il + 4 >= fp->hdr->bufSize) { wb->end = wb->start + ((wb->l-24)>>2) * step; flushBuffer(fp); wb->start = wb->end; } if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 5; updateStats(fp, 
wb->span, values[i]); wb->l += 4; } wb->end = wb->start + (wb->l>>2) * step; return 0; } int bwAppendIntervalSpanSteps(bigWigFile_t *fp, float *values, uint32_t n) { uint32_t i; bwWriteBuffer_t *wb = fp->writeBuffer; if(!n) return 0; if(!fp->isWrite) return 1; if(!wb) return 2; if(wb->ltype != 3) return 3; for(i=0; il + 4 >= fp->hdr->bufSize) { wb->end = wb->start + ((wb->l-24)>>2) * wb->step; flushBuffer(fp); wb->start = wb->end; } if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 4; updateStats(fp, wb->span, values[i]); wb->l += 4; } wb->end = wb->start + (wb->l>>2) * wb->step; return 0; } //0 on success int writeSummary(bigWigFile_t *fp) { if(writeAtPos(&(fp->hdr->nBasesCovered), sizeof(uint64_t), 1, fp->hdr->summaryOffset, fp->URL->x.fp)) return 1; if(writeAtPos(&(fp->hdr->minVal), sizeof(double), 1, fp->hdr->summaryOffset+8, fp->URL->x.fp)) return 2; if(writeAtPos(&(fp->hdr->maxVal), sizeof(double), 1, fp->hdr->summaryOffset+16, fp->URL->x.fp)) return 3; if(writeAtPos(&(fp->hdr->sumData), sizeof(double), 1, fp->hdr->summaryOffset+24, fp->URL->x.fp)) return 4; if(writeAtPos(&(fp->hdr->sumSquared), sizeof(double), 1, fp->hdr->summaryOffset+32, fp->URL->x.fp)) return 5; return 0; } static bwRTreeNode_t *makeEmptyNode(uint32_t blockSize) { bwRTreeNode_t *n = calloc(1, sizeof(bwRTreeNode_t)); if(!n) return NULL; n->chrIdxStart = malloc(blockSize*sizeof(uint32_t)); if(!n->chrIdxStart) goto error; n->baseStart = malloc(blockSize*sizeof(uint32_t)); if(!n->baseStart) goto error; n->chrIdxEnd = malloc(blockSize*sizeof(uint32_t)); if(!n->chrIdxEnd) goto error; n->baseEnd = malloc(blockSize*sizeof(uint32_t)); if(!n->baseEnd) goto error; n->dataOffset = calloc(blockSize,sizeof(uint64_t)); //This MUST be 0 for node writing! 
if(!n->dataOffset) goto error; n->x.child = malloc(blockSize*sizeof(uint64_t)); if(!n->x.child) goto error; return n; error: if(n->chrIdxStart) free(n->chrIdxStart); if(n->baseStart) free(n->baseStart); if(n->chrIdxEnd) free(n->chrIdxEnd); if(n->baseEnd) free(n->baseEnd); if(n->dataOffset) free(n->dataOffset); if(n->x.child) free(n->x.child); free(n); return NULL; } //Returns 0 on success. This doesn't attempt to clean up! static bwRTreeNode_t *addLeaves(bwLL **ll, uint64_t *sz, uint64_t toProcess, uint32_t blockSize) { uint32_t i; uint64_t foo; bwRTreeNode_t *n = makeEmptyNode(blockSize); if(!n) return NULL; if(toProcess <= blockSize) { for(i=0; ichrIdxStart[i] = (*ll)->node->chrIdxStart[0]; n->baseStart[i] = (*ll)->node->baseStart[0]; n->chrIdxEnd[i] = (*ll)->node->chrIdxEnd[(*ll)->node->nChildren-1]; n->baseEnd[i] = (*ll)->node->baseEnd[(*ll)->node->nChildren-1]; n->x.child[i] = (*ll)->node; *sz += 4 + 32*(*ll)->node->nChildren; *ll = (*ll)->next; n->nChildren++; } } else { for(i=0; ix.child[i] = addLeaves(ll, sz, foo, blockSize); if(!n->x.child[i]) goto error; n->chrIdxStart[i] = n->x.child[i]->chrIdxStart[0]; n->baseStart[i] = n->x.child[i]->baseStart[0]; n->chrIdxEnd[i] = n->x.child[i]->chrIdxEnd[n->x.child[i]->nChildren-1]; n->baseEnd[i] = n->x.child[i]->baseEnd[n->x.child[i]->nChildren-1]; n->nChildren++; toProcess -= foo; } } *sz += 4 + 24*n->nChildren; return n; error: bwDestroyIndexNode(n); return NULL; } //Returns 1 on error int writeIndexTreeNode(FILE *fp, bwRTreeNode_t *n, uint8_t *wrote, int level) { uint8_t one = 0; uint32_t i, j, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0 if(n->isLeaf) return 0; for(i=0; inChildren; i++) { if(n->dataOffset[i]) { //traverse into child if(n->isLeaf) return 0; //Only write leaves once! 
if(writeIndexTreeNode(fp, n->x.child[i], wrote, level+1)) return 1; } else { n->dataOffset[i] = ftell(fp); if(fwrite(&(n->x.child[i]->isLeaf), sizeof(uint8_t), 1, fp) != 1) return 1; if(fwrite(&one, sizeof(uint8_t), 1, fp) != 1) return 1; //one byte of padding if(fwrite(&(n->x.child[i]->nChildren), sizeof(uint16_t), 1, fp) != 1) return 1; for(j=0; jx.child[i]->nChildren; j++) { vector[0] = n->x.child[i]->chrIdxStart[j]; vector[1] = n->x.child[i]->baseStart[j]; vector[2] = n->x.child[i]->chrIdxEnd[j]; vector[3] = n->x.child[i]->baseEnd[j]; if(n->x.child[i]->isLeaf) { //Include the offset and size if(fwrite(vector, sizeof(uint32_t), 4, fp) != 4) return 1; if(fwrite(&(n->x.child[i]->dataOffset[j]), sizeof(uint64_t), 1, fp) != 1) return 1; if(fwrite(&(n->x.child[i]->x.size[j]), sizeof(uint64_t), 1, fp) != 1) return 1; } else { if(fwrite(vector, sizeof(uint32_t), 6, fp) != 6) return 1; } } *wrote = 1; } } return 0; } //returns 1 on success int writeIndexOffsets(FILE *fp, bwRTreeNode_t *n, uint64_t offset) { uint32_t i; if(n->isLeaf) return 0; for(i=0; inChildren; i++) { if(writeIndexOffsets(fp, n->x.child[i], n->dataOffset[i])) return 1; if(writeAtPos(&(n->dataOffset[i]), sizeof(uint64_t), 1, offset+20+24*i, fp)) return 2; } return 0; } //Returns 0 on success int writeIndexTree(bigWigFile_t *fp) { uint64_t offset; uint8_t wrote = 0; int rv; while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->idx->root, &wrote, 0)) == 0) { if(!wrote) break; wrote = 0; } if(rv || wrote) return 1; //Save the file position offset = bwTell(fp); //Write the offsets if(writeIndexOffsets(fp->URL->x.fp, fp->idx->root, fp->idx->rootOffset)) return 2; //Move the file pointer back to the end bwSetPos(fp, offset); return 0; } //Returns 0 on success. 
The original state SHOULD be preserved on error int writeIndex(bigWigFile_t *fp) { uint32_t four = IDX_MAGIC; uint64_t idxSize = 0, foo; uint8_t one = 0; uint32_t i, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0 bwLL *ll = fp->writeBuffer->firstIndexNode, *p; bwRTreeNode_t *root = NULL; if(!fp->writeBuffer->nBlocks) return 1; fp->idx = malloc(sizeof(bwRTree_t)); if(!fp->idx) return 2; fp->idx->root = root; //Update the file header to indicate the proper index position foo = bwTell(fp); if(writeAtPos(&foo, sizeof(uint64_t), 1, 0x18, fp->URL->x.fp)) return 3; //Make the tree if(ll == fp->writeBuffer->currentIndexNode) { root = ll->node; idxSize = 4 + 24*root->nChildren; } else { root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize); } if(!root) return 4; fp->idx->root = root; ll = fp->writeBuffer->firstIndexNode; while(ll) { p = ll->next; free(ll); ll=p; } //write the header if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 5; if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6; if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7; if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10; if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11; if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12; four = 1; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13; four = 0; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding fp->idx->rootOffset = bwTell(fp); //Write the root node, since writeIndexTree writes the children and fills in the offset if(fwrite(&(root->isLeaf), 
sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16; if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18; for(i=0; inChildren; i++) { vector[0] = root->chrIdxStart[i]; vector[1] = root->baseStart[i]; vector[2] = root->chrIdxEnd[i]; vector[3] = root->baseEnd[i]; if(root->isLeaf) { //Include the offset and size if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19; if(fwrite(&(root->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20; if(fwrite(&(root->x.size[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21; } else { root->dataOffset[i] = 0; //FIXME: Something upstream is setting this to impossible values (e.g., 0x21?!?!?) if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22; } } //Write each level if(writeIndexTree(fp)) return 23; return 0; } //The first zoom level has a resolution of 4x mean entry size //This may or may not produce the requested number of zoom levels int makeZoomLevels(bigWigFile_t *fp) { uint32_t meanBinSize, i; uint32_t multiplier = 4, zoom = 10; uint16_t nLevels = 0; meanBinSize = ((double) fp->writeBuffer->runningWidthSum)/(fp->writeBuffer->nEntries); //In reality, one level is skipped meanBinSize *= 4; //N.B., we must ALWAYS check that the zoom doesn't overflow a uint32_t! if(((uint32_t)-1)>>2 < meanBinSize) return 0; //No zoom levels! 
if(meanBinSize*4 > zoom) zoom = multiplier*meanBinSize; fp->hdr->zoomHdrs = calloc(1, sizeof(bwZoomHdr_t)); if(!fp->hdr->zoomHdrs) return 1; fp->hdr->zoomHdrs->level = malloc(fp->hdr->nLevels * sizeof(uint32_t)); fp->hdr->zoomHdrs->dataOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t)); fp->hdr->zoomHdrs->indexOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t)); fp->hdr->zoomHdrs->idx = calloc(fp->hdr->nLevels, sizeof(bwRTree_t*)); if(!fp->hdr->zoomHdrs->level) return 2; if(!fp->hdr->zoomHdrs->dataOffset) return 3; if(!fp->hdr->zoomHdrs->indexOffset) return 4; if(!fp->hdr->zoomHdrs->idx) return 5; for(i=0; ihdr->nLevels; i++) { fp->hdr->zoomHdrs->level[i] = zoom; nLevels++; if(((uint32_t)-1)/multiplier < zoom) break; zoom *= multiplier; } fp->hdr->nLevels = nLevels; fp->writeBuffer->firstZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*)); if(!fp->writeBuffer->firstZoomBuffer) goto error; fp->writeBuffer->lastZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*)); if(!fp->writeBuffer->lastZoomBuffer) goto error; fp->writeBuffer->nNodes = calloc(nLevels, sizeof(uint64_t)); for(i=0; ihdr->nLevels; i++) { fp->writeBuffer->firstZoomBuffer[i] = calloc(1, sizeof(bwZoomBuffer_t)); if(!fp->writeBuffer->firstZoomBuffer[i]) goto error; fp->writeBuffer->firstZoomBuffer[i]->p = calloc(fp->hdr->bufSize/32, 32); if(!fp->writeBuffer->firstZoomBuffer[i]->p) goto error; fp->writeBuffer->firstZoomBuffer[i]->m = fp->hdr->bufSize; ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[0] = 0; ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[1] = 0; ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->hdr->zoomHdrs->level[i]; if(fp->hdr->zoomHdrs->level[i] > fp->cl->len[0]) ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->cl->len[0]; fp->writeBuffer->lastZoomBuffer[i] = fp->writeBuffer->firstZoomBuffer[i]; } return 0; error: if(fp->writeBuffer->firstZoomBuffer) { for(i=0; ihdr->nLevels; i++) { if(fp->writeBuffer->firstZoomBuffer[i]) { 
if(fp->writeBuffer->firstZoomBuffer[i]->p) free(fp->writeBuffer->firstZoomBuffer[i]->p); free(fp->writeBuffer->firstZoomBuffer[i]); } } free(fp->writeBuffer->firstZoomBuffer); } if(fp->writeBuffer->lastZoomBuffer) free(fp->writeBuffer->lastZoomBuffer); if(fp->writeBuffer->nNodes) free(fp->writeBuffer->lastZoomBuffer); return 6; } //Given an interval start, calculate the next one at a zoom level void nextPos(bigWigFile_t *fp, uint32_t size, uint32_t *pos, uint32_t desiredTid) { uint32_t *tid = pos; uint32_t *start = pos+1; uint32_t *end = pos+2; *start += size; if(*start >= fp->cl->len[*tid]) { (*start) = 0; (*tid)++; } //prevent needless iteration when changing chromosomes if(*tid < desiredTid) { *tid = desiredTid; *start = 0; } (*end) = *start+size; if(*end > fp->cl->len[*tid]) (*end) = fp->cl->len[*tid]; } //Return the number of bases two intervals overlap uint32_t overlapsInterval(uint32_t tid0, uint32_t start0, uint32_t end0, uint32_t tid1, uint32_t start1, uint32_t end1) { if(tid0 != tid1) return 0; if(end0 <= start1) return 0; if(end1 <= start0) return 0; if(end0 <= end1) { if(start1 > start0) return end0-start1; return end0-start0; } else { if(start1 > start0) return end1-start1; return end1-start0; } } //Returns the number of bases of the interval written uint32_t updateInterval(bigWigFile_t *fp, bwZoomBuffer_t *buffer, double *sum, double *sumsq, uint32_t size, uint32_t tid, uint32_t start, uint32_t end, float value) { uint32_t *p2 = (uint32_t*) buffer->p; float *fp2 = (float*) p2; uint32_t rv = 0, offset = 0; if(!buffer) return 0; if(buffer->l+32 >= buffer->m) return 0; if(buffer->l) { offset = buffer->l/32; } else { p2[0] = tid; p2[1] = start; if(start+size < end) p2[2] = start+size; else p2[2] = end; } //Do we have any overlap with the previously added interval? 
if(offset) { rv = overlapsInterval(p2[8*(offset-1)], p2[8*(offset-1)+1], p2[8*(offset-1)+1] + size, tid, start, end); if(rv) { p2[8*(offset-1)+2] = start + rv; p2[8*(offset-1)+3] += rv; if(fp2[8*(offset-1)+4] > value) fp2[8*(offset-1)+4] = value; if(fp2[8*(offset-1)+5] < value) fp2[8*(offset-1)+5] = value; *sum += rv*value; *sumsq += rv*pow(value, 2.0); return rv; } else { fp2[8*(offset-1)+6] = *sum; fp2[8*(offset-1)+7] = *sumsq; *sum = 0.0; *sumsq = 0.0; } } //If we move to a new interval then skip iterating over a bunch of obviously non-overlapping intervals if(offset && p2[8*offset+2] == 0) { p2[8*offset] = tid; p2[8*offset+1] = start; if(start+size < end) p2[8*offset+2] = start+size; else p2[8*offset+2] = end; //nextPos(fp, size, p2+8*offset, tid); //We can actually skip uncovered intervals } //Add a new entry while(!(rv = overlapsInterval(p2[8*offset], p2[8*offset+1], p2[8*offset+1] + size, tid, start, end))) { p2[8*offset] = tid; p2[8*offset+1] = start; if(start+size < end) p2[8*offset+2] = start+size; else p2[8*offset+2] = end; //nextPos(fp, size, p2+8*offset, tid); } p2[8*offset+3] = rv; fp2[8*offset+4] = value; //min fp2[8*offset+5] = value; //max *sum += rv * value; *sumsq += rv * pow(value,2.0); buffer->l += 32; return rv; } //Returns 0 on success int addIntervalValue(bigWigFile_t *fp, uint64_t *nEntries, double *sum, double *sumsq, bwZoomBuffer_t *buffer, uint32_t itemsPerSlot, uint32_t zoom, uint32_t tid, uint32_t start, uint32_t end, float value) { bwZoomBuffer_t *newBuffer = NULL; uint32_t rv; while(start < end) { rv = updateInterval(fp, buffer, sum, sumsq, zoom, tid, start, end, value); if(!rv) { //Allocate a new buffer newBuffer = calloc(1, sizeof(bwZoomBuffer_t)); if(!newBuffer) return 1; newBuffer->p = calloc(itemsPerSlot, 32); if(!newBuffer->p) goto error; newBuffer->m = itemsPerSlot*32; memcpy(newBuffer->p, buffer->p+buffer->l-32, 4); memcpy(newBuffer->p+4, buffer->p+buffer->l-28, 4); ((uint32_t*) newBuffer->p)[2] = ((uint32_t*) 
newBuffer->p)[1] + zoom; *sum = *sumsq = 0.0; rv = updateInterval(fp, newBuffer, sum, sumsq, zoom, tid, start, end, value); if(!rv) goto error; buffer->next = newBuffer; buffer = buffer->next; *nEntries += 1; } start += rv; } return 0; error: if(newBuffer) { if(newBuffer->m) free(newBuffer->p); free(newBuffer); } return 2; } //Get all of the intervals and add them to the appropriate zoomBuffer int constructZoomLevels(bigWigFile_t *fp) { bwOverlappingIntervals_t *intervals = NULL; double *sum = NULL, *sumsq = NULL; uint32_t i, j, k; sum = calloc(fp->hdr->nLevels, sizeof(double)); sumsq = calloc(fp->hdr->nLevels, sizeof(double)); if(!sum || !sumsq) goto error; for(i=0; icl->nKeys; i++) { intervals = bwGetOverlappingIntervals(fp, fp->cl->chrom[i], 0, fp->cl->len[i]); if(!intervals) goto error; for(j=0; jl; j++) { for(k=0; khdr->nLevels; k++) { if(addIntervalValue(fp, &(fp->writeBuffer->nNodes[k]), sum+k, sumsq+k, fp->writeBuffer->lastZoomBuffer[k], fp->hdr->bufSize/32, fp->hdr->zoomHdrs->level[k], i, intervals->start[j], intervals->end[j], intervals->value[j])) goto error; while(fp->writeBuffer->lastZoomBuffer[k]->next) fp->writeBuffer->lastZoomBuffer[k] = fp->writeBuffer->lastZoomBuffer[k]->next; } } bwDestroyOverlappingIntervals(intervals); } //Make an index for each zoom level for(i=0; ihdr->nLevels; i++) { fp->hdr->zoomHdrs->idx[i] = calloc(1, sizeof(bwRTree_t)); if(!fp->hdr->zoomHdrs->idx[i]) return 1; fp->hdr->zoomHdrs->idx[i]->blockSize = fp->writeBuffer->blockSize; } free(sum); free(sumsq); return 0; error: if(intervals) bwDestroyOverlappingIntervals(intervals); if(sum) free(sum); if(sumsq) free(sumsq); return 1; } int writeZoomLevels(bigWigFile_t *fp) { uint64_t offset1, offset2, idxSize = 0; uint32_t i, j, four = 0, last, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0; uint8_t wrote, one = 0; uint16_t actualNLevels = 0; int rv; bwLL *ll, *p; bwRTreeNode_t *root; bwZoomBuffer_t *zb, *zb2; bwWriteBuffer_t *wb = fp->writeBuffer; uLongf sz; 
for(i=0; ihdr->nLevels; i++) { if(i) { //Is this a duplicate level? if(fp->writeBuffer->nNodes[i] == fp->writeBuffer->nNodes[i-1]) break; } actualNLevels++; //reserve a uint32_t for the number of blocks fp->hdr->zoomHdrs->dataOffset[i] = bwTell(fp); fp->writeBuffer->nBlocks = 0; fp->writeBuffer->l = 24; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1; zb = fp->writeBuffer->firstZoomBuffer[i]; fp->writeBuffer->firstIndexNode = NULL; fp->writeBuffer->currentIndexNode = NULL; while(zb) { sz = fp->hdr->bufSize; if(compress(wb->compressP, &sz, zb->p, zb->l) != Z_OK) return 2; //write the data to disk if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 3; //Add an entry into the index last = (zb->l - 32)>>2; if(addIndexEntry(fp, ((uint32_t*)zb->p)[0], ((uint32_t*)zb->p)[last], ((uint32_t*)zb->p)[1], ((uint32_t*)zb->p)[last+2], bwTell(fp)-sz, sz)) return 4; wb->nBlocks++; wb->l = 24; zb = zb->next; } if(writeAtPos(&(wb->nBlocks), sizeof(uint32_t), 1, fp->hdr->zoomHdrs->dataOffset[i], fp->URL->x.fp)) return 5; //Make the tree ll = fp->writeBuffer->firstIndexNode; if(ll == fp->writeBuffer->currentIndexNode) { root = ll->node; idxSize = 4 + 24*root->nChildren; } else { root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize); } if(!root) return 4; fp->hdr->zoomHdrs->idx[i]->root = root; ll = fp->writeBuffer->firstIndexNode; while(ll) { p = ll->next; free(ll); ll=p; } //write the index wrote = 0; fp->hdr->zoomHdrs->indexOffset[i] = bwTell(fp); four = IDX_MAGIC; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1; root = fp->hdr->zoomHdrs->idx[i]->root; if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6; if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7; if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; 
if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10; if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11; if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12; four = fp->hdr->bufSize/32; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13; four = 0; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding fp->hdr->zoomHdrs->idx[i]->rootOffset = bwTell(fp); //Write the root node, since writeIndexTree writes the children and fills in the offset offset1 = bwTell(fp); if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16; if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18; for(j=0; jnChildren; j++) { vector[0] = root->chrIdxStart[j]; vector[1] = root->baseStart[j]; vector[2] = root->chrIdxEnd[j]; vector[3] = root->baseEnd[j]; if(root->isLeaf) { //Include the offset and size if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19; if(fwrite(&(root->dataOffset[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20; if(fwrite(&(root->x.size[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21; } else { if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22; } } while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->hdr->zoomHdrs->idx[i]->root, &wrote, 0)) == 0) { if(!wrote) break; wrote = 0; } if(rv || wrote) return 6; //Save the file position offset2 = bwTell(fp); //Write the offsets if(writeIndexOffsets(fp->URL->x.fp, root, offset1)) return 2; //Move the file pointer back to the end bwSetPos(fp, offset2); //Free the linked list zb = fp->writeBuffer->firstZoomBuffer[i]; while(zb) { if(zb->p) free(zb->p); zb2 = zb->next; free(zb); zb = zb2; } 
fp->writeBuffer->firstZoomBuffer[i] = NULL; } //Free unused zoom levels for(i=actualNLevels; ihdr->nLevels; i++) { zb = fp->writeBuffer->firstZoomBuffer[i]; while(zb) { if(zb->p) free(zb->p); zb2 = zb->next; free(zb); zb = zb2; } fp->writeBuffer->firstZoomBuffer[i] = NULL; } //Write the zoom headers to disk offset1 = bwTell(fp); if(bwSetPos(fp, 0x40)) return 7; four = 0; for(i=0; ihdr->zoomHdrs->level[i]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; if(fwrite(&(fp->hdr->zoomHdrs->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 10; if(fwrite(&(fp->hdr->zoomHdrs->indexOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 11; } //Write the number of levels if needed if(bwSetPos(fp, 0x6)) return 12; if(fwrite(&actualNLevels, sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 13; if(bwSetPos(fp, offset1)) return 14; return 0; } //0 on success int bwFinalize(bigWigFile_t *fp) { uint32_t four; uint64_t offset; if(!fp->isWrite) return 0; //Flush the buffer if(flushBuffer(fp)) return 1; //Valgrind reports a problem here! //Update the data section with the number of blocks written if(fp->hdr) { if(writeAtPos(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->hdr->dataOffset, fp->URL->x.fp)) return 2; } else { //The header wasn't written! return 1; } //write the bufferSize if(fp->hdr->bufSize) { if(writeAtPos(&(fp->hdr->bufSize), sizeof(uint32_t), 1, 0x34, fp->URL->x.fp)) return 2; } //write the summary information if(writeSummary(fp)) return 3; //Convert the linked-list to a tree and write to disk if(writeIndex(fp)) return 4; //Zoom level stuff here? 
if(fp->hdr->nLevels) { offset = bwTell(fp); if(makeZoomLevels(fp)) return 5; if(constructZoomLevels(fp)) return 6; bwSetPos(fp, offset); if(writeZoomLevels(fp)) return 7; //This write nLevels as well } //write magic at the end of the file four = BIGWIG_MAGIC; if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9; return 0; } /* data chunk: uint64_t number of blocks (2 / 110851) some blocks an uncompressed data block (24 byte header) uint32_t Tid 0-4 uint32_t start 4-8 uint32_t end 8-12 uint32_t step 12-16 uint32_t span 16-20 uint8_t type 20 uint8_t padding uint16_t nItems 22 nItems of: type 1: //12 bytes uint32_t start uint32_t end float value type 2: //8 bytes uint32_t start float value type 3: //4 bytes float value data block index header uint32_t magic uint32_t blockSize (256 in the example) maximum number of children uint64_t number of blocks (2 / 110851) uint32_t startTid uint32_t startPos uint32_t endTid uint32_t endPos uint64_t index size? (0x1E7 / 0x1AF0401F) index address? 
uint32_t itemsPerBlock (1 / 1) 1024 for zoom headers 1024 for zoom headers uint32_t padding data block index node non-leaf (4 bytes + 24*nChildren) uint8_t isLeaf uint8_t padding uint16_t nChildren (2, 256) uint32_t startTid uint32_t startPos uint32_t endTid uint32_t endPos uint64_t dataOffset (0x1AF05853, 0x1AF07057) data block index node leaf (4 bytes + 32*nChildren) uint8_t isLeaf uint8_t padding uint16_t nChildren (2) uint32_t startTid uint32_t startPos uint32_t endTid uint32_t endPos uint64_t dataOffset (0x198, 0x1CF) uint64_t dataSize (55, 24) zoom data block uint32_t number of blocks (10519766) some data blocks */ pyBigWig-0.3.2/libBigWig/io.c0000640000201600010240000002117713015055353016261 0ustar ryanbioinfo00000000000000#include #include #include #include #include #include "io.h" #include size_t GLOBAL_DEFAULTBUFFERSIZE; uint64_t getContentLength(URL_t *URL) { double size; if(curl_easy_getinfo(URL->x.curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &size) != CURLE_OK) { return 0; } if(size== -1.0) return 0; return (uint64_t) size; } //Fill the buffer, note that URL may be left in an unusable state on error! CURLcode urlFetchData(URL_t *URL, unsigned long bufSize) { CURLcode rv; char range[1024]; if(URL->filePos != (size_t) -1) URL->filePos += URL->bufLen; else URL->filePos = 0; URL->bufPos = URL->bufLen = 0; //Otherwise, we can't copy anything into the buffer! sprintf(range,"%lu-%lu", URL->filePos, URL->filePos+bufSize-1); rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range); if(rv != CURLE_OK) { fprintf(stderr, "[urlFetchData] Couldn't set the range (%s)\n", range); return rv; } rv = curl_easy_perform(URL->x.curl); return rv; } //Read data into a buffer, ideally from a buffer already in memory //The loop is likely no longer needed. 
size_t url_fread(void *obuf, size_t obufSize, URL_t *URL) { size_t remaining = obufSize; void *p = obuf; CURLcode rv; while(remaining) { if(!URL->bufLen) { rv = urlFetchData(URL, URL->bufSize); if(rv != CURLE_OK) { fprintf(stderr, "[url_fread] urlFetchData (A) returned %s\n", curl_easy_strerror(rv)); return 0; } } else if(URL->bufLen < URL->bufPos + remaining) { //Copy the remaining buffer and reload the buffer as needed p = memcpy(p, URL->memBuf+URL->bufPos, URL->bufLen - URL->bufPos); if(!p) return 0; p += URL->bufLen - URL->bufPos; remaining -= URL->bufLen - URL->bufPos; if(remaining) { rv = urlFetchData(URL, (remainingbufSize)?remaining:URL->bufSize); if(rv != CURLE_OK) { fprintf(stderr, "[url_fread] urlFetchData (B) returned %s\n", curl_easy_strerror(rv)); return 0; } } } else { p = memcpy(p, URL->memBuf+URL->bufPos, remaining); if(!p) return 0; URL->bufPos += remaining; remaining = 0; } } return obufSize; } //Returns the number of bytes requested or a smaller number on error //Note that in the case of remote files, the actual amount read may be less than the return value! size_t urlRead(URL_t *URL, void *buf, size_t bufSize) { if(URL->type==0) { return fread(buf, bufSize, 1, URL->x.fp)*bufSize; } else { return url_fread(buf, bufSize, URL); } } size_t bwFillBuffer(void *inBuf, size_t l, size_t nmemb, void *pURL) { URL_t *URL = (URL_t*) pURL; void *p = URL->memBuf; size_t copied = l*nmemb; if(!p) return 0; p += URL->bufLen; if(l*nmemb > URL->bufSize - URL->bufPos) { //We received more than we can store! 
copied = URL->bufSize - URL->bufLen; } memcpy(p, inBuf, copied); URL->bufLen += copied; if(!URL->memBuf) return 0; //signal error return copied; } //Seek to an arbitrary location, returning a CURLcode //Note that a local file returns CURLE_OK on success or CURLE_FAILED_INIT on any error; CURLcode urlSeek(URL_t *URL, size_t pos) { char range[1024]; CURLcode rv; if(URL->type == BWG_FILE) { if(fseek(URL->x.fp, pos, SEEK_SET) == 0) { return CURLE_OK; } else { return CURLE_FAILED_INIT; //This is arbitrary } } else { //If the location is covered by the buffer then don't seek! if(pos < URL->filePos || pos >= URL->filePos+URL->bufSize) { URL->filePos = pos; URL->bufLen = 0; //Otherwise, filePos will get incremented on the next read! URL->bufPos = 0; //Maybe this works for FTP? sprintf(range,"%lu-%lu", pos, pos+URL->bufSize-1); rv = curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range); if(rv != CURLE_OK) { fprintf(stderr, "[urlSeek] Couldn't set the range (%s)\n", range); return rv; } rv = curl_easy_perform(URL->x.curl); if(rv != CURLE_OK) { fprintf(stderr, "[urlSeek] curl_easy_perform received an error!\n"); } return rv; } else { URL->bufPos = pos-URL->filePos; return CURLE_OK; } } } URL_t *urlOpen(char *fname, CURLcode (*callBack)(CURL*), const char *mode) { URL_t *URL = calloc(1, sizeof(URL_t)); if(!URL) return NULL; char *url = NULL, *req = NULL; CURLcode code; char range[1024]; URL->fname = fname; if((!mode) || (strchr(mode, 'w') == 0)) { //Set the protocol if(strncmp(fname, "http://", 7) == 0) URL->type = BWG_HTTP; else if(strncmp(fname, "https://", 8) == 0) URL->type = BWG_HTTPS; else if(strncmp(fname, "ftp://", 6) == 0) URL->type = BWG_FTP; else URL->type = BWG_FILE; //local file? 
if(URL->type == BWG_FILE) { URL->filePos = -1; //This signals that nothing has been read URL->x.fp = fopen(fname, "rb"); if(!(URL->x.fp)) { free(URL); fprintf(stderr, "[urlOpen] Couldn't open %s for reading\n", fname); return NULL; } } else { //Remote file, set up the memory buffer and get CURL ready URL->memBuf = malloc(GLOBAL_DEFAULTBUFFERSIZE); if(!(URL->memBuf)) { free(URL); fprintf(stderr, "[urlOpen] Couldn't allocate enough space for the file buffer!\n"); return NULL; } URL->bufSize = GLOBAL_DEFAULTBUFFERSIZE; URL->x.curl = curl_easy_init(); if(!(URL->x.curl)) { fprintf(stderr, "[urlOpen] curl_easy_init() failed!\n"); goto error; } //Follow redirects if(curl_easy_setopt(URL->x.curl, CURLOPT_FOLLOWLOCATION, 1L) != CURLE_OK) { fprintf(stderr, "[urlOpen] Failed instructing curl to follow redirects!\n"); goto error; } //Set the URL if(curl_easy_setopt(URL->x.curl, CURLOPT_URL, fname) != CURLE_OK) { fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_URL!\n"); goto error; } //Set the range, which doesn't do anything for HTTP sprintf(range, "0-%lu", URL->bufSize-1); if(curl_easy_setopt(URL->x.curl, CURLOPT_RANGE, range) != CURLE_OK) { fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_RANGE (%s)!\n", range); goto error; } //Set the callback info, which means we no longer need to directly deal with sockets and header! 
if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEFUNCTION, bwFillBuffer) != CURLE_OK) { fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEFUNCTION!\n"); goto error; } if(curl_easy_setopt(URL->x.curl, CURLOPT_WRITEDATA, (void*)URL) != CURLE_OK) { fprintf(stderr, "[urlOpen] Couldn't set CURLOPT_WRITEDATA!\n"); goto error; } if(callBack) { code = callBack(URL->x.curl); if(code != CURLE_OK) { fprintf(stderr, "[urlOpen] The user-supplied call back function returned an error: %s\n", curl_easy_strerror(code)); goto error; } } code = curl_easy_perform(URL->x.curl); if(code != CURLE_OK) { fprintf(stderr, "[urlOpen] curl_easy_perform received an error: %s\n", curl_easy_strerror(code)); goto error; } } } else { URL->type = BWG_FILE; URL->x.fp = fopen(fname, mode); if(!(URL->x.fp)) { free(URL); fprintf(stderr, "[urlOpen] Couldn't open %s for writing\n", fname); return NULL; } } if(url) free(url); if(req) free(req); return URL; error: if(url) free(url); if(req) free(req); free(URL->memBuf); curl_easy_cleanup(URL->x.curl); free(URL); return NULL; } //Performs the necessary free() operations and handles cleaning up curl void urlClose(URL_t *URL) { if(URL->type == BWG_FILE) { fclose(URL->x.fp); } else { free(URL->memBuf); curl_easy_cleanup(URL->x.curl); } free(URL); } pyBigWig-0.3.2/libBigWig/io.h0000640000201600010240000001014712715313174016265 0ustar ryanbioinfo00000000000000#include /*! \file io.h * These are (typically internal) IO functions, so there's generally no need for you to directly use them! */ /*! * The size of the buffer used for remote files. */ extern size_t GLOBAL_DEFAULTBUFFERSIZE; /*! * The enumerated values that indicate the connection type used to access a file. */ enum bigWigFile_type_enum { BWG_FILE = 0, BWG_HTTP = 1, BWG_HTTPS = 2, BWG_FTP = 3 }; /*! * @brief This structure holds the file pointers and buffers needed for raw access to local and remote files. 
*/ typedef struct { union { CURL *curl; /**>> import pyBigWig ## Open a bigWig or bigBed file This will work if your working directory is the pyBigWig source code directory. >>> bw = pyBigWig.open("test/test.bw") Note that if the file doesn't exist you'll see an error message and `None` will be returned. By default, all files are opened for reading and not writing. You can alter this by passing a mode containing `w`: >>> bw = pyBigWig.open("test/output.bw", "w") Note that a file opened for writing can't be queried for its intervals or statistics; it can *only* be written to. If you open a file for writing then you will next need to add a header (see the section on this below). Local and remote bigBed read access is also supported: >>> bb = pyBigWig.open("https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed") While you can specify a mode for bigBed files, it is ignored. The object returned by `pyBigWig.open()` is the same regardless of whether you're opening a bigWig or bigBed file. ## Determining the file type Since bigWig and bigBed files can both be opened, it may be necessary to determine whether a given `bigWigFile` object points to a bigWig or bigBed file. To that end, one can use the `isBigWig()` and `isBigBed()` functions: >>> bw = pyBigWig.open("test/test.bw") >>> bw.isBigWig() True >>> bw.isBigBed() False ## Access the list of chromosomes and their lengths `bigWigFile` objects contain a dictionary holding the chromosome lengths, which can be accessed with the `chroms()` accessor. >>> bw.chroms() dict_proxy({'1': 195471971L, '10': 130694993L}) You can also directly query a particular chromosome. >>> bw.chroms("1") 195471971L The lengths are stored as the "long" integer type, which is why there's an `L` suffix. If you specify a nonexistent chromosome then nothing is output. >>> bw.chroms("c") >>> ## Print the header It's sometimes useful to print a bigWig's header. 
This is presented here as a python dictionary containing: the version (typically `4`), the number of zoom levels (`nLevels`), the number of bases described (`nBasesCovered`), the minimum value (`minVal`), the maximum value (`maxVal`), the sum of all values (`sumData`), and the sum of all squared values (`sumSquared`). The last two of these are needed for determining the mean and standard deviation. >>> bw.header() {'maxVal': 2L, 'sumData': 272L, 'minVal': 0L, 'version': 4L, 'sumSquared': 500L, 'nLevels': 1L, 'nBasesCovered': 154L} Note that this is also possible for bigBed files and the same dictionary keys will be present. Entries such as `maxVal`, `sumData`, `minVal`, and `sumSquared` are then largely not meaningful. ## Compute summary information on a range bigWig files are used to store values associated with positions and ranges of them. Typically we want to quickly access the average value over a range, which is very simple: >>> bw.stats("1", 0, 3) [0.2000000054637591] Suppose instead of the mean value, we instead wanted the maximum value: >>> bw.stats("1", 0, 3, type="max") [0.30000001192092896] Other options are "min" (the minimum value), "coverage" (the fraction of bases covered), and "std" (the standard deviation of the values). It's often the case that we would instead like to compute values of some number of evenly spaced bins in a given interval, which is also simple: >>> bw.stats("1",99, 200, type="max", nBins=2) [1.399999976158142, 1.5] `nBins` defaults to 1, just as `type` defaults to `mean`. If the start and end positions are omitted then the entire chromosome is used: >>> bw.stats("1") [1.3351851569281683] ### A note on statistics and zoom levels > A note to the lay reader: This section is rather technical and included only for the sake of completeness. The summary is that if your needs require exact mean/max/etc. 
summary values for an interval or intervals and that a small trade-off in speed is acceptable, that you should use the `exact=True` option in the `stats()` function. By default, there are some unintuitive aspects to computing statistics on ranges in a bigWig file. The bigWig format was originally created in the context of genome browsers. There, computing exact summary statistics for a given interval is less important than quickly being able to compute an approximate statistic (after all, browsers need to be able to quickly display a number of contiguous intervals and support scrolling/zooming). Because of this, bigWig files contain not only interval-value associations, but also `sum of values`/`sum of squared values`/`minimum value`/`maximum value`/`number of bases covered` for equally sized bins of various sizes. These different sizes are referred to as "zoom levels". The smallest zoom level has bins that are 16 times the mean interval size in the file and each subsequent zoom level has bins 4 times larger than the previous. This methodology is used in Kent's tools and, therefore, likely used in almost every currently existing bigWig file. When a bigWig file is queried for a summary statistic, the size of the interval is used to determine whether to use a zoom level and, if so, which one. The optimal zoom level is that which has the largest bins no more than half the width of the desired interval. If no such zoom level exists, the original intervals are instead used for the calculation. For the sake of consistency with other tools, pyBigWig adopts this same methodology. However, since this is (A) unintuitive and (B) undesirable in some applications, pyBigWig enables computation of exact summary statistics regardless of the interval size (i.e., it allows ignoring the zoom levels). 
This was originally proposed [here](https://github.com/dpryan79/pyBigWig/issues/12) and an example is below: >>> import pyBigWig >>> from numpy import mean >>> bw = pyBigWig.open("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign75mer.bigWig") >>> bw.stats('chr1', 89294, 91629) [0.20120902053804418] >>> mean(bw.values('chr1', 89294, 91629)) 0.22213841940688142 >>> bw.stats('chr1', 89294, 91629, exact=True) [0.22213841940688142] ## Retrieve values for individual bases in a range While the `stats()` method **can** be used to retrieve the original values for each base (e.g., by setting `nBins` to the number of bases), it's preferable to instead use the `values()` accessor. >>> bw.values("1", 0, 3) [0.10000000149011612, 0.20000000298023224, 0.30000001192092896] The list produced will always contain one value for every base in the range specified. If a particular base has no associated value in the bigWig file then the returned value will be `nan`. >>> bw.values("1", 0, 4) [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, nan] ## Retrieve all intervals in a range Sometimes it's convenient to retrieve all entries overlapping some range. This can be done with the `intervals()` function: >>> bw.intervals("1", 0, 3) ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896)) What's returned is a tuple of tuples, each containing the start position, the end position, and the value. Thus, the example above has values of `0.1`, `0.2`, and `0.3` at positions `0`, `1`, and `2`, respectively. 
If the start and end position are omitted then all intervals on the chromosome specified are returned: >>> bw.intervals("1") ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224), (2, 3, 0.30000001192092896), (100, 150, 1.399999976158142), (150, 151, 1.5)) ## Retrieving bigBed entries As opposed to bigWig files, bigBed files hold entries, which are intervals with an associated string. You can access these entries using the `entries()` function: >>> bb = pyBigWig.open("https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed") >>> bb.entries('chr1', 10000000, 10020000) [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'), (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'), (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')] The output is a list of entry tuples. The tuple elements are the `start` and `end` position of each entry, followed by its associated `string`. The string is returned exactly as it's held in the bigBed file, so parsing it is left to you. To determine what the various fields are in these string, consult the SQL string: >>> bb.SQL() table RnaElements "BED6 + 3 scores for RNA Elements data" ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Normalized score from 0-1000" char[1] strand; "+ or - or . for unknown" float level; "Expression level such as RPKM or FPKM. Set to -1 for no data." float signif; "Statistical significance such as IDR. Set to -1 for no data." uint score2; "Additional measurement/count e.g. number of reads. Set to 0 for no data." ) Note that the first three entries in the SQL string are not part of the string. 
If you only need to know where entries are and not their associated values, you can save memory by additionally specifying `withString=False` in `entries()`: >>> bb.entries('chr1', 10000000, 10020000, withString=False) [(10009333, 10009640), (10014007, 10014289), (10014373, 10024307)] ## Add a header to a bigWig file If you've opened a file for writing then you'll need to give it a header before you can add any entries. The header contains all of the chromosomes, **in order**, and their sizes. If your genome has two chromosomes, chr1 and chr2, of lengths 1 and 1.5 million bases, then the following would add an appropriate header: >>> bw.addHeader([("chr1", 1000000), ("chr2", 1500000)]) bigWig headers are case-sensitive, so `chr1` and `Chr1` are different. Likewise, `1` and `chr1` are not the same, so you can't mix Ensembl and UCSC chromosome names. After adding a header, you can then add entries. By default, up to 10 "zoom levels" are constructed for bigWig files. You can change this default number with the `maxZooms` optional argument. A common use of this is to create a bigWig file that simply holds intervals and no zoom levels: >>> bw.addHeader([("chr1", 1000000), ("chr2", 1500000)], maxZooms=0) ## Adding entries to a bigWig file Assuming you've opened a file for writing and added a header, you can then add entries. Note that the entries **must** be added in order, as bigWig files always contain ordered intervals. There are three formats that bigWig files can use internally to store entries. The most commonly observed format is identical to a [bedGraph](https://genome.ucsc.edu/goldenpath/help/bedgraph.html) file: chr1 0 100 0.0 chr1 100 120 1.0 chr1 125 126 200.0 These entries would be added as follows: >>> bw.addEntries(["chr1", "chr1", "chr1"], [0, 100, 125], ends=[100, 120, 126], values=[0.0, 1.0, 200.0]) Each entry occupies 12 bytes before compression. The second format uses a fixed span, but a variable step size between entries. 
These can be represented in a [wiggle](http://genome.ucsc.edu/goldenpath/help/wiggle.html) file as: variableStep chrom=chr1 span=20 500 -2.0 600 150.0 635 25.0 The above entries describe (1-based) positions 501-520, 601-620 and 636-655. These would be added as follows: >>> bw.addEntries("chr1", [500, 600, 635], values=[-2.0, 150.0, 25.0], span=20) Each entry of this type occupies 8 bytes before compression. The final format uses a fixed step and span for each entry, corresponding to the fixedStep [wiggle format](http://genome.ucsc.edu/goldenpath/help/wiggle.html): fixedStep chrom=chr1 step=30 span=20 -5.0 -20.0 25.0 The above entries describe (1-based) bases 901-920, 931-950 and 961-980 and would be added as follows: >>> bw.addEntries("chr1", 900, values=[-5.0, -20.0, 25.0], span=20, step=30) Each entry of this type occupies 4 bytes before compression. Note that pyBigWig will try to prevent you from adding entries in an incorrect order. This, however, requires additional overhead. Should that not be acceptable, you can simply specify `validate=False` when adding entries: >>> bw.addEntries(["chr1", "chr1", "chr1"], [100, 0, 125], ends=[120, 5, 126], values=[0.0, 1.0, 200.0], validate=False) You're obviously then responsible for ensuring that you **do not** add entries out of order. The resulting files would otherwise largely not be usable. ## Close a bigWig or bigBed file A file can be closed with a simple `bw.close()`, as is commonly done with other file types. For files opened for writing, closing a file writes any buffered entries to disk, constructs and writes the file index, and constructs zoom levels. Consequently, this can take a bit of time. # Numpy As of version 0.3.0, pyBigWig supports input of coordinates using numpy integers and vectors in some functions **if numpy was installed prior to installing pyBigWig**. 
You can determine whether pyBigWig was installed with numpy support by checking the `numpy` accessor: >>> import pyBigWig >>> pyBigWig.numpy 1 If `pyBigWig.numpy` is `1`, then pyBigWig was compiled with numpy support. This means that `addEntries()` can accept numpy coordinates: >>> import pyBigWig >>> import numpy as np >>> bw = pyBigWig.open("/tmp/delete.bw", "w") >>> bw.addHeader([("1", 1000)], maxZooms=0) >>> chroms = np.array(["1"] * 10) >>> starts = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=np.int64) >>> ends = np.array([5, 15, 25, 35, 45, 55, 65, 75, 85, 95], dtype=np.int64) >>> values0 = np.array(np.random.random_sample(10), dtype=np.float64) >>> bw.addEntries(chroms, starts, ends=ends, values=values0) >>> bw.close() Additionally, `values()` can directly output a numpy vector: >>> bw = pyBigWig.open("/tmp/delete.bw") >>> bw.values('1', 0, 10, numpy=True) [ 0.74336642 0.74336642 0.74336642 0.74336642 0.74336642 nan nan nan nan nan] >>> type(bw.values('1', 0, 10, numpy=True)) <type 'numpy.ndarray'> # A note on coordinates Wiggle, bigWig, and bigBed files use 0-based half-open coordinates, which are also used by this extension. So to access the value for the first base on `chr1`, one would specify the starting position as `0` and the end position as `1`. Similarly, bases 100 to 115 would have a start of `99` and an end of `115`. This is simply for the sake of consistency with the underlying bigWig file and may change in the future. # Galaxy pyBigWig is also available as a package in [Galaxy](http://www.usegalaxy.org). You can find it in the toolshed and the [IUC](https://wiki.galaxyproject.org/IUC) is currently hosting the XML definition of this on [github](https://github.com/galaxyproject/tools-iuc/tree/master/packages/package_python_2_7_10_pybigwig_0_2_8). 
pyBigWig-0.3.2/pyBigWig.c0000640000201600010240000014020513016026753015531 0ustar ryanbioinfo00000000000000#include #include #include "pyBigWig.h" #ifdef WITHNUMPY #include #include "numpy/npy_common.h" #include "numpy/halffloat.h" #include "numpy/ndarrayobject.h" int lsize = NPY_SIZEOF_LONG; //Raises an exception on error, which should be checked uint32_t getNumpyU32(PyArrayObject *obj, Py_ssize_t i) { int dtype; char *p; uint32_t o = 0; npy_intp stride; //Get the dtype dtype = PyArray_TYPE(obj); //Get the stride stride = PyArray_STRIDE(obj, 0); p = PyArray_BYTES(obj) + i*stride; switch(dtype) { case NPY_INT8: if(((int8_t *) p)[0] < 0) { PyErr_SetString(PyExc_RuntimeError, "Received an integer < 0!\n"); goto error; } o += ((int8_t *) p)[0]; break; case NPY_INT16: if(((int16_t *) p)[0] < 0) { PyErr_SetString(PyExc_RuntimeError, "Received an integer < 0!\n"); goto error; } o += ((int16_t *) p)[0]; break; case NPY_INT32: if(((int32_t *) p)[0] < 0) { PyErr_SetString(PyExc_RuntimeError, "Received an integer < 0!\n"); goto error; } o += ((int32_t *) p)[0]; break; case NPY_INT64: if(((int64_t *) p)[0] < 0) { PyErr_SetString(PyExc_RuntimeError, "Received an integer < 0!\n"); goto error; } o += ((int64_t *) p)[0]; break; case NPY_UINT8: o += ((uint8_t *) p)[0]; break; case NPY_UINT16: o += ((uint16_t *) p)[0]; break; case NPY_UINT32: o += ((uint32_t *) p)[0]; break; case NPY_UINT64: if(((uint64_t *) p)[0] > (uint32_t) -1) { PyErr_SetString(PyExc_RuntimeError, "Received an integer larger than possible for a 32bit unsigned integer!\n"); goto error; } o += ((uint64_t *) p)[0]; break; default: PyErr_SetString(PyExc_RuntimeError, "Received unknown data type for conversion to uint32_t!\n"); goto error; break; } return o; error: return 0; }; //Raises an exception on error, which should be checked float getNumpyF(PyArrayObject *obj, Py_ssize_t i) { int dtype; char *p; float o = 0.0; npy_intp stride; //Get the dtype dtype = PyArray_TYPE(obj); //Get the stride stride = 
PyArray_STRIDE(obj, 0); p = PyArray_BYTES(obj) + i*stride; switch(dtype) { case NPY_FLOAT16: return npy_half_to_float(((npy_half*)p)[0]); case NPY_FLOAT32: return ((float*)p)[0]; case NPY_FLOAT64: if(((double*)p)[0] > FLT_MAX) { PyErr_SetString(PyExc_RuntimeError, "Received a floating point value greater than possible for a 32-bit float!\n"); goto error; } if(((double*)p)[0] < -FLT_MAX) { PyErr_SetString(PyExc_RuntimeError, "Received a floating point value less than possible for a 32-bit float!\n"); goto error; } o += ((double*)p)[0]; return o; default: PyErr_SetString(PyExc_RuntimeError, "Received unknown data type for conversion to float!\n"); goto error; break; } return o; error: return 0; } char *getNumpyStr(PyArrayObject *obj, Py_ssize_t i) { char *p , *o = NULL; npy_intp stride, j; int dtype; //Get the dtype dtype = PyArray_TYPE(obj); //Get the stride stride = PyArray_STRIDE(obj, 0); p = PyArray_BYTES(obj) + i*stride; switch(dtype) { case NPY_STRING: o = calloc(1, stride + 1); strncpy(o, p, stride); return o; case NPY_UNICODE: o = calloc(1, stride/4 + 1); for(j=0; jcl) goto error; } pybw = PyObject_New(pyBigWigFile_t, &bigWigFile); if(!pybw) goto error; pybw->bw = bw; pybw->lastTid = -1; pybw->lastType = -1; pybw->lastSpan = (uint32_t) -1; pybw->lastStep = (uint32_t) -1; pybw->lastStart = (uint32_t) -1; return (PyObject*) pybw; error: if(bw) bwClose(bw); PyErr_SetString(PyExc_RuntimeError, "Received an error during file opening!"); return NULL; } static void pyBwDealloc(pyBigWigFile_t *self) { if(self->bw) bwClose(self->bw); PyObject_DEL(self); } static PyObject *pyBwClose(pyBigWigFile_t *self, PyObject *args) { bwClose(self->bw); self->bw = NULL; Py_INCREF(Py_None); return Py_None; } //Accessor for the header (version, nLevels, nBasesCovered, minVal, maxVal, sumData, sumSquared static PyObject *pyBwGetHeader(pyBigWigFile_t *self, PyObject *args) { bigWigFile_t *bw = self->bw; PyObject *ret, *val; ret = PyDict_New(); val = 
PyLong_FromUnsignedLong(bw->hdr->version); if(PyDict_SetItemString(ret, "version", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromUnsignedLong(bw->hdr->nLevels); if(PyDict_SetItemString(ret, "nLevels", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromUnsignedLongLong(bw->hdr->nBasesCovered); if(PyDict_SetItemString(ret, "nBasesCovered", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromDouble(bw->hdr->minVal); if(PyDict_SetItemString(ret, "minVal", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromDouble(bw->hdr->maxVal); if(PyDict_SetItemString(ret, "maxVal", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromDouble(bw->hdr->sumData); if(PyDict_SetItemString(ret, "sumData", val) == -1) goto error; Py_DECREF(val); val = PyLong_FromDouble(bw->hdr->sumSquared); if(PyDict_SetItemString(ret, "sumSquared", val) == -1) goto error; Py_DECREF(val); return ret; error : Py_XDECREF(val); Py_XDECREF(ret); PyErr_SetString(PyExc_RuntimeError, "Received an error while getting the bigWig header!"); return NULL; } //Accessor for the chroms, args is optional static PyObject *pyBwGetChroms(pyBigWigFile_t *self, PyObject *args) { PyObject *ret = NULL, *val; bigWigFile_t *bw = self->bw; char *chrom = NULL; uint32_t i; if(!(PyArg_ParseTuple(args, "|s", &chrom)) || !chrom) { ret = PyDict_New(); for(i=0; icl->nKeys; i++) { val = PyLong_FromUnsignedLong(bw->cl->len[i]); if(PyDict_SetItemString(ret, bw->cl->chrom[i], val) == -1) goto error; Py_DECREF(val); } } else { for(i=0; icl->nKeys; i++) { if(strcmp(bw->cl->chrom[i],chrom) == 0) { ret = PyLong_FromUnsignedLong(bw->cl->len[i]); break; } } } if(!ret) { Py_INCREF(Py_None); ret = Py_None; } return ret; error : Py_XDECREF(val); Py_XDECREF(ret); PyErr_SetString(PyExc_RuntimeError, "Received an error while adding an item to the output dictionary!"); return NULL; } enum bwStatsType char2enum(char *s) { if(strcmp(s, "mean") == 0) return mean; if(strcmp(s, "std") == 0) return stdev; if(strcmp(s, "dev") == 0) 
return dev; if(strcmp(s, "max") == 0) return max; if(strcmp(s, "min") == 0) return min; if(strcmp(s, "cov") == 0) return cov; if(strcmp(s, "coverage") == 0) return cov; return -1; }; //Fetch summary statistics, default is the mean of the entire chromosome. static PyObject *pyBwGetStats(pyBigWigFile_t *self, PyObject *args, PyObject *kwds) { bigWigFile_t *bw = self->bw; double *val; uint32_t start, end = -1, tid; unsigned long startl = 0, endl = -1; static char *kwd_list[] = {"chrom", "start", "end", "type", "nBins", "exact", NULL}; char *chrom, *type = "mean"; PyObject *ret, *exact = Py_False; int i, nBins = 1; errno = 0; //In the off-chance that something elsewhere got an error and didn't clear it... if(bw->type == 1) { PyErr_SetString(PyExc_RuntimeError, "bigBed files have no statistics!"); return NULL; } if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|kksiO", kwd_list, &chrom, &startl, &endl, &type, &nBins, &exact)) { PyErr_SetString(PyExc_RuntimeError, "You must supply at least a chromosome!"); return NULL; } //Check inputs, reset to defaults if nothing was input if(!nBins) nBins = 1; //For some reason, not specifying this overrides the default! 
if(!type) type = "mean"; tid = bwGetTid(bw, chrom); if(endl == (unsigned long) -1 && tid != (uint32_t) -1) endl = bw->cl->len[tid]; if(tid == (uint32_t) -1 || startl > end || endl > end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } start = (uint32_t) startl; end = (uint32_t) endl; if(end <= start || end > bw->cl->len[tid] || start >= end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } if(char2enum(type) == doesNotExist) { PyErr_SetString(PyExc_RuntimeError, "Invalid type!"); return NULL; } //Get the actual statistics if(exact == Py_True) { val = bwStatsFromFull(bw, chrom, start, end, nBins, char2enum(type)); } else { val = bwStats(bw, chrom, start, end, nBins, char2enum(type)); } if(!val) { PyErr_SetString(PyExc_RuntimeError, "An error was encountered while fetching statistics."); return NULL; } ret = PyList_New(nBins); for(i=0; ibw; int i; uint32_t start, end = -1, tid; unsigned long startl, endl; char *chrom; PyObject *ret; bwOverlappingIntervals_t *o; if(bw->type == 1) { PyErr_SetString(PyExc_RuntimeError, "bigBed files have no values! 
Use 'entries' instead."); return NULL; } #ifdef WITHNUMPY static char *kwd_list[] = {"chrom", "start", "end", "numpy", NULL}; PyObject *outputNumpy = Py_False; if(!PyArg_ParseTupleAndKeywords(args, kwds, "skk|O", kwd_list, &chrom, &startl, &endl, &outputNumpy)) { #else if(!PyArg_ParseTuple(args, "skk", &chrom, &startl, &endl)) { #endif PyErr_SetString(PyExc_RuntimeError, "You must supply a chromosome, start and end position.\n"); return NULL; } tid = bwGetTid(bw, chrom); if(endl == (unsigned long) -1 && tid != (uint32_t) -1) endl = bw->cl->len[tid]; if(tid == (uint32_t) -1 || startl > end || endl > end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } start = (uint32_t) startl; end = (uint32_t) endl; if(end <= start || end > bw->cl->len[tid] || start >= end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } o = bwGetValues(self->bw, chrom, start, end, 1); if(!o) { PyErr_SetString(PyExc_RuntimeError, "An error occurred while fetching values!"); return NULL; } #ifdef WITHNUMPY if(outputNumpy == Py_True) { npy_intp len = end - start; ret = PyArray_SimpleNewFromData(1, &len, NPY_FLOAT, (void *) o->value); //This will break if numpy ever stops using malloc! PyArray_ENABLEFLAGS((PyArrayObject*) ret, NPY_ARRAY_OWNDATA); free(o->start); free(o->end); free(o); } else { #endif ret = PyList_New(end-start); for(i=0; i<(int) o->l; i++) PyList_SetItem(ret, i, PyFloat_FromDouble(o->value[i])); bwDestroyOverlappingIntervals(o); #ifdef WITHNUMPY } #endif return ret; } static PyObject *pyBwGetIntervals(pyBigWigFile_t *self, PyObject *args, PyObject *kwds) { bigWigFile_t *bw = self->bw; uint32_t start, end = -1, tid, i; unsigned long startl = 0, endl = -1; static char *kwd_list[] = {"chrom", "start", "end", NULL}; bwOverlappingIntervals_t *intervals = NULL; char *chrom; PyObject *ret; if(bw->type == 1) { PyErr_SetString(PyExc_RuntimeError, "bigBed files have no intervals! 
Use 'entries()' instead."); return NULL; } if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|kk", kwd_list, &chrom, &startl, &endl)) { PyErr_SetString(PyExc_RuntimeError, "You must supply at least a chromosome.\n"); return NULL; } //Sanity check tid = bwGetTid(bw, chrom); if(endl == (unsigned long) -1 && tid != (uint32_t) -1) endl = bw->cl->len[tid]; if(tid == (uint32_t) -1 || startl > end || endl > end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } start = (uint32_t) startl; end = (uint32_t) endl; if(end <= start || end > bw->cl->len[tid] || start >= end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } //Get the intervals intervals = bwGetOverlappingIntervals(bw, chrom, start, end); if(!intervals) { PyErr_SetString(PyExc_RuntimeError, "An error occurred while fetching the overlapping intervals!"); return NULL; } if(!intervals->l) { Py_INCREF(Py_None); return Py_None; } ret = PyTuple_New(intervals->l); for(i=0; il; i++) { if(PyTuple_SetItem(ret, i, Py_BuildValue("(iif)", intervals->start[i], intervals->end[i], intervals->value[i]))) { Py_DECREF(ret); bwDestroyOverlappingIntervals(intervals); PyErr_SetString(PyExc_RuntimeError, "An error occurred while constructing the output tuple!"); return NULL; } } bwDestroyOverlappingIntervals(intervals); return ret; } #if PY_MAJOR_VERSION >= 3 //Return 1 iff obj is a ready unicode type int PyString_Check(PyObject *obj) { if(PyUnicode_Check(obj)) { return PyUnicode_READY(obj)+1; } return 0; } //I don't know what happens if PyBytes_AsString(NULL) is used... 
//NOTE(review): the text from here on looks damaged rather than hand-written. Newlines are
//collapsed, so each "//" comment and "#if"/"#endif" now swallows the code that follows it, and
//every span from a '<' followed by a letter up to the next '>' appears to be missing, e.g.
//"for(i=0; i 0xFFFFFFFF)" in pyBwAddHeader and "for(i=0; ibw;" where isType0 runs straight into
//the body of canAppend. Presumably an extraction artefact; restore from the upstream pyBigWig
//sources before building or restyling. The code below is left byte-for-byte untouched.
//
//What the surviving text shows:
// PyString_AsString (Python 3 shim): returns PyBytes_AsString() of PyUnicode_AsASCIIString(obj).
//   NOTE(review): the object returned by PyUnicode_AsASCIIString() is never released here, and
//   a NULL from it would be handed straight to PyBytes_AsString(); confirm both.
// isNumeric: 1 for numpy integer scalars (WITHNUMPY) and Python 2 ints, else PyLong_Check(obj).
// Numeric2Uint: converts obj to uint32_t. Only l > 0xFFFFFFFF sets the RuntimeError and returns
//   (uint32_t) -1. Negative values (including PyLong_AsLong()'s -1 error return) are simply cast,
//   and the Python 2 PyInt branch returns without a bounds check.
// pyBwAddHeader: parses (cl, maxZooms), requires a non-empty list, callocs `lengths` and `chroms`,
//   then calls bwCreateHdr, bwCreateChromList and bwWriteHdr. Returns None, or NULL with a
//   RuntimeError set. Both arrays are freed on the success path and at the error label.
//   NOTE(review): format "O|k" writes an unsigned long into `long zoomTmp`, which is then
//   narrowed into int32_t maxZooms without a range check.
// isType0: returns 0 unless chroms/starts/ends/values are each a list (or a numpy array under
//   WITHNUMPY) and their sizes match that of chroms. The per-element checks are lost.
// canAppend: returns 0 if nothing was written yet (lastType/lastTid == -1) or the type differs.
//   Type 0 and 1 need the same tid and a first start >= lastStart (type 1 also an equal span).
//   Type 2 needs the same tid, span and step, and a start equal to lastStart.
//   NOTE(review): the type 2 branch calls PyString_AsString(chroms) without the PyString_Check
//   that the type 1 branch performs.
// PyAddIntervals / PyAppendIntervals: n is taken from `starts`; the C arrays are calloc'd with n
//   elements and freed on every path; 1 is returned via the error label. The surviving tails
//   record lastTid = bwGetTid(bw, cchroms[n-1]) (Add only) and lastStart = uends[n-1]. The
//   conversion loops and the call that sets `rv` are lost.
char *PyString_AsString(PyObject *obj) { return PyBytes_AsString(PyUnicode_AsASCIIString(obj)); } #endif //Will return 1 for long or int types currently int isNumeric(PyObject *obj) { #ifdef WITHNUMPY if(PyArray_IsScalar(obj, Integer)) return 1; #endif #if PY_MAJOR_VERSION < 3 if(PyInt_Check(obj)) return 1; #endif return PyLong_Check(obj); } //On error, throws a runtime error, so use PyErr_Occurred() after this uint32_t Numeric2Uint(PyObject *obj) { long l; #if PY_MAJOR_VERSION < 3 if(PyInt_Check(obj)) { return (uint32_t) PyInt_AsLong(obj); } #endif l = PyLong_AsLong(obj); //Check bounds if(l > 0xFFFFFFFF) { PyErr_SetString(PyExc_RuntimeError, "Length out of bounds for a bigWig file!"); return (uint32_t) -1; } return (uint32_t) l; } //This runs bwCreateHdr, bwCreateChromList, and bwWriteHdr PyObject *pyBwAddHeader(pyBigWigFile_t *self, PyObject *args, PyObject *kwds) { bigWigFile_t *bw = self->bw; char **chroms = NULL; int64_t n; uint32_t *lengths = NULL, len; int32_t maxZooms = 10; long zoomTmp = 10; static char *kwd_list[] = {"cl", "maxZooms", NULL}; PyObject *InputTuple = NULL, *tmpObject, *tmpObject2; Py_ssize_t i, pyLen; if(!PyArg_ParseTupleAndKeywords(args, kwds, "O|k", kwd_list, &InputTuple, &zoomTmp)) { PyErr_SetString(PyExc_RuntimeError, "Illegal arguments"); return NULL; } maxZooms = zoomTmp; //Ensure that we received a list if(!PyList_Check(InputTuple)) { PyErr_SetString(PyExc_RuntimeError, "You MUST input a list of tuples (e.g., [('chr1', 1000), ('chr2', 2000)]!"); goto error; } pyLen = PyList_Size(InputTuple); if(pyLen < 1) { PyErr_SetString(PyExc_RuntimeError, "You input an empty list!"); goto error; } n = pyLen; lengths = calloc(n, sizeof(uint32_t)); chroms = calloc(n, sizeof(char*)); if(!lengths || !chroms) { PyErr_SetString(PyExc_RuntimeError, "Couldn't allocate lengths or chroms!"); goto error; } //Convert the tuple into something more useful in C for(i=0; i 0xFFFFFFFF) { PyErr_SetString(PyExc_RuntimeError, "A requested length is longer than what 
can be stored in a bigWig file!"); goto error; } lengths[i] = len; } //Create the header if(bwCreateHdr(bw, maxZooms)) { PyErr_SetString(PyExc_RuntimeError, "Received an error in bwCreateHdr"); goto error; } //Create the chromosome list bw->cl = bwCreateChromList(chroms, lengths, n); if(!bw->cl) { PyErr_SetString(PyExc_RuntimeError, "Received an error in bwCreateChromList"); goto error; } //Write the header if(bwWriteHdr(bw)) { PyErr_SetString(PyExc_RuntimeError, "Received an error while writing the bigWig header"); goto error; } if(lengths) free(lengths); if(chroms) free(chroms); Py_INCREF(Py_None); return Py_None; error: if(lengths) free(lengths); if(chroms) free(chroms); return NULL; } //1 on true, 0 on false int isType0(PyObject *chroms, PyObject *starts, PyObject *ends, PyObject *values) { int rv = 0; Py_ssize_t i, sz = 0; PyObject *tmp; if(!PyList_Check(chroms) #ifdef WITHNUMPY && !PyArray_Check(chroms) #endif ) return rv; if(!PyList_Check(starts) #ifdef WITHNUMPY && !PyArray_Check(starts) #endif ) return rv; if(!PyList_Check(ends) #ifdef WITHNUMPY && !PyArray_Check(ends) #endif ) return rv; if(!PyList_Check(values) #ifdef WITHNUMPY && !PyArray_Check(values) #endif ) return rv; if(PyList_Check(chroms)) sz = PyList_Size(chroms); #ifdef WITHNUMPY if(PyArray_Check(chroms)) sz += PyArray_Size(chroms); #endif if(PyList_Check(starts)) { if(sz != PyList_Size(starts)) return rv; #ifdef WITHNUMPY } else { if(sz != PyArray_Size(starts)) return rv; #endif } if(PyList_Check(ends)) { if(sz != PyList_Size(ends)) return rv; #ifdef WITHNUMPY } else { if(sz != PyArray_Size(ends)) return rv; #endif } if(PyList_Check(values)) { if(sz != PyList_Size(values)) return rv; #ifdef WITHNUMPY } else { if(sz != PyArray_Size(values)) return rv; #endif } //Ensure chroms contains strings, etc. 
if(PyList_Check(chroms)) { for(i=0; ibw; Py_ssize_t i, sz = 0; uint32_t tid, uspan, ustep, ustart; PyObject *tmp; #ifdef WITHNUMPY void *foo; #endif if(self->lastType == -1) return 0; if(self->lastTid == -1) return 0; if(self->lastType != desiredType) return 0; //We can only append if (A) we have the same type or (B) the same chromosome (and compatible span/step/starts if(desiredType == 0) { //We need (A) chrom == lastTid and (B) all chroms to be the same if(PyList_Check(chroms)) sz = PyList_Size(chroms); #ifdef WITHNUMPY if(PyArray_Check(chroms)) sz = PyArray_Size(chroms); #endif for(i=0; ilastTid) return 0; } #ifdef WITHNUMPY if(PyArray_Check(starts)) { ustart = getNumpyU32((PyArrayObject*)starts, 0); } else { #endif ustart = Numeric2Uint(PyList_GetItem(starts, 0)); #ifdef WITHNUMPY } #endif if(PyErr_Occurred()) return 0; if(ustart < self->lastStart) return 0; return 1; } else if(desiredType == 1) { //We need (A) chrom == lastTid, (B) all chroms to be the same, and (C) equal spans uspan = Numeric2Uint(span); if(PyErr_Occurred()) return 0; if(uspan != self->lastSpan) return 0; if(!PyString_Check(chroms)) return 0; tid = bwGetTid(bw, PyString_AsString(chroms)); if(tid != (uint32_t) self->lastTid) return 0; #ifdef WITHNUMPY if(PyList_Check(starts)) ustart = Numeric2Uint(PyList_GetItem(starts, 0)); else ustart = getNumpyU32((PyArrayObject*) starts, 0); #else ustart = Numeric2Uint(PyList_GetItem(starts, 0)); #endif if(PyErr_Occurred()) return 0; if(ustart < self->lastStart) return 0; return 1; } else if(desiredType == 2) { //We need (A) chrom == lastTid, (B) span/step to be equal and (C) compatible starts tid = bwGetTid(bw, PyString_AsString(chroms)); if(tid != (uint32_t) self->lastTid) return 0; uspan = Numeric2Uint(span); if(PyErr_Occurred()) return 0; if(uspan != self->lastSpan) return 0; ustep = Numeric2Uint(step); if(PyErr_Occurred()) return 0; if(ustep != self->lastStep) return 0; //But is the start position compatible? 
ustart = Numeric2Uint(starts); if(PyErr_Occurred()) return 0; if(ustart != self->lastStart) return 0; return 1; } return 0; } //Returns 0 on success, 1 on error. Sets self->lastTid && self->lastStart (unless there was an error) int PyAddIntervals(pyBigWigFile_t *self, PyObject *chroms, PyObject *starts, PyObject *ends, PyObject *values) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; char **cchroms = NULL; uint32_t n, *ustarts = NULL, *uends = NULL; float *fvalues = NULL; int rv; #ifdef WITHNUMPY void *foo; #endif if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif n = (uint32_t) sz; //Allocate space cchroms = calloc(n, sizeof(char*)); ustarts = calloc(n, sizeof(uint32_t)); uends = calloc(n, sizeof(uint32_t)); fvalues = calloc(n, sizeof(float)); if(!cchroms || !ustarts || !uends || !fvalues) goto error; for(i=0; ilastTid = bwGetTid(bw, cchroms[n-1]); self->lastStart = uends[n-1]; } free(cchroms); free(ustarts); free(uends); free(fvalues); return rv; error: if(cchroms) free(cchroms); if(ustarts) free(ustarts); if(uends) free(uends); if(fvalues) free(fvalues); return 1; } //Returns 0 on success, 1 on error. Update self->lastStart int PyAppendIntervals(pyBigWigFile_t *self, PyObject *starts, PyObject *ends, PyObject *values) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; uint32_t n, *ustarts = NULL, *uends = NULL; float *fvalues = NULL; int rv; if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif n = (uint32_t) sz; //Allocate space ustarts = calloc(n, sizeof(uint32_t)); uends = calloc(n, sizeof(uint32_t)); fvalues = calloc(n, sizeof(float)); if(!ustarts || !uends || !fvalues) goto error; for(i=0; ilastStart = uends[n-1]; free(ustarts); free(uends); free(fvalues); return rv; error: if(ustarts) free(ustarts); if(uends) free(uends); if(fvalues) free(fvalues); return 1; } //Returns 0 on success, 1 on error. 
//NOTE(review): this text looks damaged rather than hand-written. Newlines are collapsed, and in
//both functions below everything after "for(i=0; i" up to the next "->" is missing
//("for(i=0; ilastTid", "for(i=0; ilastStart"), so the loops that fill ustarts/fvalues and the
//call that assigns `rv` cannot be seen. Presumably an extraction artefact; restore from the
//upstream pyBigWig sources before building. The code is left byte-for-byte untouched.
//
// PyAddIntervalSpans: n is the size of `starts` (a list, or a numpy array under WITHNUMPY).
//   ustarts and fvalues are calloc'd with n elements and freed on both exits. span is read with
//   PyLong_AsLong and chroms with PyString_AsString; neither result is checked in the visible
//   text. The surviving tail records lastTid = bwGetTid(bw, cchroms), lastSpan = uspan and
//   lastStart = ustarts[n-1]+uspan. Returns rv normally, 1 via the error label.
// PyAppendIntervalSpans: same allocation pattern without chrom/span. The surviving tail sets
//   lastStart = ustarts[n-1] + self->lastSpan.
//   NOTE(review): nothing visible rejects n == 0 before ustarts[n-1] is read; confirm against
//   the restored source.
Sets self->lastTid/lastStart/lastSpan (unless there was an error) int PyAddIntervalSpans(pyBigWigFile_t *self, PyObject *chroms, PyObject *starts, PyObject *values, PyObject *span) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; char *cchroms = NULL; uint32_t n, *ustarts = NULL, uspan; float *fvalues = NULL; int rv; if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY else if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif n = (uint32_t) sz; //Allocate space ustarts = calloc(n, sizeof(uint32_t)); fvalues = calloc(n, sizeof(float)); if(!ustarts || !fvalues) goto error; uspan = (uint32_t) PyLong_AsLong(span); cchroms = PyString_AsString(chroms); if(PyList_Check(starts)) { for(i=0; ilastTid = bwGetTid(bw, cchroms); self->lastSpan = uspan; self->lastStart = ustarts[n-1]+uspan; } free(ustarts); free(fvalues); return rv; error: if(ustarts) free(ustarts); if(fvalues) free(fvalues); return 1; } //Returns 0 on success, 1 on error. Updates self->lastStart int PyAppendIntervalSpans(pyBigWigFile_t *self, PyObject *starts, PyObject *values) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; uint32_t n, *ustarts = NULL; float *fvalues = NULL; int rv; if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY else if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif n = (uint32_t) sz; //Allocate space ustarts = calloc(n, sizeof(uint32_t)); fvalues = calloc(n, sizeof(float)); if(!ustarts || !fvalues) goto error; if(PyList_Check(starts)) { for(i=0; ilastStart = ustarts[n-1] + self->lastSpan; free(ustarts); free(fvalues); return rv; error: if(ustarts) free(ustarts); if(fvalues) free(fvalues); return 1; } //Returns 0 on success, 1 on error. 
//NOTE(review): this text looks damaged rather than hand-written. Newlines are collapsed, and in
//both functions below everything after "for(i=0; i" up to the next "->" is missing
//("for(i=0; ilastTid", "for(i=0; ilastStart"), so the loop that fills fvalues and the call that
//assigns `rv` cannot be seen. Presumably an extraction artefact; restore from the upstream
//pyBigWig sources before building. The code is left byte-for-byte untouched.
//
// PyAddIntervalSpanSteps: n is the size of `values` (a list, or a numpy array under WITHNUMPY).
//   fvalues is calloc'd with n elements and freed on both exits. span, step and starts are each
//   read with PyLong_AsLong and cast to uint32_t, and chroms goes through PyString_AsString;
//   none of these results is checked in the visible text. The surviving tail records lastTid,
//   lastSpan, lastStep and lastStart = ustarts + ustep*n. Returns rv normally, 1 via the error
//   label.
// PyAppendIntervalSpanSteps: same allocation pattern for `values` only. The surviving tail
//   advances lastStart by self->lastStep * n.
Sets self->lastTid/self->lastSpan/lastStep/lastStart (unless there was an error) int PyAddIntervalSpanSteps(pyBigWigFile_t *self, PyObject *chroms, PyObject *starts, PyObject *values, PyObject *span, PyObject *step) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; char *cchrom = NULL; uint32_t n, ustarts, uspan, ustep; float *fvalues = NULL; int rv; if(PyList_Check(values)) sz = PyList_Size(values); #ifdef WITHNUMPY else if(PyArray_Check(values)) sz += PyArray_Size(values); #endif n = (uint32_t) sz; //Allocate space fvalues = calloc(n, sizeof(float)); if(!fvalues) goto error; uspan = (uint32_t) PyLong_AsLong(span); ustep = (uint32_t) PyLong_AsLong(step); ustarts = (uint32_t) PyLong_AsLong(starts); cchrom = PyString_AsString(chroms); if(PyList_Check(values)) { for(i=0; ilastTid = bwGetTid(bw, cchrom); self->lastSpan = uspan; self->lastStep = ustep; self->lastStart = ustarts + ustep*n; } free(fvalues); return rv; error: if(fvalues) free(fvalues); return 1; } //Returns 0 on success, 1 on error. Sets self->lastStart int PyAppendIntervalSpanSteps(pyBigWigFile_t *self, PyObject *values) { bigWigFile_t *bw = self->bw; Py_ssize_t i, sz = 0; uint32_t n; float *fvalues = NULL; int rv; if(PyList_Check(values)) sz = PyList_Size(values); #ifdef WITHNUMPY else if(PyArray_Check(values)) sz += PyArray_Size(values); #endif n = (uint32_t) sz; //Allocate space fvalues = calloc(n, sizeof(float)); if(!fvalues) goto error; if(PyList_Check(values)) { for(i=0; ilastStart += self->lastStep * n; free(fvalues); return rv; error: if(fvalues) free(fvalues); return 1; } //Checks and ensures that (A) the entries are sorted correctly and don't overlap and (B) that the come after things that have already been added. 
//Returns 1 on correct input, 0 on incorrect input int addEntriesInputOK(pyBigWigFile_t *self, PyObject *chroms, PyObject *starts, PyObject *ends, PyObject *span, PyObject *step, int type) { uint32_t lastTid = self->lastTid; uint32_t lastEnd = self->lastStart; uint32_t cTid, ustart, uend, uspan, ustep; Py_ssize_t i, sz = 0; PyObject *tmp; #ifdef WITHNUMPY char *tmpStr; #endif if(type == 0) { //Each chrom:start-end needs to be properly formed and come after prior entries if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif if(sz == 0) return 0; for(i=0; ibw, tmpStr); free(tmpStr); } else { #endif tmp = PyList_GetItem(chroms, i); cTid = bwGetTid(self->bw, PyString_AsString(tmp)); #ifdef WITHNUMPY } #endif if(PyErr_Occurred()) return 0; if(cTid == (uint32_t) -1) return 0; #ifdef WITHNUMPY if(PyArray_Check(starts)) { ustart = getNumpyU32((PyArrayObject*)starts, i); } else { #endif ustart = Numeric2Uint(PyList_GetItem(starts, i)); #ifdef WITHNUMPY } #endif if(PyErr_Occurred()) return 0; #ifdef WITHNUMPY if(PyArray_Check(ends)) { uend = getNumpyU32((PyArrayObject*) ends, i); } else { #endif uend = Numeric2Uint(PyList_GetItem(ends, i)); #ifdef WITHNUMPY } #endif if(PyErr_Occurred()) return 0; if(ustart >= uend) return 0; if(lastTid != (uint32_t) -1) { if(lastTid > cTid) return 0; if(lastTid == cTid) { if(ustart < lastEnd) return 0; } } lastTid = cTid; lastEnd = uend; } return 1; } else if(type == 1) { //each chrom:start-(start+span) needs to be properly formed and come after prior entries if(!PyList_Check(starts) #ifdef WITHNUMPY && !PyArray_Check(starts) #endif ) return 0; if(PyList_Check(starts)) sz = PyList_Size(starts); #ifdef WITHNUMPY else if(PyArray_Check(starts)) sz += PyArray_Size(starts); #endif uspan = Numeric2Uint(span); if(PyErr_Occurred()) return 0; if(uspan < 1) return 0; if(sz == 0) return 0; cTid = bwGetTid(self->bw, PyString_AsString(chroms)); if(cTid == (uint32_t) -1) return 0; 
if(lastTid != (uint32_t) -1) { if(lastTid > cTid) return 0; } for(i=0; ibw, PyString_AsString(chroms)); if(cTid == (uint32_t) -1) return 0; ustart = Numeric2Uint(starts); if(PyErr_Occurred()) return 0; uspan = Numeric2Uint(span); if(PyErr_Occurred()) return 0; if(uspan < 1) return 0; ustep = Numeric2Uint(step); if(PyErr_Occurred()) return 0; if(ustep < 1) return 0; if(lastTid != (uint32_t) -1) { if(lastTid > cTid) return 0; if(lastTid == cTid) { if(ustart < lastEnd) return 0; } } return 1; } return 0; } PyObject *pyBwAddEntries(pyBigWigFile_t *self, PyObject *args, PyObject *kwds) { static char *kwd_list[] = {"chroms", "starts", "ends", "values", "span", "step", "validate", NULL}; PyObject *chroms = NULL, *starts = NULL, *ends = NULL, *values = NULL, *span = NULL, *step = NULL; PyObject *validate = Py_True; int desiredType; if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO|OOOOO", kwd_list, &chroms, &starts, &ends, &values, &span, &step, &validate)) { PyErr_SetString(PyExc_RuntimeError, "Illegal arguments"); return NULL; } desiredType = getType(chroms, starts, ends, values, span, step); if(desiredType == -1) { PyErr_SetString(PyExc_RuntimeError, "You must provide a valid set of entries. These can be comprised of any of the following: \n" "1. A list of each of chromosomes, start positions, end positions and values.\n" "2. A list of each of start positions and values. Also, a chromosome and span must be specified.\n" "3. 
A list values, in which case a single chromosome, start position, span and step must be specified.\n"); return NULL; } if(validate == Py_True && !addEntriesInputOK(self, chroms, starts, ends, span, step, desiredType)) { PyErr_SetString(PyExc_RuntimeError, "The entries you tried to add are out of order, precede already added entries, or otherwise use illegal values.\n" " Please correct this and try again.\n"); return NULL; } if(canAppend(self, desiredType, chroms, starts, span, step)) { switch(desiredType) { case 0: if(PyAppendIntervals(self, starts, ends, values)) goto error; break; case 1: if(PyAppendIntervalSpans(self, starts, values)) goto error; break; case 2: if(PyAppendIntervalSpanSteps(self, values)) goto error; break; } } else { switch(desiredType) { case 0: if(PyAddIntervals(self, chroms, starts, ends, values)) goto error; break; case 1: if(PyAddIntervalSpans(self, chroms, starts, values, span)) goto error; break; case 2: if(PyAddIntervalSpanSteps(self, chroms, starts, values, span, step)) goto error; break; } } self->lastType = desiredType; Py_INCREF(Py_None); return Py_None; error: return NULL; } /************************************************************** * * BigBed functions, added in 0.3.0 * **************************************************************/ static PyObject *pyBBGetEntries(pyBigWigFile_t *self, PyObject *args, PyObject *kwds) { bigWigFile_t *bw = self->bw; uint32_t i; uint32_t start, end = -1, tid; unsigned long startl, endl; char *chrom; static char *kwd_list[] = {"chrom", "start", "end", "withString", NULL}; PyObject *ret, *t; PyObject *withStringPy = Py_True; int withString = 1; bbOverlappingEntries_t *o; if(bw->type == 0) { PyErr_SetString(PyExc_RuntimeError, "bigWig files have no entries! 
Use 'intervals' or 'values' instead."); return NULL; } if(!PyArg_ParseTupleAndKeywords(args, kwds, "skk|O", kwd_list, &chrom, &startl, &endl, &withStringPy)) { PyErr_SetString(PyExc_RuntimeError, "You must supply a chromosome, start and end position.\n"); return NULL; } tid = bwGetTid(bw, chrom); if(endl == (unsigned long) -1 && tid != (uint32_t) -1) endl = bw->cl->len[tid]; if(tid == (uint32_t) -1 || startl > end || endl > end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } start = (uint32_t) startl; end = (uint32_t) endl; if(end <= start || end > bw->cl->len[tid] || start >= end) { PyErr_SetString(PyExc_RuntimeError, "Invalid interval bounds!"); return NULL; } if(withStringPy == Py_False) withString = 0; o = bbGetOverlappingEntries(bw, chrom, start, end, withString); if(!o) { PyErr_SetString(PyExc_RuntimeError, "An error occurred while fetching the overlapping entries!\n"); return NULL; } if(!o->l) { Py_INCREF(Py_None); return Py_None; } ret = PyList_New(o->l); if(!ret) goto error; for(i=0; il; i++) { if(withString) { t = Py_BuildValue("(iis)", o->start[i], o->end[i], o->str[i]); } else { t = Py_BuildValue("(ii)", o->start[i], o->end[i]); } if(!t) goto error; PyList_SetItem(ret, i, t); } bbDestroyOverlappingEntries(o); return ret; error: Py_DECREF(ret); bbDestroyOverlappingEntries(o); PyErr_SetString(PyExc_RuntimeError, "An error occurred while constructing the output list and tuple!"); return NULL; } static PyObject *pyBBGetSQL(pyBigWigFile_t *self, PyObject *args) { bigWigFile_t *bw = self->bw; char *str = bbGetSQL(bw); size_t len = 0; PyObject *o = NULL; if(!str) { Py_INCREF(Py_None); return Py_None; } len = strlen(str); #if PY_MAJOR_VERSION >= 3 o = PyBytes_FromStringAndSize(str, len); #else o = PyString_FromStringAndSize(str, len); #endif if(str) free(str); return o; } static PyObject *pyIsBigWig(pyBigWigFile_t *self, PyObject *args) { bigWigFile_t *bw = self->bw; if(bw->type == 0) { Py_INCREF(Py_True); return Py_True; } 
Py_INCREF(Py_False); return Py_False; } static PyObject *pyIsBigBed(pyBigWigFile_t *self, PyObject *args) { bigWigFile_t *bw = self->bw; if(bw->type == 1) { Py_INCREF(Py_True); return Py_True; } Py_INCREF(Py_False); return Py_False; } /************************************************************** * * End of bigBed functions * **************************************************************/ #if PY_MAJOR_VERSION >= 3 PyMODINIT_FUNC PyInit_pyBigWig(void) { PyObject *res; errno = 0; //just in case if(Py_AtExit(bwCleanup)) return NULL; if(PyType_Ready(&bigWigFile) < 0) return NULL; if(Py_AtExit(bwCleanup)) return NULL; if(bwInit(128000)) return NULL; res = PyModule_Create(&pyBigWigmodule); if(!res) return NULL; Py_INCREF(&bigWigFile); PyModule_AddObject(res, "pyBigWig", (PyObject *) &bigWigFile); #ifdef WITHNUMPY //Add the numpy constant import_array(); //Needed for numpy stuff to work PyModule_AddIntConstant(res, "numpy", 1); #else PyModule_AddIntConstant(res, "numpy", 0); #endif return res; } #else //Python2 initialization PyMODINIT_FUNC initpyBigWig(void) { PyObject *res; errno=0; //Sometimes libpython2.7.so is missing some links... 
if(Py_AtExit(bwCleanup)) return; if(PyType_Ready(&bigWigFile) < 0) return; if(bwInit(128000)) return; //This is temporary res = Py_InitModule3("pyBigWig", bwMethods, "A module for handling bigWig files"); #ifdef WITHNUMPY //Add the numpy constant import_array(); //Needed for numpy stuff to work PyModule_AddIntConstant(res, "numpy", 1); #else PyModule_AddIntConstant(res, "numpy", 0); #endif } #endif pyBigWig-0.3.2/pyBigWig.h0000640000201600010240000004333413015055353015540 0ustar ryanbioinfo00000000000000#include #include #include "bigWig.h" typedef struct { PyObject_HEAD bigWigFile_t *bw; int32_t lastTid; //The TID of the last written entry (or -1) uint32_t lastSpan; //The span of the last written entry (if applicable) uint32_t lastStep; //The step of the last written entry (if applicable) uint32_t lastStart; //The next start position (if applicable) int lastType; //The type of the last written entry } pyBigWigFile_t; static PyObject *pyBwOpen(PyObject *self, PyObject *pyFname); static PyObject *pyBwClose(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyBwGetChroms(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyIsBigWig(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyIsBigBed(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyBwGetStats(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); #ifdef WITHNUMPY static PyObject *pyBwGetValues(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); #else static PyObject *pyBwGetValues(pyBigWigFile_t *pybw, PyObject *args); #endif static PyObject *pyBwGetIntervals(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); static PyObject *pyBBGetEntries(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); static PyObject *pyBBGetSQL(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyBwGetHeader(pyBigWigFile_t *pybw, PyObject *args); static PyObject *pyBwAddHeader(pyBigWigFile_t *pybw, PyObject *args, PyObject *kwds); static PyObject *pyBwAddEntries(pyBigWigFile_t *pybw, PyObject 
*args, PyObject *kwds); static void pyBwDealloc(pyBigWigFile_t *pybw); //The function types aren't actually correct... static PyMethodDef bwMethods[] = { {"open", (PyCFunction)pyBwOpen, METH_VARARGS, "Open a bigWig or bigBed file. For remote files, give a URL starting with HTTP,\n\ FTP, or HTTPS.\n\ \n\ Optional arguments:\n\ mode: An optional mode. The default is 'r', which opens a file for reading.\n\ If you specify a mode containing 'w' then you'll instead open a file\n\ for writing. Note that you then need to add an appropriate header\n\ before use. For bigBed files, only reading is supported.\n\ \n\ Returns:\n\ A bigWigFile object on success, otherwise None.\n\ \n\ Arguments:\n\ file: The name of a bigWig file.\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"some_file.bw\")\n"}, {"header", (PyCFunction)pyBwGetHeader, METH_VARARGS, "Returns the header of a bigWig file. This contains information such as: \n\ * The version number of the file ('version').\n\ * The number of zoom levels ('nLevels').\n\ * The number of bases covered ('nBasesCovered').\n\ * The minimum value ('minVal').\n\ * The maximum value ('maxVal').\n\ * The sum of all values ('sumData').\n\ * The sum of the square of all values ('sumSquared').\n\ These are returned as a dictionary.\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"some_file.bw\")\n\ >>> bw.header()\n\ {'maxVal': 2L, 'sumData': 272L, 'minVal': 0L, 'version': 4L,\n\ 'sumSquared': 500L, 'nLevels': 1L, 'nBasesCovered': 154L}\n\ >>> bw.close()\n"}, {"close", (PyCFunction)pyBwClose, METH_VARARGS, "Close a bigWig file.\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"some_file.bw\")\n\ >>> bw.close()\n"}, {"isBigWig", (PyCFunction)pyIsBigWig, METH_VARARGS, "Returns True if the object is a bigWig file (otherwise False).\n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"some_file.bigWig\")\n\ >>> bw.isBigWig()\n\ True\n\ >>> bw.isBigBed()\n\ False\n"}, {"isBigBed", (PyCFunction)pyIsBigBed, METH_VARARGS, "Returns true 
if the object is a bigBed file (otherwise False).\n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"some_file.bigBed\")\n\ >>> bw.isBigWig()\n\ False\n\ >>> bw.isBigBed()\n\ True\n"}, {"chroms", (PyCFunction)pyBwGetChroms, METH_VARARGS, "Return a chromosome: length dictionary. The order is typically not\n\ alphabetical and the lengths are long (thus the 'L' suffix).\n\ \n\ Optional arguments:\n\ chrom: An optional chromosome name\n\ \n\ Returns:\n\ A list of chromosome lengths or a dictionary of them.\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"test/test.bw\")\n\ >>> bw.chroms()\n\ {'1': 195471971L, '10': 130694993L}\n\ \n\ Note that you may optionally supply a specific chromosome:\n\ \n\ >>> bw.chroms(\"chr1\")\n\ 195471971L\n\ \n\ If you specify a non-existant chromosome then no output is produced:\n\ \n\ >>> bw.chroms(\"foo\")\n\ >>>\n"}, {"stats", (PyCFunction)pyBwGetStats, METH_VARARGS|METH_KEYWORDS, "Return summary statistics for a given range. On error, this function throws a\n\ runtime exception.\n\ \n\ Positional arguments:\n\ chr: Chromosome name\n\ \n\ Keyword arguments:\n\ start: Starting position\n\ end: Ending position\n\ type: Summary type (mean, min, max, coverage, std), default 'mean'.\n\ nBins: Number of bins into which the range should be divided before\n\ computing summary statistics. The default is 1.\n\ exact: By default, pyBigWig uses the same method as Kent's tools from UCSC\n\ for computing statistics. This means that 'zoom levels' may be\n\ used, rather than actual values (please see the pyBigWig repository\n\ on github for further information on this). To avoid this behaviour,\n\ simply specify 'exact=True'. Note that values returned will then\n\ differ from what UCSC, IGV, and similar other tools will report.\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"test/test.bw\")\n\ >>> bw.stats(\"1\", 0, 3)\n\ [0.2000000054637591]\n\ \n\ This is the mean value over the range 1:1-3 (in 1-based coordinates). 
If\n\ the start and end positions aren't given the entire chromosome is used.\n\ There are additional optional parameters 'type' and 'nBins'. 'type'\n\ specifies the type of summary information to calculate, which is 'mean'\n\ by default. Other possibilites for 'type' are: 'min' (minimum value),\n\ 'max' (maximum value), 'coverage' (number of covered bases), and 'std'\n\ (standard deviation). 'nBins' defines how many bins the region will be\n\ divided into and defaults to 1.\n\ \n\ >>> bw.stats(\"1\", 0, 3, type=\"min\")\n\ [0.10000000149011612]\n\ >>> bw.stats(\"1\", 0, 3, type=\"max\")\n\ [0.30000001192092896]\n\ >>> bw.stats(\"1\", 0, 10, type=\"coverage\")\n\ [0.30000000000000004]\n\ >>> bw.stats(\"1\", 0, 3, type=\"std\")\n\ [0.10000000521540645]\n\ >>> bw.stats(\"1\",99,200, type=\"max\", nBins=2)\n\ [1.399999976158142, 1.5]\n"}, #ifdef WITHNUMPY {"values", (PyCFunction)pyBwGetValues, METH_VARARGS|METH_KEYWORDS, "Retrieve the value stored for each position (or None). On error, a runtime\n\ exception is thrown.\n\ \n\ Positional arguments:\n\ chr: Chromosome name\n\ start: Starting position\n\ end: Ending position\n\ \n\ Optional arguments:\n\ numpy: If True, return a numpy array rather than a list of values. This\n\ is generally more memory efficient. Note that this option is only\n\ available if pyBigWig was installed with numpy support (check the\n\ pyBigWig.numpy() function).\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"test/test.bw\")\n\ >>> bw.values(\"1\", 0, 3)\n\ [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]\n\ \n\ The length of the returned list will always match the length of the\n\ range. Any uncovered bases will have a value of None.\n\ \n\ >>> bw.values(\"1\", 0, 4)\n\ [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, None]\n\ \n"}, #else {"values", (PyCFunction)pyBwGetValues, METH_VARARGS, "Retrieve the value stored for each position (or None). 
On error, a runtime\n\ exception is thrown.\n\ \n\ Positional arguments:\n\ chr: Chromosome name\n\ start: Starting position\n\ end: Ending position\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"test/test.bw\")\n\ >>> bw.values(\"1\", 0, 3)\n\ [0.10000000149011612, 0.20000000298023224, 0.30000001192092896]\n\ \n\ The length of the returned list will always match the length of the\n\ range. Any uncovered bases will have a value of None.\n\ \n\ >>> bw.values(\"1\", 0, 4)\n\ [0.10000000149011612, 0.20000000298023224, 0.30000001192092896, None]\n\ \n"}, #endif {"intervals", (PyCFunction)pyBwGetIntervals, METH_VARARGS|METH_KEYWORDS, "Retrieve each interval covering a part of a chromosome/region. On error, a\n\ runtime exception is thrown.\n\ \n\ Positional arguments:\n\ chr: Chromosome name\n\ \n\ Keyword arguments:\n\ start: Starting position\n\ end: Ending position\n\ \n\ If start and end aren't specified, the entire chromosome is returned.\n\ The returned object is a tuple containing the starting position, end\n\ position, and value of each interval in the file. As with all bigWig\n\ positions, those returned are 0-based half-open (e.g., a start of 0 and\n\ end of 10 specifies the first 10 positions).\n\ \n\ >>> import pyBigWig\n\ >>> bw = pyBigWig.open(\"test/test.bw\")\n\ >>> bw.intervals(\"1\", 0, 3)\n\ ((0, 1, 0.10000000149011612), (1, 2, 0.20000000298023224),\n\ (2, 3, 0.30000001192092896))\n\ >>> bw.close()"}, {"entries", (PyCFunction) pyBBGetEntries, METH_VARARGS|METH_KEYWORDS, "Retrieves entries from a bigBed file. These can optionally contain the string\n\ associated with each entry.\n\ \n\ Positional arguments:\n\ chr: Chromosome name\n\ \n\ Keyword arguments:\n\ start: Starting position\n\ end: Ending position\n\ withString: If True, return the string associated with each entry.\n\ Default True.\n\ \n\ The output is a list of tuples, with members \"start\", \"end\", and \"string\"\n\ (assuming \"withString=True\"). 
If there are no overlapping entries, then None\n\ is returned.\n\ \n\ >>> import pyBigWig\n\ >>> bb = pyBigWig.open(\"https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed\")\n\ >>> print(bw.entries('chr1',10000000,10020000))\n\ [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'),\n\ (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'),\n\ (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')]\n\ >>> print(bb.entries(\"chr1\", 10000000, 10000500, withString=False))\n\ [(10009333, 10009640), (10014007, 10014289), (10014373, 10024307)]\n\ \n"}, {"SQL", (PyCFunction) pyBBGetSQL, METH_VARARGS, "Returns the SQL string associated with the file. This is typically useful for\n\ bigBed files, where this determines what is held in each column of the text\n\ string associated with entries.\n\ \n\ If there is no SQL string, then None is returned.\n\ \n\ >>> import pyBigWig\n\ >>> bb = pyBigWig.open(\"https://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed\")\n\ >>> print(bb.SQL())\n\ table RnaElements\n\ \"BED6 + 3 scores for RNA Elements data \"\n\ (\n\ string chrom; \"Reference sequence chromosome or scaffold\"\n\ uint chromStart; \"Start position in chromosome\"\n\ uint chromEnd; \"End position in chromosome\"\n\ string name; \"Name of item\"\n\ uint score; \"Normalized score from 0-1000\"\n\ char[1] strand; \"+ or - or . for unknown\"\n\ float level; \"Expression level such as RPKM or FPKM. Set to -1 for no data.\"\n\ float signif; \"Statistical significance such as IDR. Set to -1 for no data.\"\n\ uint score2; \"Additional measurement/count e.g. number of reads. Set to 0 for no data.\"\n\ )\n\ \n\ \n"}, {"addHeader", (PyCFunction)pyBwAddHeader, METH_VARARGS|METH_KEYWORDS, "Adds a header to a file opened for writing. This MUST be called before adding\n\ any entries. 
On error, a runtime exception is thrown.\n\ \n\ Positional arguments:\n\ cl: A chromosome list, of the form (('chr1', 1000), ('chr2', 2000), ...).\n\ In other words, each element of the list is a tuple containing a\n\ chromosome name and its associated length.\n\ \n\ Keyword arguments:\n\ maxZooms: The maximum number of zoom levels. The value must be >=0. The\n\ default is 10.\n\ \n\ >>> import pyBigWig\n\ >>> import tempfile\n\ >>> import os\n\ >>> ofile = tempfile.NamedTemporaryFile(delete=False)\n\ >>> oname = ofile.name\n\ >>> ofile.close()\n\ >>> bw = pyBigWig.open(oname, 'w')\n\ >>> bw.addHeader([(\"1\", 1000000), (\"2\", 1500000)], maxZooms=0)\n\ >>> bw.close()\n\ >>> os.remove(oname)"}, {"addEntries", (PyCFunction)pyBwAddEntries, METH_VARARGS|METH_KEYWORDS, "Adds one or more entries to a bigWig file. This returns nothing, but throws a\n\ runtime exception on error.\n\ \n\ This function always accepts an optional 'validate' option. If set to 'True',\n\ which is the default, the input entries are checked to ensure that they come\n\ after previously entered entries. This comes with significant overhead, so if\n\ this is instead 'False' then this validation is not performed.\n\ \n\ There are three manners in which entries can be stored in bigWig files.\n\ \n\ \n\ bedGraph-like entries (12 bytes each):\n\ \n\ Positional arguments:\n\ chrom: A list of chromosome. These MUST match those added with addHeader().\n\ starts: A list of start positions. These are 0-based.\n\ \n\ Keyword arguments:\n\ ends: A list of end positions. These are 0-based half open, so a start of\n\ 0 and end of 10 specifies the first 10 bases.\n\ values: A list of values.\n\ \n\ \n\ Variable-step entries (8 bytes each):\n\ \n\ Positional arguments:\n\ chrom: A chromosome name. This MUST match one added with addHeader().\n\ starts: A list of start positions. These are 0-based.\n\ \n\ Keyword arguments:\n\ values: A list of values.\n\ span: A span width. 
This is an integer value and specifies how many bases\n\ each entry describes. An entry with a start position of 0 and a span\n\ of 10 describes the first 10 bases.\n\ \n\ \n\ Fixed-step entries (4 bytes each):\n\ \n\ Positional arguments:\n\ chrom: A chromosome name. This MUST match one added with addHeader().\n\ starts: A start position. These are 0-based. The start position of each\n\ entry starts 'step' after the previous and describes 'span' bases.\n\ \n\ Keyword arguments:\n\ values: A list of values.\n\ span: A span width. This is an integer value and specifies how many bases\n\ each entry describes. An entry with a start position of 0 and a span\n\ of 10 describes the first 10 bases.\n\ step: A step width. Each subsequent entry begins this number of bases\n\ after the previous. So if the first entry has a start of 0 and step\n\ or 30, the second entry will start at 30.\n\ \n\ >>> import pyBigWig\n\ >>> import tempfile\n\ >>> import os\n\ >>> ofile = tempfile.NamedTemporaryFile(delete=False)\n\ >>> oname = ofile.name\n\ >>> ofile.close()\n\ >>> bw = pyBigWig.open(oname, 'w')\n\ >>> bw.addHeader([(\"1\", 1000000), (\"2\", 1500000)])\n\ >>> #Add some bedGraph-like entries\n\ >>> bw.addEntries([\"1\", \"1\", \"1\"], [0, 100, 125], ends=[5, 120, 126], values=[0.0, 1.0, 200.0])\n\ >>> #Variable-step entries, the span 500-520, 600-620, and 635-655\n\ >>> bw.addEntries(\"1\", [500, 600, 635], values=[-2.0, 150.0, 25.0], span=20)\n\ >>> #Fixed-step entries, the bases described are 900-920, 930-950, and 960-980\n\ >>> bw.addEntries(\"1\", 900, values=[-5.0, -20.0, 25.0], span=20, step=30)\n\ >>> #This only works due to using validate=False. 
Obviously the file is then corrupt.\n\ >>> bw.addEntries([\"1\", \"1\", \"1\"], [0, 100, 125], ends=[5, 120, 126], values=[0.0, 1.0, 200.0], validate=False)\n\ >>> bw.close()\n\ >>> os.remove(oname)"}, {NULL, NULL, 0, NULL} }; #if PY_MAJOR_VERSION >= 3 struct pyBigWigmodule_state { PyObject *error; }; #define GETSTATE(m) ((struct pyBigWigmodule_state*)PyModule_GetState(m)) static PyModuleDef pyBigWigmodule = { PyModuleDef_HEAD_INIT, "pyBigWig", "A python module for bigWig file access", -1, bwMethods, NULL, NULL, NULL, NULL }; #endif //Should set tp_dealloc, tp_print, tp_repr, tp_str, tp_members static PyTypeObject bigWigFile = { #if PY_MAJOR_VERSION >= 3 PyVarObject_HEAD_INIT(NULL, 0) #else PyObject_HEAD_INIT(NULL) 0, /*ob_size*/ #endif "pyBigWig.bigWigFile", /*tp_name*/ sizeof(pyBigWigFile_t), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)pyBwDealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ 0, /*tp_as_mapping*/ 0, /*tp_hash*/ 0, /*tp_call*/ 0, /*tp_str*/ PyObject_GenericGetAttr, /*tp_getattro*/ PyObject_GenericSetAttr, /*tp_setattro*/ 0, /*tp_as_buffer*/ #if PY_MAJOR_VERSION >= 3 Py_TPFLAGS_DEFAULT, /*tp_flags*/ #else Py_TPFLAGS_HAVE_CLASS, /*tp_flags*/ #endif "bigWig File", /*tp_doc*/ 0, /*tp_traverse*/ 0, /*tp_clear*/ 0, /*tp_richcompare*/ 0, /*tp_weaklistoffset*/ 0, /*tp_iter*/ 0, /*tp_iternext*/ bwMethods, /*tp_methods*/ 0, /*tp_members*/ 0, /*tp_getset*/ 0, /*tp_base*/ 0, /*tp_dict*/ 0, /*tp_descr_get*/ 0, /*tp_descr_set*/ 0, /*tp_dictoffset*/ 0, /*tp_init*/ 0, /*tp_alloc*/ 0, /*tp_new*/ 0,0,0,0,0,0 }; pyBigWig-0.3.2/setup.py0000750000201600010240000000652313016026753015364 0ustar ryanbioinfo00000000000000#!/usr/bin/env python from setuptools import setup, Extension, find_packages from distutils import sysconfig import subprocess import glob import sys try: from numpy.distutils.misc_util import get_info from os.path import dirname WITHNUMPY = True except: 
WITHNUMPY = False srcs = [x for x in glob.glob("libBigWig/*.c")] srcs.append("pyBigWig.c") libs=["m", "z", "curl"] if sysconfig.get_config_vars('BLDLIBRARY') is not None: #Note the "-l" prefix! for e in sysconfig.get_config_vars('BLDLIBRARY')[0].split(): if e[0:2] == "-l": libs.append(e[2:]) elif(sys.version_info[0] >= 3 and sys.version_info[1] >= 3) : libs.append("python%i.%im" % (sys.version_info[0], sys.version_info[1])) else : libs.append("python%i.%i" % (sys.version_info[0], sys.version_info[1])) additional_libs = [sysconfig.get_config_var("LIBDIR"), sysconfig.get_config_var("LIBPL")] try: foo, _ = subprocess.Popen(['curl-config', '--libs'], stdout=subprocess.PIPE).communicate() except: sys.exit("Either libcurl isn't installed, it didn't come with curl-config, or curl-config isn't in your $PATH. This must be corrected before installing pyBigWig!\n") foo = foo.strip().split() for v in foo: if(v[0:2] == "-L") : additional_libs.append(v[2:]) include_dirs = ['libBigWig', sysconfig.get_config_var("INCLUDEPY")] defines = [] if WITHNUMPY is True: defines.extend([('WITHNUMPY', None), ('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')]) extra_info = get_info('npymath') include_dirs.extend(extra_info['include_dirs']) libs.extend(extra_info['libraries']) extra_info['library_dirs'].extend(additional_libs) additional_libs = extra_info['library_dirs'] module1 = Extension('pyBigWig', sources = srcs, libraries = libs, library_dirs = additional_libs, define_macros = defines, include_dirs = include_dirs) setup(name = 'pyBigWig', version = '0.3.2', description = 'A package for accessing bigWig files using libBigWig', author = "Devon P. 
Ryan", author_email = "ryan@ie-freiburg.mpg.de", url = "https://github.com/dpryan79/pyBigWig", download_url = "https://github.com/dpryan79/pyBigWig/tarball/0.3.2", keywords = ["bioinformatics", "bigWig", "bigBed"], classifier = ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved", "Programming Language :: C", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: Implementation :: CPython", "Operating System :: POSIX", "Operating System :: Unix", "Operating System :: MacOS"], packages = find_packages(), include_package_data = True, extras_require = {'numpy input': ["numpy"]}, ext_modules = [module1]) pyBigWig-0.3.2/PKG-INFO0000640000201600010240000000057613016027005014736 0ustar ryanbioinfo00000000000000Metadata-Version: 1.1 Name: pyBigWig Version: 0.3.2 Summary: A package for accessing bigWig files using libBigWig Home-page: https://github.com/dpryan79/pyBigWig Author: Devon P. Ryan Author-email: ryan@ie-freiburg.mpg.de License: UNKNOWN Download-URL: https://github.com/dpryan79/pyBigWig/tarball/0.3.2 Description: UNKNOWN Keywords: bioinformatics,bigWig,bigBed Platform: UNKNOWN pyBigWig-0.3.2/setup.cfg0000640000201600010240000000014413016027005015451 0ustar ryanbioinfo00000000000000[metadata] description-file = README.md [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0