pax_global_header 0000666 0000000 0000000 00000000064 12464172677 0014532 g ustar 00root root 0000000 0000000 52 comment=26229a368e3045a827f04fc03f3879e25f110ba3
htslib-1.2.1/ 0000775 0000000 0000000 00000000000 12464172677 0013020 5 ustar 00root root 0000000 0000000 htslib-1.2.1/INSTALL 0000664 0000000 0000000 00000006550 12464172677 0014057 0 ustar 00root root 0000000 0000000 Basic Installation
==================
To build and install HTSlib, 'cd' to the htslib-1.x directory containing
the package's source and type the following commands:
./configure
make
make install
The './configure' command checks your build environment and allows various
optional functionality to be enabled (see Configuration below). If you
don't want to select any optional functionality, you may wish to omit
configure and just type 'make; make install' as for previous versions
of HTSlib. However if the build fails you should run './configure' as
it can diagnose the common reasons for build failures.
The 'make' command builds the HTSlib library and various useful
utilities: bgzip, htsfile, and tabix. If compilation fails you should
run './configure' as it can diagnose problems with your build environment
that cause build failures.
The 'make install' command installs the libraries, library header files,
utilities, several manual pages, and a pkgconfig file to /usr/local.
The installation location can be changed by configuring with --prefix=DIR
or via 'make prefix=DIR install' (see Installation Locations below).
Configuration
=============
By default, './configure' examines your build environment, checking for
requirements such as the zlib development files, and arranges for a plain
HTSlib build. The following configure options can be used to enable
various features and specify further optional external requirements:
--with-irods[=DIR]
Specifies the location of the iRODS client library to use to enable
access to data objects stored in iRODS (<http://irods.org/>) via file
paths like 'irods:DATAOBJ'. DIR is the base of an iRODS source tree
such that the library is present as DIR/lib/core/obj/libRodsAPI.* and
headers are present under DIR/lib/api/include and so on. If '=DIR' is
omitted, $IRODS_HOME will be used as a base directory.
The configure script also accepts the usual options and environment variables
for tuning installation locations and compilers: type './configure --help'
for details. For example,
./configure CC=icc --prefix=/opt/icc-compiled
would specify that HTSlib is to be built with icc and installed into bin,
lib, etc subdirectories under /opt/icc-compiled.
Installation Locations
======================
By default, 'make install' installs HTSlib libraries under /usr/local/lib,
HTSlib header files under /usr/local/include, utility programs under
/usr/local/bin, etc. (To be precise, the header files are installed within
a fixed 'htslib' subdirectory under the specified .../include location.)
You can specify a different location to install HTSlib by configuring
with --prefix=DIR or specify locations for particular parts of HTSlib by
configuring with --libdir=DIR and so on. Type './configure --help' for
the full list of such install directory options.
Alternatively you can specify different locations at install time by
typing 'make prefix=DIR install' or 'make libdir=DIR install' and so on.
Consult the list of prefix/exec_prefix/etc variables near the top of the
Makefile for the full list of such variables that can be overridden.
You can also specify a staging area by typing 'make DESTDIR=DIR install',
possibly in conjunction with other --prefix or prefix=DIR settings.
For example,
make DESTDIR=/tmp/staging prefix=/opt install
would install into bin, lib, etc subdirectories under /tmp/staging/opt.
htslib-1.2.1/LICENSE 0000664 0000000 0000000 00000006724 12464172677 0014036 0 ustar 00root root 0000000 0000000 [Files in this distribution outwith the cram/ subdirectory are distributed
according to the terms of the following MIT/Expat license.]
The MIT/Expat License
Copyright (C) 2012-2014 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
[Files within the cram/ subdirectory in this distribution are distributed
according to the terms of the following Modified 3-Clause BSD license.]
The Modified-BSD License
Copyright (C) 2012-2014 Genome Research Ltd.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
nor the names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[The use of a range of years within a copyright notice in this distribution
should be interpreted as being equivalent to a list of years including the
first and last year specified and all consecutive years between them.
For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
2011-2012" should be interpreted as being identical to a notice that reads
"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
that reads "Copyright (C) 2005-2012" should be interpreted as being identical
to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012".]
htslib-1.2.1/Makefile 0000664 0000000 0000000 00000036377 12464172677 0014500 0 ustar 00root root 0000000 0000000 # Makefile for htslib, a C library for high-throughput sequencing data formats.
#
# Copyright (C) 2013-2015 Genome Research Ltd.
#
# Author: John Marshall
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
CC = gcc
AR = ar
RANLIB = ranlib
CPPFLAGS = -I.
# TODO: probably update cram code to make it compile cleanly with -Wc++-compat
CFLAGS = -g -Wall -O2
EXTRA_CFLAGS_PIC = -fpic
LDFLAGS =
LDLIBS =
# For now these don't work too well as samtools also needs to know to
# add -lbz2 and -llzma if linking against the static libhts.a library.
# TODO This needs configury and adding to htslib.pc.in.
#
# # Bzip2 support; optionally used by CRAM.
# HAVE_LIBBZ2 := $(shell echo -e "\#include <bzlib.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -lbz2 2>/dev/null && echo yes)
# ifeq "$(HAVE_LIBBZ2)" "yes"
# CPPFLAGS += -DHAVE_LIBBZ2
# LDLIBS += -lbz2
# endif
#
# # Lzma support; optionally used by CRAM.
# HAVE_LIBLZMA := $(shell echo -e "\#include <lzma.h>\012int main(void){return 0;}" > .test.c && $(CC) $(CFLAGS) $(CPPFLAGS) -o .test .test.c -llzma 2>/dev/null && echo yes)
# ifeq "$(HAVE_LIBLZMA)" "yes"
# CPPFLAGS += -DHAVE_LIBLZMA
# LDLIBS += -llzma
# endif
prefix = /usr/local
exec_prefix = $(prefix)
bindir = $(exec_prefix)/bin
includedir = $(prefix)/include
libdir = $(exec_prefix)/lib
datarootdir = $(prefix)/share
mandir = $(datarootdir)/man
man1dir = $(mandir)/man1
man5dir = $(mandir)/man5
pkgconfigdir= $(libdir)/pkgconfig
MKDIR_P = mkdir -p
INSTALL = install -p
INSTALL_PROGRAM = $(INSTALL)
INSTALL_DATA = $(INSTALL) -m 644
INSTALL_DIR = $(MKDIR_P) -m 755
BUILT_PROGRAMS = \
bgzip \
htsfile \
tabix
BUILT_TEST_PROGRAMS = \
test/fieldarith \
test/hfile \
test/sam \
test/test-regidx \
test/test_view \
test/test-vcf-api \
test/test-vcf-sweep
all: lib-static lib-shared $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
HTSPREFIX =
include htslib_vars.mk
lib-static: libhts.a
# $(shell), :=, and ifeq/.../endif are GNU Make-specific. If you don't have
# GNU Make, comment out the parts of this conditional that don't apply.
PLATFORM := $(shell uname -s)
ifeq "$(PLATFORM)" "Darwin"
SHLIB_FLAVOUR = dylib
lib-shared: libhts.dylib
else
SHLIB_FLAVOUR = so
lib-shared: libhts.so
endif
PACKAGE_VERSION = 1.2.1
LIBHTS_SOVERSION = 1
# $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string
# even if this is a dirty or untagged Git working tree.
NUMERIC_VERSION = $(PACKAGE_VERSION)
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
# as $(PACKAGE_VERSION) above, or an exact description likely based on a tag.
# Much of this is also GNU Make-specific. If you don't have GNU Make and/or
# are not building from a Git repository, comment out this conditional.
ifneq "$(wildcard .git)" ""
original_version := $(PACKAGE_VERSION)
PACKAGE_VERSION := $(shell git describe --always --dirty)
# Unless the Git description matches /\d*\.\d*(\.\d*)?/, i.e., is exactly a tag
# with a numeric name, revert $(NUMERIC_VERSION) to the original version number
# written above, but with the patchlevel field bumped to 255.
ifneq "$(subst ..,.,$(subst 0,,$(subst 1,,$(subst 2,,$(subst 3,,$(subst 4,,$(subst 5,,$(subst 6,,$(subst 7,,$(subst 8,,$(subst 9,,$(PACKAGE_VERSION))))))))))))" "."
empty :=
NUMERIC_VERSION := $(subst $(empty) ,.,$(wordlist 1,2,$(subst ., ,$(original_version))) 255)
endif
# Force version.h to be remade if $(PACKAGE_VERSION) has changed.
version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force))
endif
version.h:
echo '#define HTS_VERSION "$(PACKAGE_VERSION)"' > $@
print-version:
@echo $(PACKAGE_VERSION)
.SUFFIXES: .c .o .pico
.c.o:
$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
.c.pico:
$(CC) $(CFLAGS) $(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $<
LIBHTS_OBJS = \
kfunc.o \
knetfile.o \
kstring.o \
bgzf.o \
faidx.o \
hfile.o \
hfile_net.o \
hts.o \
regidx.o \
sam.o \
synced_bcf_reader.o \
vcf_sweep.o \
tbx.o \
vcf.o \
vcfutils.o \
cram/cram_codecs.o \
cram/cram_decode.o \
cram/cram_encode.o \
cram/cram_index.o \
cram/cram_io.o \
cram/cram_samtools.o \
cram/cram_stats.o \
cram/files.o \
cram/mFILE.o \
cram/md5.o \
cram/open_trace_file.o \
cram/pooled_alloc.o \
cram/rANS_static.o \
cram/sam_header.o \
cram/string_alloc.o \
cram/thread_pool.o \
cram/vlen.o \
cram/zfio.o
cram_h = cram/cram.h $(cram_samtools_h) $(cram_sam_header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h
cram_io_h = cram/cram_io.h $(cram_misc_h)
cram_misc_h = cram/misc.h cram/os.h
cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h htslib/khash.h htslib/kstring.h
cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
cram_structs_h = cram/cram_structs.h cram/thread_pool.h cram/string_alloc.h htslib/khash.h
cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
# To be effective, config.mk needs to appear after most Makefile variables are
# set but before most rules appear, so that it can both use previously-set
# variables in its own rules' prerequisites and also update variables for use
# in later rules' prerequisites.
# sinclude is GNU Make-specific. If you don't have GNU Make or another make
# that understands sinclude, change this to 'include' if you are using the
# configure script or just comment the line out if you are not.
sinclude config.mk
libhts.a: $(LIBHTS_OBJS)
@-rm -f $@
$(AR) -rc $@ $(LIBHTS_OBJS)
-$(RANLIB) $@
# The target here is libhts.so, as that is the built file that other rules
# depend upon and that is used when -lhts appears in other program's recipes.
# As a byproduct invisible to make, libhts.so.NN is also created, as it is the
# file used at runtime (when $LD_LIBRARY_PATH includes the build directory).
libhts.so: $(LIBHTS_OBJS:.o=.pico)
$(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) -pthread $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LDLIBS) -lz -lm
ln -sf $@ libhts.so.$(LIBHTS_SOVERSION)
# Similarly this also creates libhts.NN.dylib as a byproduct, so that programs
# when run can find this uninstalled shared library (when $DYLD_LIBRARY_PATH
# includes this project's build directory).
libhts.dylib: $(LIBHTS_OBJS)
$(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LDLIBS) -lz
ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib
bgzf.o bgzf.pico: bgzf.c $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) htslib/khash.h
kstring.o kstring.pico: kstring.c htslib/kstring.h
knetfile.o knetfile.pico: knetfile.c htslib/knetfile.h
hfile.o hfile.pico: hfile.c $(htslib_hfile_h) $(hfile_internal_h)
hfile_irods.o hfile_irods.pico: hfile_irods.c $(hfile_internal_h)
hfile_net.o hfile_net.pico: hfile_net.c $(hfile_internal_h) htslib/knetfile.h
hts.o hts.pico: hts.c version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/ksort.h
vcf.o vcf.pico: vcf.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h
sam.o sam.pico: sam.c $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) htslib/khash.h htslib/kseq.h htslib/kstring.h
tbx.o tbx.pico: tbx.c $(htslib_tbx_h) $(htslib_bgzf_h) htslib/khash.h
faidx.o faidx.pico: faidx.c $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) htslib/khash.h
synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c $(htslib_synced_bcf_reader_h) htslib/kseq.h htslib/khash_str2int.h
vcf_sweep.o vcf_sweep.pico: vcf_sweep.c $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
vcfutils.o vcfutils.pico: vcfutils.c $(htslib_vcfutils_h)
kfunc.o kfunc.pico: kfunc.c htslib/kfunc.h
regidx.o regidx.pico: regidx.c $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h $(HTSPREFIX)htslib/kseq.h $(HTSPREFIX)htslib/khash_str2int.h $(htslib_regidx_h)
cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c $(cram_h)
cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c $(cram_h) cram/os.h cram/md5.h
cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c $(cram_h) cram/os.h cram/md5.h
cram/cram_index.o cram/cram_index.pico: cram/cram_index.c $(htslib_hfile_h) $(cram_h) cram/os.h cram/zfio.h
cram/cram_io.o cram/cram_io.pico: cram/cram_io.c $(cram_h) cram/os.h cram/md5.h $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h)
cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c $(cram_h) $(htslib_sam_h)
cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c $(cram_h) cram/os.h
cram/files.o cram/files.pico: cram/files.c $(cram_misc_h)
cram/mFILE.o cram/mFILE.pico: cram/mFILE.c cram/os.h cram/mFILE.h cram/vlen.h
cram/md5.o cram/md5.pico: cram/md5.c cram/md5.h
cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h)
cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c cram/pooled_alloc.h
cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c cram/rANS_static.h cram/rANS_byte.h
cram/sam_header.o cram/sam_header.pico: cram/sam_header.c $(cram_sam_header_h) cram/string_alloc.h
cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c cram/string_alloc.h
cram/thread_pool.o cram/thread_pool.pico: cram/thread_pool.c cram/thread_pool.h
cram/vlen.o cram/vlen.pico: cram/vlen.c cram/vlen.h cram/os.h
cram/zfio.o cram/zfio.pico: cram/zfio.c cram/os.h cram/zfio.h
bgzip: bgzip.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ bgzip.o libhts.a $(LDLIBS) -lz
htsfile: htsfile.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ htsfile.o libhts.a $(LDLIBS) -lz
tabix: tabix.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ tabix.o libhts.a $(LDLIBS) -lz
bgzip.o: bgzip.c $(htslib_bgzf_h) $(htslib_hts_h)
htsfile.o: htsfile.c $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
tabix.o: tabix.c $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) htslib/kseq.h $(htslib_bgzf_h) $(htslib_hts_h)
# For tests that might use it, set $REF_PATH explicitly to use only reference
# areas within the test suite (or set it to ':' to use no reference areas).
check test: $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
test/sam test/ce.fa
test/test-regidx
cd test && REF_PATH=: ./test_view.pl
cd test && ./test.pl
test/fieldarith: test/fieldarith.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/fieldarith.o libhts.a $(LDLIBS) -lz
test/hfile: test/hfile.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/hfile.o libhts.a $(LDLIBS) -lz
test/sam: test/sam.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/sam.o libhts.a $(LDLIBS) -lz
test/test-regidx: test/test-regidx.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LDLIBS) -lz
test/test_view: test/test_view.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LDLIBS) -lz
test/test-vcf-api: test/test-vcf-api.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-api.o libhts.a $(LDLIBS) -lz
test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
$(CC) -pthread $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LDLIBS) -lz
test/fieldarith.o: test/fieldarith.c $(htslib_sam_h)
test/hfile.o: test/hfile.c $(htslib_hfile_h) $(htslib_hts_defs_h)
test/test-regidx.o: test/test-regidx.c $(htslib_regidx_h)
test/sam.o: test/sam.c $(htslib_sam_h) $(htslib_faidx_h) htslib/kstring.h
test/test_view.o: test/test_view.c $(cram_h) $(htslib_sam_h)
test/test-vcf-api.o: test/test-vcf-api.c $(htslib_hts_h) $(htslib_vcf_h) htslib/kstring.h
test/test-vcf-sweep.o: test/test-vcf-sweep.c $(htslib_vcf_sweep_h)
install: libhts.a $(BUILT_PROGRAMS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig
$(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir)
$(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib
$(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a
$(INSTALL_DATA) htsfile.1 tabix.1 $(DESTDIR)$(man1dir)
$(INSTALL_DATA) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir)
installdirs:
$(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(pkgconfigdir)
# After installation, the real file in $(libdir) will be libhts.so.X.Y.Z,
# with symlinks libhts.so (used via -lhts during linking of client programs)
# and libhts.so.NN (used by client executables at runtime).
install-so: libhts.so installdirs
$(INSTALL_DATA) libhts.so $(DESTDIR)$(libdir)/libhts.so.$(PACKAGE_VERSION)
ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so
ln -sf libhts.so.$(PACKAGE_VERSION) $(DESTDIR)$(libdir)/libhts.so.$(LIBHTS_SOVERSION)
install-dylib: libhts.dylib installdirs
$(INSTALL_PROGRAM) libhts.dylib $(DESTDIR)$(libdir)/libhts.$(PACKAGE_VERSION).dylib
ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.dylib
ln -sf libhts.$(PACKAGE_VERSION).dylib $(DESTDIR)$(libdir)/libhts.$(LIBHTS_SOVERSION).dylib
# Substitute these pseudo-autoconf variables only at install time
# so that "make install prefix=/prefix/path" etc continue to work.
install-pkgconfig: installdirs
sed -e 's#@includedir@#$(includedir)#g;s#@libdir@#$(libdir)#g;s#@PACKAGE_VERSION@#$(PACKAGE_VERSION)#g' htslib.pc.in > $(DESTDIR)$(pkgconfigdir)/htslib.pc
chmod 644 $(DESTDIR)$(pkgconfigdir)/htslib.pc
# A pkg-config file (suitable for copying to $PKG_CONFIG_PATH) that provides
# flags for building against the uninstalled library in this build directory.
# @PACKAGE_VERSION@ is substituted as well (as install-pkgconfig does) so that
# the generated file does not carry a literal placeholder as its version.
htslib-uninstalled.pc: htslib.pc.in
	sed -e 's#@includedir@#'`pwd`'#g;s#@libdir@#'`pwd`'#g;s#@PACKAGE_VERSION@#$(PACKAGE_VERSION)#g' htslib.pc.in > $@
testclean:
-rm -f test/*.tmp test/*.tmp.*
# Remove object files and the generated version.h.  The *.dSYM debug-symbol
# bundles are directories, so they need a recursive remove.
mostlyclean: testclean
	-rm -f *.o *.pico cram/*.o cram/*.pico test/*.o version.h
	-rm -rf test/*.dSYM
clean: mostlyclean clean-$(SHLIB_FLAVOUR)
-rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
distclean: clean
-rm -f config.cache config.log config.mk config.status
-rm -f TAGS *-uninstalled.pc
clean-so:
-rm -f libhts.so libhts.so.*
clean-dylib:
-rm -f libhts.dylib libhts.*.dylib
tags:
ctags -f TAGS *.[ch] cram/*.[ch] htslib/*.h
force:
.PHONY: all check clean distclean force install install-pkgconfig installdirs
.PHONY: lib-shared lib-static mostlyclean print-version tags test testclean
.PHONY: clean-so install-so
.PHONY: clean-dylib install-dylib
htslib-1.2.1/NEWS 0000664 0000000 0000000 00000004233 12464172677 0013521 0 ustar 00root root 0000000 0000000 Noteworthy changes in release 1.2.1 (3 February 2015)
* Reinstated hts_file_type() and FT_* macros, which were available until 1.1
but briefly removed in 1.2. This function is deprecated and will be removed
in a future release -- you should use hts_detect_format() etc instead
Noteworthy changes in release 1.2 (2 February 2015)
* HTSlib now has a configure script which checks your build environment
and allows for selection of optional extras. See INSTALL for details
* By default, reference sequences are fetched from the EBI CRAM Reference
Registry and cached in your $HOME cache directory. This behaviour can
be controlled by setting REF_PATH and REF_CACHE environment variables
(see the samtools(1) man page for details)
* Numerous CRAM improvements:
- Support for CRAM v3.0, an upcoming revision to CRAM supporting
better compression and per-container checksums
- EOF checking for v2.1 and v3.0 (similar to checking BAM EOF blocks)
- Non-standard values for PNEXT and TLEN fields are now preserved
- hts_set_fai_filename() now provides a reference file when encoding
- Generated read names are now numbered from 1, rather than being
labelled 'slice:record-in-slice'
- Multi-threading and speed improvements
* New htsfile command for identifying file formats, and corresponding
file format detection APIs
* New tabix --regions FILE, --targets FILE options for filtering via BED files
* Optional iRODS file access, disabled by default. Configure with --with-irods
to enable accessing iRODS data objects directly via 'irods:DATAOBJ'
* All occurrences of 2^29 in the source have been eliminated, so indexing
and querying against reference sequences larger than 512Mbp works (when
using CSI indices)
* Support for plain GZIP compression in various places
* VCF header editing speed improvements
* Added seq_nt16_int[] (equivalent to the samtools API's bam_nt16_nt4_table)
* Reinstated faidx_fetch_nseq(), which was accidentally removed from 1.1.
Now faidx_fetch_nseq() and faidx_nseq() are equivalent; eventually
faidx_fetch_nseq() will be deprecated and removed [#156]
* Fixed bugs #141, #152, #155, #158, #159, and various memory leaks
htslib-1.2.1/README 0000664 0000000 0000000 00000000421 12464172677 0013675 0 ustar 00root root 0000000 0000000 HTSlib is an implementation of a unified C library for accessing common file
formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
data. It is the core library used by samtools and bcftools.
See INSTALL for building and installation instructions.
htslib-1.2.1/bgzf.c 0000664 0000000 0000000 00000104163 12464172677 0014121 0 ustar 00root root 0000000 0000000 /* The MIT License
Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
2011, 2012 Attractive Chaos
Copyright (C) 2009, 2013, 2014 Genome Research Ltd
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <sys/types.h>
#include <inttypes.h>
#include "htslib/hts.h"
#include "htslib/bgzf.h"
#include "htslib/hfile.h"
#define BGZF_CACHE
#define BGZF_MT
#define BLOCK_HEADER_LENGTH 18
#define BLOCK_FOOTER_LENGTH 8
/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
| 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
BGZF extension:
^ ^ ^ ^
| | | |
FLG.EXTRA XLEN B C
BGZF format is compatible with GZIP. It limits the size of each compressed
block to 2^16 bytes and adds an extra "BC" field in the gzip header which
records the size.
*/
// Template for the 18-byte BGZF block header; the array is 19 bytes because
// the string literal's terminating NUL is stored too.  Byte values, in order:
// 31,139 (gzip ID), 8 (deflate), 4 (FLG.FEXTRA), four zero MTIME bytes,
// 0 (XFL), 255 (OS), 6,0 (XLEN), 'B','C' (subfield ID), 2,0 (subfield length)
// and two zero bytes.  bgzf_compress() copies BLOCK_HEADER_LENGTH bytes of
// this and then overwrites bytes 16-17 with the block length.
static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
#ifdef BGZF_CACHE
// Value type stored in the "cache" hash map instantiated below.
// NOTE(review): the code that fills and reads these entries is not in view
// here; the field meanings given below are inferred from the names only.
typedef struct {
int size;            // presumably the number of valid bytes in 'block' -- confirm
uint8_t *block;      // presumably a buffer holding one cached block -- confirm
int64_t end_offset;  // presumably the file offset just past the block -- confirm
} cache_t;
#include "htslib/khash.h"
// Instantiate a khash map named "cache" with 64-bit integer keys and cache_t values.
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif
// One index entry: a pair of offsets, one in the uncompressed data and one in
// the compressed data.
typedef struct
{
uint64_t uaddr; // offset w.r.t. uncompressed data
uint64_t caddr; // offset w.r.t. compressed data
}
bgzidx1_t;
// Index state: a growable array of bgzidx1_t entries plus the uncompressed
// offset of the current block.
struct __bgzidx_t
{
int noffs, moffs; // the size of the index, n:used, m:allocated
bgzidx1_t *offs; // offsets
uint64_t ublock_addr; // offset of the current block (uncompressed data)
};
void bgzf_index_destroy(BGZF *fp);
int bgzf_index_add_block(BGZF *fp);
/* Write a 16-bit value to buffer[0..1], least significant byte first. */
static inline void packInt16(uint8_t *buffer, uint16_t value)
{
    const uint8_t low_byte  = (uint8_t)(value & 0xffu);
    const uint8_t high_byte = (uint8_t)((value >> 8) & 0xffu);

    buffer[0] = low_byte;
    buffer[1] = high_byte;
}
/* Read a little-endian 16-bit value from buffer[0..1]; the result is 0..65535. */
static inline int unpackInt16(const uint8_t *buffer)
{
    const int low_byte  = buffer[0];
    const int high_byte = buffer[1];

    return (high_byte << 8) | low_byte;
}
/* Write a 32-bit value to buffer[0..3], least significant byte first. */
static inline void packInt32(uint8_t *buffer, uint32_t value)
{
    int i;

    for (i = 0; i < 4; ++i) {
        buffer[i] = (uint8_t)(value & 0xffu);
        value >>= 8;
    }
}
/*
 * Allocate and initialise a BGZF handle for reading.
 *
 * Up to 18 bytes are requested from hfpr via hpeek() to classify the input:
 *   - is_compressed is set when all 18 bytes were available and the first
 *     two are the gzip magic (0x1f 0x8b);
 *   - is_gzip is set when the input is compressed but does NOT carry the
 *     BGZF "BC" extra subfield (FLG.FEXTRA set and "BC\2\0" at offset 12).
 *
 * Returns the new handle, or NULL if hpeek() fails or memory cannot be
 * allocated (nothing is leaked in that case).  fp->fp is left for the caller
 * to set.
 */
static BGZF *bgzf_read_init(hFILE *hfpr)
{
    BGZF *fp;
    uint8_t magic[18];
    ssize_t n = hpeek(hfpr, magic, 18);
    if (n < 0) return NULL;

    fp = (BGZF*)calloc(1, sizeof(BGZF));
    if (fp == NULL) return NULL;

    fp->is_write = 0;
    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
    if (fp->uncompressed_block == NULL || fp->compressed_block == NULL) goto fail;

    fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;
    fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
#ifdef BGZF_CACHE
    fp->cache = kh_init(cache);
    if (fp->cache == NULL) goto fail;
#endif
    return fp;

fail:
    // free(NULL) is a no-op, so a partially-built handle is handled uniformly
    free(fp->compressed_block);
    free(fp->uncompressed_block);
    free(fp);
    return NULL;
}
// Derive the compression level from an fopen-style mode string.
// The first decimal digit found gives the level (0-9); with no digit the
// result is -1 (default level).  A 'u' anywhere in the string overrides
// everything and yields -2 (plain uncompressed output).
static int mode2level(const char *__restrict mode)
{
    int level = -1;
    const char *p;

    for (p = mode; *p != '\0'; ++p) {
        if (*p >= '0' && *p <= '9') {
            level = (int)*p - '0';
            break;
        }
    }
    if (strchr(mode, 'u') != NULL) level = -2;
    return level;
}
/*
 * Allocate and initialise a BGZF handle for writing, configured from the
 * mode string:
 *   - 'u'      : plain uncompressed output; no block buffers are allocated;
 *   - a digit  : compression level (see mode2level()); otherwise the zlib
 *                default level is used;
 *   - 'g'      : gzip output through a persistent z_stream (fp->gz_stream).
 *
 * Returns the new handle, or NULL if memory cannot be allocated or zlib
 * initialisation fails.  Everything allocated so far is released on failure.
 * fp->fp is left for the caller to set.
 */
static BGZF *bgzf_write_init(const char *mode)
{
    BGZF *fp;
    int compress_level;

    fp = (BGZF*)calloc(1, sizeof(BGZF));
    if (fp == NULL) return NULL;
    fp->is_write = 1;
    compress_level = mode2level(mode);
    if ( compress_level==-2 )
    {
        fp->is_compressed = 0;
        return fp;
    }
    fp->is_compressed = 1;
    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
    if (fp->uncompressed_block == NULL || fp->compressed_block == NULL) goto fail;
    fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
    if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
    if ( strchr(mode,'g') )
    {
        // gzip output
        fp->is_gzip = 1;
        fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));
        if (fp->gz_stream == NULL) goto fail;
        fp->gz_stream->zalloc = NULL;
        fp->gz_stream->zfree = NULL;
        // windowBits 15|16 asks zlib to emit a gzip wrapper
        if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) goto fail;
    }
    return fp;

fail:
    // deflateInit2 leaves no state allocated when it fails, so plain free()
    // of the stream struct is sufficient; free(NULL) is a no-op
    free(fp->gz_stream);
    free(fp->compressed_block);
    free(fp->uncompressed_block);
    free(fp);
    return NULL;
}
/*
 * Open the file at path and wrap it in a BGZF handle.
 *
 * mode containing 'r' opens for reading; otherwise 'w' or 'a' opens for
 * writing (further mode characters are interpreted by bgzf_write_init()).
 * Any other mode fails with errno set to EINVAL.
 *
 * Returns the handle, or NULL on failure.  If the underlying file was opened
 * but the BGZF state could not be set up, the file is closed again.
 */
BGZF *bgzf_open(const char *path, const char *mode)
{
    BGZF *fp = NULL;
    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
    if (strchr(mode, 'r')) {
        hFILE *fpr;
        if ((fpr = hopen(path, mode)) == NULL) return NULL;
        fp = bgzf_read_init(fpr);
        if (fp == NULL) { hclose_abruptly(fpr); return NULL; }
        fp->fp = fpr;
    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
        hFILE *fpw;
        if ((fpw = hopen(path, mode)) == NULL) return NULL;
        fp = bgzf_write_init(mode);
        // bgzf_write_init() can fail; do not dereference NULL or leak fpw
        if (fp == NULL) { hclose_abruptly(fpw); return NULL; }
        fp->fp = fpw;
    }
    else { errno = EINVAL; return NULL; }

    fp->is_be = ed_is_big();
    return fp;
}
/*
 * Wrap an already-open file descriptor in a BGZF handle.
 *
 * mode containing 'r' opens for reading; otherwise 'w' or 'a' opens for
 * writing (further mode characters are interpreted by bgzf_write_init()).
 * Any other mode fails with errno set to EINVAL.
 *
 * Returns the handle, or NULL on failure.  Note that when the hFILE was
 * created but the BGZF state could not be set up, the hFILE is closed
 * abruptly, which also closes fd.
 */
BGZF *bgzf_dopen(int fd, const char *mode)
{
    BGZF *fp = NULL;
    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
    if (strchr(mode, 'r')) {
        hFILE *fpr;
        if ((fpr = hdopen(fd, mode)) == NULL) return NULL;
        fp = bgzf_read_init(fpr);
        if (fp == NULL) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd
        fp->fp = fpr;
    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
        hFILE *fpw;
        if ((fpw = hdopen(fd, mode)) == NULL) return NULL;
        fp = bgzf_write_init(mode);
        // bgzf_write_init() can fail; do not dereference NULL or leak fpw
        if (fp == NULL) { hclose_abruptly(fpw); return NULL; } // FIXME this closes fd
        fp->fp = fpw;
    }
    else { errno = EINVAL; return NULL; }

    fp->is_be = ed_is_big();
    return fp;
}
/*
 * Wrap an existing hFILE in a BGZF handle.
 *
 * mode containing 'r' sets up reading; otherwise 'w' or 'a' sets up writing
 * (further mode characters are interpreted by bgzf_write_init()).  Any other
 * mode fails with errno set to EINVAL.
 *
 * Returns the handle, or NULL on failure; hfp is not closed on failure.
 */
BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
{
    BGZF *fp = NULL;
    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
    if (strchr(mode, 'r')) {
        fp = bgzf_read_init(hfp);
        if (fp == NULL) return NULL;
    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
        fp = bgzf_write_init(mode);
        // bgzf_write_init() can fail; do not dereference NULL below
        if (fp == NULL) return NULL;
    }
    else { errno = EINVAL; return NULL; }

    fp->fp = hfp;
    fp->is_be = ed_is_big();
    return fp;
}
static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
{
uint32_t crc;
z_stream zs;
uint8_t *dst = (uint8_t*)_dst;
// compress the body
zs.zalloc = NULL; zs.zfree = NULL;
zs.next_in = (Bytef*)src;
zs.avail_in = slen;
zs.next_out = dst + BLOCK_HEADER_LENGTH;
zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1;
if (deflateEnd(&zs) != Z_OK) return -1;
*dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
// write the header
memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
// write the footer
crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
packInt32((uint8_t*)&dst[*dlen - 8], crc);
packInt32((uint8_t*)&dst[*dlen - 4], slen);
return 0;
}
/*
 * Feed `slen` bytes from `src` to the handle's persistent gzip deflate
 * stream, writing whatever output is produced to `_dst`.
 *
 * On entry *dlen is the capacity of `_dst`; on return it holds the number of
 * bytes actually produced.  A zero-length input finishes the stream
 * (Z_FINISH); otherwise Z_NO_FLUSH is used.  `level` is unused here.
 *
 * Returns 0 on success, -1 if deflate() reports Z_STREAM_ERROR.
 */
static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int slen, int level)
{
    z_stream *strm = fp->gz_stream;
    uint8_t *out = (uint8_t*)_dst;
    int mode;

    if (slen)
        mode = Z_NO_FLUSH;
    else
        mode = Z_FINISH;

    strm->next_in = (Bytef*)src;
    strm->avail_in = slen;
    strm->next_out = out;
    strm->avail_out = *dlen;

    if ( deflate(strm, mode) == Z_STREAM_ERROR )
        return -1;

    *dlen -= strm->avail_out;
    return 0;
}
// Compress the first block_length bytes of fp->uncompressed_block into
// fp->compressed_block, using the BGZF or the plain gzip encoder depending on
// fp->is_gzip.  On success the write offset is reset to 0 and the compressed
// size is returned; on failure BGZF_ERR_ZLIB is recorded and -1 is returned.
static int deflate_block(BGZF *fp, int block_length)
{
    int out_len = BGZF_MAX_BLOCK_SIZE;
    int status;

    if ( fp->is_gzip )
        status = bgzf_gzip_compress(fp, fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);
    else
        status = bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level);

    if ( status == 0 )
    {
        fp->block_offset = 0;
        return out_len;
    }
    fp->errcode |= BGZF_ERR_ZLIB;
    return -1;
}
// Decompress one BGZF block held in fp->compressed_block into
// fp->uncompressed_block.  The deflate payload is taken to start 18 bytes into
// the block.  Returns the number of uncompressed bytes, or -1 (with
// BGZF_ERR_ZLIB recorded) if any zlib call fails.
static int inflate_block(BGZF* fp, int block_length)
{
    z_stream strm;

    strm.zalloc = NULL;
    strm.zfree = NULL;
    strm.next_in = (Bytef*)fp->compressed_block + 18;
    strm.avail_in = block_length - 16;
    strm.next_out = (Bytef*)fp->uncompressed_block;
    strm.avail_out = BGZF_MAX_BLOCK_SIZE;

    // raw deflate stream: negative window bits
    if (inflateInit2(&strm, -15) != Z_OK)
        goto zlib_failure;

    if (inflate(&strm, Z_FINISH) != Z_STREAM_END) {
        inflateEnd(&strm);
        goto zlib_failure;
    }

    if (inflateEnd(&strm) != Z_OK)
        goto zlib_failure;

    return strm.total_out;

zlib_failure:
    fp->errcode |= BGZF_ERR_ZLIB;
    return -1;
}
/*
 * Inflate more data from a plain (non-BGZF) gzip stream into
 * fp->uncompressed_block, starting at fp->block_offset.
 *
 * If `cached` is non-zero the input already sitting in fp->gz_stream
 * (next_in/avail_in) is consumed first; otherwise, when the previous call
 * left output space unused, a fresh chunk of up to BGZF_BLOCK_SIZE bytes is
 * read from the file.
 *
 * Returns the number of bytes now available in the output buffer, 0 at end
 * of file, or -1 on a read or zlib error.
 */
static int inflate_gzip_block(BGZF *fp, int cached)
{
    int ret = Z_OK;
    do
    {
        if ( !cached && fp->gz_stream->avail_out!=0 )
        {
            // avail_in is unsigned, so the hread() result must be checked in
            // a signed variable before it is stored there; otherwise a read
            // error (-1) would look like a huge amount of input.
            ssize_t nread = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE);
            if ( nread<0 ) return -1;
            fp->gz_stream->avail_in = nread;
            if ( nread==0 ) return 0;   // end of file
            fp->gz_stream->next_in = fp->compressed_block;
        }
        else cached = 0;
        do
        {
            fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset;
            fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset;
            ret = inflate(fp->gz_stream, Z_NO_FLUSH);
            if ( ret==Z_BUF_ERROR ) continue; // non-critical error
            if ( ret<0 ) return -1;
            unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
            if ( have ) return have;
        }
        while ( fp->gz_stream->avail_out == 0 );
    }
    while (ret != Z_STREAM_END);
    return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
}
// Classify the first 16 bytes of a block header.
// Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error
static int check_header(const uint8_t *header)
{
    // The first three bytes must be 31, 139, 8 for any header we accept.
    if ( header[0] != 31 ) return -2;
    if ( header[1] != 139 ) return -2;
    if ( header[2] != 8 ) return -2;

    // A BGZF header additionally has flag bit 4 set and a 6-byte extra field
    // holding a 'B','C' subfield of length 2.
    if ( (header[3] & 4) == 0 ) return -1;
    if ( unpackInt16((uint8_t*)&header[10]) != 6 ) return -1;
    if ( header[12] != 'B' || header[13] != 'C' ) return -1;
    if ( unpackInt16((uint8_t*)&header[14]) != 2 ) return -1;
    return 0;
}
#ifdef BGZF_CACHE
/*
 * Release every cached block and the cache hash table itself.
 * Does nothing for handles opened for writing.
 */
static void free_cache(BGZF *fp)
{
    khash_t(cache) *tbl = (khash_t(cache)*)fp->cache;
    khint_t it;

    if (fp->is_write) return;

    for (it = kh_begin(tbl); it < kh_end(tbl); ++it) {
        if (!kh_exist(tbl, it)) continue;
        free(kh_val(tbl, it).block);
    }
    kh_destroy(cache, tbl);
}
/*
 * Try to satisfy a block read from the cache.
 *
 * Looks up `block_address`; on a hit the cached data is copied into
 * fp->uncompressed_block, the handle's block fields are updated, and the
 * file position is moved to the end of that block.
 *
 * Returns the cached block's size on a hit and 0 on a miss.  A failed seek
 * prints a message and terminates the process.
 */
static int load_block_from_cache(BGZF *fp, int64_t block_address)
{
    khash_t(cache) *tbl = (khash_t(cache)*)fp->cache;
    khint_t it = kh_get(cache, tbl, block_address);
    cache_t *entry;

    if (it == kh_end(tbl)) return 0;   // not cached

    entry = &kh_val(tbl, it);
    if (fp->block_length != 0) fp->block_offset = 0;
    fp->block_address = block_address;
    fp->block_length = entry->size;
    memcpy(fp->uncompressed_block, entry->block, BGZF_MAX_BLOCK_SIZE);

    if ( hseek(fp->fp, entry->end_offset, SEEK_SET) < 0 )
    {
        // todo: move the error up
        fprintf(stderr,"Could not hseek to %"PRId64"\n", entry->end_offset);
        exit(1);
    }
    return entry->size;
}
/*
 * Store a copy of the current uncompressed block in the cache, keyed by
 * fp->block_address.  `size` is the on-disk size of the block and is used to
 * remember where the next block starts.
 *
 * Caching is best-effort: if the cache is too small to hold even one block,
 * if memory cannot be allocated, or if the hash insertion fails, the function
 * returns without caching and leaves the existing cache intact.
 */
static void cache_block(BGZF *fp, int size)
{
    int ret;
    khint_t k;
    cache_t *p;
    uint8_t *copy;
    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;

    // Allocate first so that a failure cannot leave a hash entry whose
    // block pointer is NULL.
    copy = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
    if (copy == NULL) return;

    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
        /* A better way would be to remove the oldest block in the
         * cache, but here we remove a random one for simplicity. This
         * should not have a big impact on performance. */
        for (k = kh_begin(h); k < kh_end(h); ++k)
            if (kh_exist(h, k)) break;
        if (k < kh_end(h)) {
            free(kh_val(h, k).block);
            kh_del(cache, h, k);
        }
    }
    k = kh_put(cache, h, fp->block_address, &ret);
    if (ret <= 0) { // 0: key already present (a bug); negative: insertion failed
        free(copy);
        return;
    }
    p = &kh_val(h, k);
    p->size = fp->block_length;
    p->end_offset = fp->block_address + size;
    p->block = copy;
    memcpy(p->block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
}
#else
// No-op replacements used when the block cache is compiled out.
static void free_cache(BGZF *fp)
{
}
// Always reports a cache miss.
static int load_block_from_cache(BGZF *fp, int64_t block_address)
{
    return 0;
}
static void cache_block(BGZF *fp, int size)
{
}
#endif
/*
 * Load the next block of data into fp->uncompressed_block.
 *
 * Three input kinds are handled:
 *   - uncompressed files: up to BGZF_MAX_BLOCK_SIZE raw bytes are read;
 *   - plain gzip streams: data is inflated through fp->gz_stream (the stream
 *     is set up here the first time a non-BGZF gzip header is seen);
 *   - BGZF: one block is read (or fetched from the cache), inflated, and
 *     optionally added to the on-the-fly index and the cache.
 *
 * On success fp->block_length holds the number of bytes available (0 at end
 * of file) and 0 is returned.  On failure -1 is returned and, in most cases,
 * a BGZF_ERR_* bit is set in fp->errcode.
 */
int bgzf_read_block(BGZF *fp)
{
    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
    int count, size = 0, block_length, remaining;

    // Reading an uncompressed file
    if ( !fp->is_compressed )
    {
        count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
        if ( count<0 ) // read error: do not treat it as data
        {
            fp->errcode |= BGZF_ERR_IO;
            return -1;
        }
        if ( count==0 )
        {
            fp->block_length = 0;
            return 0;
        }
        if (fp->block_length != 0) fp->block_offset = 0;
        fp->block_address += count;
        fp->block_length = count;
        return 0;
    }

    // Reading compressed file
    int64_t block_address;
    block_address = htell(fp->fp);
    if ( fp->is_gzip && fp->gz_stream ) // is this is a initialized gzip stream?
    {
        count = inflate_gzip_block(fp, 0);
        if ( count<0 )
        {
            fp->errcode |= BGZF_ERR_ZLIB;
            return -1;
        }
        fp->block_length = count;
        fp->block_address = block_address;
        return 0;
    }
    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
    count = hread(fp->fp, header, sizeof(header));
    if (count == 0) { // no data read
        fp->block_length = 0;
        return 0;
    }
    int ret;
    if ( count != sizeof(header) || (ret=check_header(header))==-2 )
    {
        fp->errcode |= BGZF_ERR_HEADER;
        return -1;
    }
    if ( ret==-1 )
    {
        // GZIP, not BGZF
        uint8_t *cblock = (uint8_t*)fp->compressed_block;
        memcpy(cblock, header, sizeof(header));
        ssize_t more = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header));
        if ( more<0 )
        {
            fp->errcode |= BGZF_ERR_IO;
            return -1;
        }
        count = more + sizeof(header);
        int nskip = 10;

        // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA
        // Note: Some of these fields are untested, I did not have appropriate data available
        if ( header[3] & 0x4 ) // FLG.FEXTRA
        {
            nskip += unpackInt16(&cblock[nskip]) + 2;
        }
        if ( header[3] & 0x8 ) // FLG.FNAME
        {
            // skip the NUL-terminated name, staying inside the bytes read
            while ( nskip<count && cblock[nskip] ) nskip++;
            if ( nskip>=count )
            {
                fp->errcode |= BGZF_ERR_HEADER;
                return -1;
            }
            nskip++;
        }
        if ( header[3] & 0x10 ) // FLG.FCOMMENT
        {
            // skip the NUL-terminated comment, staying inside the bytes read
            while ( nskip<count && cblock[nskip] ) nskip++;
            if ( nskip>=count )
            {
                fp->errcode |= BGZF_ERR_HEADER;
                return -1;
            }
            nskip++;
        }
        if ( header[3] & 0x2 ) nskip += 2; // FLG.FHCRC
        if ( nskip>count ) // header claims more bytes than were read
        {
            fp->errcode |= BGZF_ERR_HEADER;
            return -1;
        }

        z_stream *gzs = (z_stream*) calloc(1,sizeof(z_stream));
        if ( gzs==NULL )
        {
            fp->errcode |= BGZF_ERR_ZLIB;
            return -1;
        }
        fp->is_gzip = 1;
        fp->gz_stream = gzs;
        int zret = inflateInit2(fp->gz_stream, -15);
        if (zret != Z_OK)
        {
            fp->errcode |= BGZF_ERR_ZLIB;
            return -1;
        }
        fp->gz_stream->avail_in = count - nskip;
        fp->gz_stream->next_in = cblock + nskip;
        count = inflate_gzip_block(fp, 1);
        if ( count<0 )
        {
            fp->errcode |= BGZF_ERR_ZLIB;
            return -1;
        }
        fp->block_length = count;
        fp->block_address = block_address;
        if ( fp->idx_build_otf ) return -1; // cannot build index for gzip
        return 0;
    }
    size = count;
    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
    if ( block_length < BLOCK_HEADER_LENGTH )
    {
        // A block cannot be shorter than its own header.  Without this check
        // `remaining` would be negative and become a huge read length.
        fp->errcode |= BGZF_ERR_HEADER;
        return -1;
    }
    compressed_block = (uint8_t*)fp->compressed_block;
    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
    remaining = block_length - BLOCK_HEADER_LENGTH;
    count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
    if (count != remaining) {
        fp->errcode |= BGZF_ERR_IO;
        return -1;
    }
    size += count;
    if ((count = inflate_block(fp, block_length)) < 0) return -1;
    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
    fp->block_address = block_address;
    fp->block_length = count;
    if ( fp->idx_build_otf )
    {
        if ( bgzf_index_add_block(fp) < 0 ) return -1;
        fp->idx->ublock_addr += count;
    }
    cache_block(fp, size);
    return 0;
}
/*
 * Read up to `length` uncompressed bytes into `data`.
 *
 * Blocks are loaded on demand with bgzf_read_block().  Returns the number of
 * bytes copied (which is less than `length` only at end of file), or -1 if
 * loading a block fails.  The handle must have been opened for reading.
 */
ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
{
    uint8_t *dst = (uint8_t*)data;
    ssize_t total = 0;

    if (length <= 0) return 0;
    assert(fp->is_write == 0);

    while (total < length) {
        int avail = fp->block_length - fp->block_offset;
        int chunk;

        if (avail <= 0) {
            // current block exhausted: fetch the next one
            if (bgzf_read_block(fp) != 0) return -1;
            avail = fp->block_length - fp->block_offset;
            if (avail <= 0) break;   // end of file
        }

        chunk = avail;
        if (length - total < avail) chunk = length - total;

        memcpy(dst, (uint8_t*)fp->uncompressed_block + fp->block_offset, chunk);
        fp->block_offset += chunk;
        dst += chunk;
        total += chunk;
    }

    if (fp->block_offset == fp->block_length) {
        // block fully consumed: point at the start of the next one
        fp->block_address = htell(fp->fp);
        fp->block_offset = fp->block_length = 0;
    }
    fp->uncompressed_address += total;
    return total;
}
/* Read `length` bytes straight from the underlying file, bypassing any
 * decompression.  Returns whatever hread() returns. */
ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
{
    ssize_t got = hread(fp->fp, data, length);
    return got;
}
#ifdef BGZF_MT
/* Per-thread state for multi-threaded compression. */
typedef struct {
    struct bgzf_mtaux_t *mt;   /* shared pool this worker belongs to */
    void *buf;                 /* scratch buffer receiving compressed output */
    int i;                     /* worker index; also the first block it handles */
    int errcode;               /* BGZF_ERR_* bits raised during the last batch */
    int toproc;                /* non-zero when a batch is waiting to be processed */
    int compress_level;        /* level passed to bgzf_compress() */
} worker_t;
/* State shared by all compression workers of one BGZF handle. */
typedef struct bgzf_mtaux_t {
    int n_threads;             /* number of workers, including worker 0 */
    int n_blks;                /* capacity of blk[] and len[] */
    int curr;                  /* number of blocks currently queued */
    int done;                  /* set to 1 to ask the workers to exit */
    volatile int proc_cnt;     /* workers that have finished the current batch */
    void **blk;                /* queued block buffers */
    int *len;                  /* length of the data in each queued block */
    worker_t *w;               /* per-worker state, n_threads entries */
    pthread_t *tid;            /* thread ids; entry 0 is unused */
    pthread_mutex_t lock;      /* protects toproc/done signalling */
    pthread_cond_t cv;         /* workers wait here for work or shutdown */
} mtaux_t;
/*
 * Run one batch for worker `w`: wait until either work has been posted or
 * shutdown was requested, then compress every n_threads-th queued block
 * starting at index w->i, replacing each block with its compressed form.
 *
 * Returns 1 when the worker should quit and 0 after a batch was processed.
 */
static int worker_aux(worker_t *w)
{
    mtaux_t *pool = w->mt;
    int quit, idx;

    // wait for condition: to process or all done
    pthread_mutex_lock(&pool->lock);
    while (!(w->toproc || pool->done))
        pthread_cond_wait(&pool->cv, &pool->lock);
    quit = pool->done ? 1 : 0;
    w->toproc = 0;
    pthread_mutex_unlock(&pool->lock);
    if (quit) return 1; // to quit the thread

    w->errcode = 0;
    idx = w->i;
    while (idx < pool->curr) {
        int packed = BGZF_MAX_BLOCK_SIZE;
        int rc = bgzf_compress(w->buf, &packed, pool->blk[idx], pool->len[idx], w->compress_level);
        if (rc != 0)
            w->errcode |= BGZF_ERR_ZLIB;
        memcpy(pool->blk[idx], w->buf, packed);
        pool->len[idx] = packed;
        idx += pool->n_threads;
    }
    // tell the master this worker has finished its share
    __sync_fetch_and_add(&pool->proc_cnt, 1);
    return 0;
}
static void *mt_worker(void *data)
{
while (worker_aux((worker_t*)data) == 0);
return 0;
}
/*
 * Enable multi-threaded compression on a handle opened for writing.
 *
 * `n_threads` workers are used (worker 0 runs on the calling thread, the
 * others on newly created threads) and up to n_threads * n_sub_blks blocks
 * are queued before a flush.
 *
 * Returns 0 on success.  Returns -1, leaving the handle single-threaded, if
 * the handle is not a writer, threading is already enabled, n_threads <= 1,
 * n_sub_blks < 1, memory cannot be allocated, or a thread cannot be started.
 */
int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
{
    int i, j;
    mtaux_t *mt;
    pthread_attr_t attr;
    if (!fp->is_write || fp->mt || n_threads <= 1) return -1;
    if (n_sub_blks < 1) return -1; // a zero-sized queue could never accept a block
    mt = (mtaux_t*)calloc(1, sizeof(mtaux_t));
    if (mt == NULL) return -1;
    mt->n_threads = n_threads;
    mt->n_blks = n_threads * n_sub_blks;
    mt->len = (int*)calloc(mt->n_blks, sizeof(int));
    mt->blk = (void**)calloc(mt->n_blks, sizeof(void*));
    mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
    mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t));
    if (mt->len == NULL || mt->blk == NULL || mt->tid == NULL || mt->w == NULL)
        goto fail;
    for (i = 0; i < mt->n_blks; ++i) {
        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
        if (mt->blk[i] == NULL) goto fail;
    }
    for (i = 0; i < mt->n_threads; ++i) {
        mt->w[i].i = i;
        mt->w[i].mt = mt;
        mt->w[i].compress_level = fp->compress_level;
        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
        if (mt->w[i].buf == NULL) goto fail;
    }
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    pthread_mutex_init(&mt->lock, 0);
    pthread_cond_init(&mt->cv, 0);
    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
        if (pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]) != 0) break;
    pthread_attr_destroy(&attr);
    if (i < mt->n_threads) {
        // A thread could not be started: stop the ones already running,
        // otherwise a later flush would wait for workers that do not exist.
        pthread_mutex_lock(&mt->lock);
        mt->done = 1;
        pthread_cond_broadcast(&mt->cv);
        pthread_mutex_unlock(&mt->lock);
        for (j = 1; j < i; ++j) pthread_join(mt->tid[j], 0);
        pthread_cond_destroy(&mt->cv);
        pthread_mutex_destroy(&mt->lock);
        goto fail;
    }
    fp->mt = mt;
    return 0;

fail:
    // calloc() zeroed the arrays, so unset entries are NULL and safe to free
    if (mt->blk)
        for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
    if (mt->w)
        for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
    free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
    free(mt);
    return -1;
}
/*
 * Shut down the worker threads and release everything owned by `mt`,
 * including `mt` itself.
 */
static void mt_destroy(mtaux_t *mt)
{
    int t, b;

    // signal all workers to quit
    pthread_mutex_lock(&mt->lock);
    mt->done = 1;
    mt->proc_cnt = 0;
    pthread_cond_broadcast(&mt->cv);
    pthread_mutex_unlock(&mt->lock);

    // worker 0 is effectively launched by the master thread, so start at 1
    t = 1;
    while (t < mt->n_threads) {
        pthread_join(mt->tid[t], 0);
        ++t;
    }

    // free other data allocated on heap
    for (b = 0; b < mt->n_blks; ++b)
        free(mt->blk[b]);
    for (t = 0; t < mt->n_threads; ++t)
        free(mt->w[t].buf);
    free(mt->blk);
    free(mt->len);
    free(mt->w);
    free(mt->tid);

    pthread_cond_destroy(&mt->cv);
    pthread_mutex_destroy(&mt->lock);
    free(mt);
}
/*
 * Move the pending uncompressed data of `fp` into the next free slot of the
 * worker queue and reset the write offset.  The caller must ensure a slot is
 * free.
 */
static void mt_queue(BGZF *fp)
{
    mtaux_t *pool = fp->mt;
    int slot;

    assert(pool->curr < pool->n_blks); // guaranteed by the caller
    slot = pool->curr;
    memcpy(pool->blk[slot], fp->uncompressed_block, fp->block_offset);
    pool->len[slot] = fp->block_offset;
    fp->block_offset = 0;
    pool->curr = slot + 1;
}
/*
 * Compress all queued blocks using every worker (the calling thread acts as
 * worker 0), then write the results to the file in queue order.
 *
 * Returns 0 if fp->errcode is clear afterwards, -1 otherwise.
 */
static int mt_flush_queue(BGZF *fp)
{
    mtaux_t *pool = fp->mt;
    int n;

    // signal all the workers to compress
    pthread_mutex_lock(&pool->lock);
    for (n = 0; n < pool->n_threads; ++n)
        pool->w[n].toproc = 1;
    pool->proc_cnt = 0;
    pthread_cond_broadcast(&pool->cv);
    pthread_mutex_unlock(&pool->lock);

    // worker 0 is doing things here
    worker_aux(&pool->w[0]);

    // wait for all the threads to complete
    while (pool->proc_cnt < pool->n_threads) { }

    // collect worker errors, then dump data to disk
    for (n = 0; n < pool->n_threads; ++n)
        fp->errcode |= pool->w[n].errcode;
    n = 0;
    while (n < pool->curr) {
        if (hwrite(fp->fp, pool->blk[n], pool->len[n]) != pool->len[n]) {
            fp->errcode |= BGZF_ERR_IO;
            break;
        }
        ++n;
    }
    pool->curr = 0;

    if (fp->errcode != 0) return -1;
    return 0;
}
/*
 * Flush the current block.  Without threads this is a plain bgzf_flush();
 * with threads the block is only queued, and the queue is compressed and
 * written once it is full.
 */
static int lazy_flush(BGZF *fp)
{
    if (!fp->mt)
        return bgzf_flush(fp);

    if (fp->block_offset) mt_queue(fp);
    if (fp->mt->curr >= fp->mt->n_blks)
        return mt_flush_queue(fp);
    return 0;
}
#else // ~ #ifdef BGZF_MT
/* Threading support is compiled out: accept the request and do nothing. */
int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
{
    (void)fp;
    (void)n_threads;
    (void)n_sub_blks;
    return 0;
}
/* Without threading there is nothing to defer: flush immediately. */
static inline int lazy_flush(BGZF *fp)
{
    int rc = bgzf_flush(fp);
    return rc;
}
#endif // ~ #ifdef BGZF_MT
/*
 * Compress and write out any data buffered in the handle.
 *
 * A no-op (returning 0) for handles opened for reading.  With threading
 * enabled, the pending block is queued and the whole queue is flushed.
 * Otherwise the pending block is recorded in the on-the-fly index (if one is
 * being built), compressed, and written.
 *
 * Returns 0 on success and -1 on failure.
 */
int bgzf_flush(BGZF *fp)
{
    if (!fp->is_write) return 0;
#ifdef BGZF_MT
    if (fp->mt) {
        if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
        return mt_flush_queue(fp);
    }
#endif
    while (fp->block_offset > 0) {
        if ( fp->idx_build_otf )
        {
            // do not carry on with an index that could not be extended
            if (bgzf_index_add_block(fp) < 0) return -1;
            fp->idx->ublock_addr += fp->block_offset;
        }
        int block_length = deflate_block(fp, fp->block_offset);
        if (block_length < 0) return -1;
        if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length) {
            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
            return -1;
        }
        fp->block_address += block_length;
    }
    return 0;
}
/*
 * Flush only if appending `size` more bytes would overflow the current
 * block.  Returns 0 when no flush was needed, otherwise the flush result.
 */
int bgzf_flush_try(BGZF *fp, ssize_t size)
{
    if (fp->block_offset + size <= BGZF_BLOCK_SIZE)
        return 0;
    return lazy_flush(fp);
}
/*
 * Write `length` bytes from `data` to the stream.
 *
 * For uncompressed handles the data goes straight to hwrite().  Otherwise it
 * is appended to the current block, flushing each time the block reaches
 * BGZF_BLOCK_SIZE bytes.
 *
 * Returns the number of bytes accepted, or -1 if a flush fails.
 */
ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
{
    if ( !fp->is_compressed )
        return hwrite(fp->fp, data, length);

    const uint8_t *src = (const uint8_t*)data;
    ssize_t left = length;
    assert(fp->is_write);

    for (; left > 0; ) {
        uint8_t *block = (uint8_t*)fp->uncompressed_block;
        int n = BGZF_BLOCK_SIZE - fp->block_offset;   // room left in the block
        if (n > left) n = left;

        memcpy(block + fp->block_offset, src, n);
        fp->block_offset += n;
        src += n;
        left -= n;

        if (fp->block_offset == BGZF_BLOCK_SIZE && lazy_flush(fp) != 0)
            return -1;
    }
    return length - left;
}
/* Write `length` bytes straight to the underlying file, bypassing any
 * compression.  Returns whatever hwrite() returns. */
ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
{
    ssize_t put = hwrite(fp->fp, data, length);
    return put;
}
/*
 * Flush (for compressed writers), terminate the stream with an empty block,
 * close the underlying file and free the handle.
 *
 * Returns 0 on success.  Returns -1 if `fp` is NULL or if flushing, writing
 * the final block, or closing the file fails; in those cases the handle
 * itself is NOT freed, so fp->errcode can still be inspected by the caller.
 */
int bgzf_close(BGZF* fp)
{
    int ret, block_length;
    if (fp == 0) return -1;
    if (fp->is_write && fp->is_compressed) {
        if (bgzf_flush(fp) != 0) return -1;
        fp->compress_level = -1;
        block_length = deflate_block(fp, 0); // write an empty block
        // deflate_block() has already recorded the error; a negative length
        // must never reach hwrite(), where it would become a huge size.
        if (block_length < 0) return -1;
        if (hwrite(fp->fp, fp->compressed_block, block_length) < 0
            || hflush(fp->fp) != 0) {
            fp->errcode |= BGZF_ERR_IO;
            return -1;
        }
#ifdef BGZF_MT
        if (fp->mt) {
            mt_destroy(fp->mt);
            fp->mt = NULL; // the handle survives if hclose() fails below
        }
#endif
    }
    if ( fp->is_gzip )
    {
        if (!fp->is_write) (void)inflateEnd(fp->gz_stream);
        else (void)deflateEnd(fp->gz_stream);
        free(fp->gz_stream);
        fp->gz_stream = NULL; // the handle survives if hclose() fails below
    }
    ret = hclose(fp->fp);
    if (ret != 0) return -1;
    bgzf_index_destroy(fp);
    free(fp->uncompressed_block);
    free(fp->compressed_block);
    free_cache(fp);
    free(fp);
    return 0;
}
/* Set the block cache budget of `fp`; a NULL handle is ignored. */
void bgzf_set_cache_size(BGZF *fp, int cache_size)
{
    if (!fp) return;
    fp->cache_size = cache_size;
}
/*
 * Check whether the file ends with the 28-byte empty-block marker.
 *
 * Returns 1 if the marker is present, 0 if it is not, 2 if the file cannot
 * be seeked (ESPIPE), and -1 on any other seek or read failure.  The file
 * position is restored before a 0/1 result is returned.
 */
int bgzf_check_EOF(BGZF *fp)
{
    uint8_t tail[28];
    off_t saved = htell(fp->fp);

    if (hseek(fp->fp, -28, SEEK_END) < 0) {
        if (errno != ESPIPE) return -1;
        hclearerr(fp->fp);
        return 2;
    }
    if ( hread(fp->fp, tail, 28) != 28 ) return -1;
    if ( hseek(fp->fp, saved, SEEK_SET) < 0 ) return -1;

    if (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", tail, 28) != 0)
        return 0;
    return 1;
}
/*
 * Seek to a virtual file offset: the upper bits of `pos` (pos >> 16) are the
 * file offset of a block and the low 16 bits are the offset within it.
 *
 * Only SEEK_SET on a reading handle is supported; anything else records
 * BGZF_ERR_MISUSE.  Returns 0 on success and -1 on failure.
 */
int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
{
    int64_t caddr;
    int within;

    if (fp->is_write || where != SEEK_SET) {
        fp->errcode |= BGZF_ERR_MISUSE;
        return -1;
    }

    within = pos & 0xFFFF;
    caddr = pos >> 16;

    if (hseek(fp->fp, caddr, SEEK_SET) < 0) {
        fp->errcode |= BGZF_ERR_IO;
        return -1;
    }

    fp->block_length = 0; // indicates current block has not been loaded
    fp->block_address = caddr;
    fp->block_offset = within;
    return 0;
}
/*
 * Test whether the file `fn` starts with the 16-byte BGZF magic.
 *
 * Returns 1 if it does, 0 if it does not (or the file cannot be opened or is
 * too short), and -1 if closing the file fails.
 */
int bgzf_is_bgzf(const char *fn)
{
    uint8_t head[16];
    int got;
    hFILE *in = hopen(fn, "r");

    if (in == 0) return 0;
    got = hread(in, head, 16);
    if ( hclose(in) < 0 ) return -1;

    if (got != 16) return 0;
    if (memcmp(g_magic, head, 16) != 0) return 0;
    return 1;
}
/*
 * Read one byte from the stream.
 *
 * Returns the byte as an unsigned value, -1 at end of file, or -2 if the
 * next block could not be read.
 */
int bgzf_getc(BGZF *fp)
{
    const unsigned char *bytes;
    int ch;

    if (fp->block_offset >= fp->block_length) {
        if (bgzf_read_block(fp) != 0) return -2; /* error */
        if (fp->block_length == 0) return -1; /* end-of-file */
    }

    bytes = (unsigned char*)fp->uncompressed_block;
    ch = bytes[fp->block_offset];
    fp->block_offset++;

    if (fp->block_offset == fp->block_length) {
        // block fully consumed: point at the start of the next one
        fp->block_address = htell(fp->fp);
        fp->block_offset = 0;
        fp->block_length = 0;
    }
    fp->uncompressed_address++;
    return ch;
}
#ifndef kroundup32
/* Round x up, in place, to the next power of two (a value that already is a
 * power of two is left unchanged).  The bit smearing covers 32 bits.  x must
 * be a modifiable lvalue and is evaluated several times, so it must not have
 * side effects. */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/*
 * Read one `delim`-terminated record into `str` (the delimiter itself is not
 * stored).  When `delim` is '\n' a trailing '\r' is stripped as well.
 *
 * Returns the length of the record.  If nothing could be read, returns -1 at
 * end of file or -2 on a read error.  Also returns -2 if the string buffer
 * cannot be grown; in that case `str` keeps its previous buffer and the
 * partial contents are not NUL-terminated.
 */
int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
{
    int l, state = 0;
    unsigned char *buf = (unsigned char*)fp->uncompressed_block;
    str->l = 0;
    do {
        if (fp->block_offset >= fp->block_length) {
            if (bgzf_read_block(fp) != 0) { state = -2; break; }
            if (fp->block_length == 0) { state = -1; break; }
        }
        // find the delimiter, or the end of the current block
        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
        if (l < fp->block_length) state = 1;
        l -= fp->block_offset;
        if (str->l + l + 1 >= str->m) {
            // Grow through a temporary so that a failed realloc() neither
            // leaks the old buffer nor leaves str->s NULL.
            size_t new_m = str->l + l + 2;
            char *grown;
            kroundup32(new_m);
            grown = (char*)realloc(str->s, new_m);
            if (grown == NULL) return -2;
            str->s = grown;
            str->m = new_m;
        }
        memcpy(str->s + str->l, buf + fp->block_offset, l);
        str->l += l;
        fp->block_offset += l + 1;
        if (fp->block_offset >= fp->block_length) {
            fp->block_address = htell(fp->fp);
            fp->block_offset = 0;
            fp->block_length = 0;
        }
    } while (state == 0);
    if (str->l == 0 && state < 0) return state;
    fp->uncompressed_address += str->l;
    if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
    str->s[str->l] = 0;
    return str->l;
}
/* Free the index attached to `fp`, if any, and stop on-the-fly indexing. */
void bgzf_index_destroy(BGZF *fp)
{
    if ( fp->idx )
    {
        free(fp->idx->offs);
        free(fp->idx);
        fp->idx = NULL;
        fp->idx_build_otf = 0;
    }
}
/*
 * Discard any existing index and start building a new one on the fly.
 * Returns 0 on success, -1 if the index cannot be allocated.
 */
int bgzf_index_build_init(BGZF *fp)
{
    bgzidx_t *fresh;

    bgzf_index_destroy(fp);
    fresh = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
    fp->idx = fresh;
    if ( fresh == NULL ) return -1;

    fp->idx_build_otf = 1; // build index on the fly
    return 0;
}
/*
 * Append an index entry for the current block: its uncompressed start
 * (fp->idx->ublock_addr) and its compressed file offset (fp->block_address).
 *
 * Returns 0 on success.  Returns -1 if the entry array cannot be grown; the
 * index is then left exactly as it was before the call.
 */
int bgzf_index_add_block(BGZF *fp)
{
    bgzidx_t *idx = fp->idx;
    if ( idx->noffs + 1 > idx->moffs )
    {
        bgzidx1_t *grown;
        idx->moffs = idx->noffs + 1;
        kroundup32(idx->moffs);
        // Grow through a temporary so a failed realloc() does not lose the
        // existing entries.
        grown = (bgzidx1_t*) realloc(idx->offs, idx->moffs*sizeof(bgzidx1_t));
        if ( !grown )
        {
            idx->moffs = idx->noffs; // capacity in use before the attempt
            return -1;
        }
        idx->offs = grown;
    }
    idx->noffs++;
    idx->offs[ idx->noffs-1 ].uaddr = idx->ublock_addr;
    idx->offs[ idx->noffs-1 ].caddr = fp->block_address;
    return 0;
}
/*
 * Write the handle's index to a file.
 *
 * The file name is `bname` with `suffix` appended (`suffix` may be NULL).
 * The file holds a 64-bit entry count followed by (compressed offset,
 * uncompressed offset) pairs of 64-bit values; the first in-memory entry is
 * not written.  On big-endian hosts every value is byte-swapped before it is
 * written.
 *
 * Returns 0 on success, -1 if flushing, opening, writing or closing fails.
 */
int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
{
    if (bgzf_flush(fp) != 0) return -1;

    assert(fp->idx);
    char *tmp = NULL;
    if ( suffix )
    {
        int blen = strlen(bname);
        int slen = strlen(suffix);
        tmp = (char*) malloc(blen + slen + 1);
        if ( !tmp ) return -1;
        memcpy(tmp,bname,blen);
        memcpy(tmp+blen,suffix,slen+1);
    }

    FILE *idx = fopen(tmp?tmp:bname,"wb");
    free(tmp);
    if ( !idx ) return -1;

    // Note that the index contains one extra record when indexing files opened
    // for reading. The terminating record is not present when opened for writing.
    // This is not a bug.

    int i, ok;
    // an empty index has no entries to skip, so the count is 0, not -1
    uint64_t x = fp->idx->noffs > 0 ? fp->idx->noffs - 1 : 0;
    if ( fp->is_be )
    {
        ok = fwrite(ed_swap_8p(&x), 1, sizeof(x), idx) == sizeof(x);
        for (i=1; ok && i<fp->idx->noffs; i++)
        {
            x = fp->idx->offs[i].caddr;
            ok = fwrite(ed_swap_8p(&x), 1, sizeof(x), idx) == sizeof(x);
            if ( !ok ) break;
            x = fp->idx->offs[i].uaddr;
            ok = fwrite(ed_swap_8p(&x), 1, sizeof(x), idx) == sizeof(x);
        }
    }
    else
    {
        ok = fwrite(&x, 1, sizeof(x), idx) == sizeof(x);
        for (i=1; ok && i<fp->idx->noffs; i++)
        {
            ok = fwrite(&fp->idx->offs[i].caddr, 1, sizeof(fp->idx->offs[i].caddr), idx) == sizeof(fp->idx->offs[i].caddr);
            if ( !ok ) break;
            ok = fwrite(&fp->idx->offs[i].uaddr, 1, sizeof(fp->idx->offs[i].uaddr), idx) == sizeof(fp->idx->offs[i].uaddr);
        }
    }

    // fclose() flushes buffered data, so its result matters too
    if ( fclose(idx) != 0 ) ok = 0;
    return ok ? 0 : -1;
}
/*
 * Load an index from a file into fp->idx.
 *
 * The file name is `bname` with `suffix` appended (`suffix` may be NULL).
 * The file is expected to hold a 64-bit entry count followed by that many
 * (compressed offset, uncompressed offset) pairs of 64-bit values; an
 * implicit first entry (0, 0) is added in memory.  On big-endian hosts every
 * value is byte-swapped after reading.
 *
 * Returns 0 on success.  Returns -1 if the file cannot be opened, is
 * truncated, declares an unrepresentable entry count, or memory runs out; in
 * that case the file is closed and fp->idx is left NULL.
 */
int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
{
    char *tmp = NULL;
    uint64_t x;
    int i;
    if ( suffix )
    {
        int blen = strlen(bname);
        int slen = strlen(suffix);
        tmp = (char*) malloc(blen + slen + 1);
        if ( !tmp ) return -1;
        memcpy(tmp,bname,blen);
        memcpy(tmp+blen,suffix,slen+1);
    }

    FILE *idx = fopen(tmp?tmp:bname,"rb");
    free(tmp);
    if ( !idx ) return -1;

    fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
    if ( !fp->idx ) goto fail;

    if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) goto fail;
    if ( fp->is_be ) x = ed_swap_8(x);

    fp->idx->noffs = fp->idx->moffs = 1 + x;
    // reject counts that wrapped around or do not fit the counter's type
    if ( fp->idx->noffs < 1 || (uint64_t)fp->idx->noffs != x + 1 ) goto fail;

    fp->idx->offs = (bgzidx1_t*) malloc(fp->idx->moffs*sizeof(bgzidx1_t));
    if ( !fp->idx->offs ) goto fail;
    fp->idx->offs[0].caddr = fp->idx->offs[0].uaddr = 0;

    for (i=1; i<fp->idx->noffs; i++)
    {
        if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) goto fail;
        fp->idx->offs[i].caddr = fp->is_be ? ed_swap_8(x) : x;
        if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) goto fail;
        fp->idx->offs[i].uaddr = fp->is_be ? ed_swap_8(x) : x;
    }

    fclose(idx);
    return 0;

fail:
    fclose(idx);
    if ( fp->idx )
    {
        free(fp->idx->offs);
        free(fp->idx);
        fp->idx = NULL;
    }
    return -1;
}
/*
 * Seek to the uncompressed offset `uoffset`.
 *
 * For uncompressed files this is a plain seek followed by a block read.  For
 * compressed files the loaded index is searched for the last block starting
 * at or before `uoffset`; that block is read and the in-block offset is set.
 * `where` is not examined: the offset is always treated as absolute.
 *
 * Returns 0 on success.  Returns -1 (with a BGZF_ERR_* bit set in most
 * cases) if no index is loaded, the index is empty, `uoffset` is negative,
 * or seeking/reading fails.
 */
int bgzf_useek(BGZF *fp, long uoffset, int where)
{
    if ( !fp->is_compressed )
    {
        if (hseek(fp->fp, uoffset, SEEK_SET) < 0)
        {
            fp->errcode |= BGZF_ERR_IO;
            return -1;
        }
        fp->block_length = 0; // indicates current block has not been loaded
        fp->block_address = uoffset;
        fp->block_offset = 0;
        if ( bgzf_read_block(fp) < 0 ) return -1;
        fp->uncompressed_address = uoffset;
        return 0;
    }

    if ( !fp->idx )
    {
        fp->errcode |= BGZF_ERR_IO;
        return -1;
    }
    if ( uoffset < 0 )
    {
        fp->errcode |= BGZF_ERR_MISUSE;
        return -1;
    }

    // binary search for the last entry whose uaddr is <= uoffset
    int ilo = 0, ihi = fp->idx->noffs - 1;
    while ( ilo<=ihi )
    {
        int mid = ilo + (ihi - ilo)/2;
        if ( uoffset < fp->idx->offs[mid].uaddr ) ihi = mid - 1;
        else ilo = mid + 1;
    }
    int i = ilo-1;
    if ( i<0 ) // empty index, or nothing at or before uoffset
    {
        fp->errcode |= BGZF_ERR_IO;
        return -1;
    }
    if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0)
    {
        fp->errcode |= BGZF_ERR_IO;
        return -1;
    }
    fp->block_length = 0; // indicates current block has not been loaded
    fp->block_address = fp->idx->offs[i].caddr;
    fp->block_offset = 0;
    if ( bgzf_read_block(fp) < 0 ) return -1;
    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
    {
        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
        assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks
    }
    fp->uncompressed_address = uoffset;
    return 0;
}
/* Report the current uncompressed offset of the stream. */
long bgzf_utell(BGZF *fp)
{
    // currently maintained only when reading
    long pos = fp->uncompressed_address;
    return pos;
}
htslib-1.2.1/bgzip.c 0000664 0000000 0000000 00000023761 12464172677 0014310 0 ustar 00root root 0000000 0000000 /* bgzip.c -- Block compression/decompression utility.
Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
Copyright (C) 2010, 2013, 2014 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notices and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdarg.h>
#include <getopt.h>
#include <sys/select.h>
#include <sys/stat.h>
#include "htslib/bgzf.h"
#include "htslib/hts.h"
/* Size in bytes of the I/O buffer main() allocates for reading input and
 * for reading decompressed data. */
static const int WINDOW_SIZE = 64 * 1024;
/* Print a printf-style message to stderr and terminate with EXIT_FAILURE. */
static void error(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    (void)vfprintf(stderr, format, args);
    va_end(args);

    exit(EXIT_FAILURE);
}
/*
 * Open `fn` for writing and return its file descriptor.
 *
 * Unless `is_forced` is set, an existing file is only overwritten after the
 * user answers 'y' or 'Y' on standard input; any other answer terminates the
 * program.  The program also terminates if the file cannot be opened.
 */
static int write_open(const char *fn, int is_forced)
{
    const int flags = O_WRONLY | O_CREAT | O_TRUNC;
    int fd = -1;

    if (!is_forced) {
        // first try to create the file exclusively
        fd = open(fn, flags | O_EXCL, 0666);
        if (fd < 0 && errno == EEXIST) {
            char answer;
            fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
            if ( scanf("%c", &answer) != 1 ) answer = 'n';
            if (!(answer == 'Y' || answer == 'y')) {
                fprintf(stderr, "[bgzip] not overwritten\n");
                exit(EXIT_FAILURE);
            }
        }
    }

    if (fd >= 0) return fd;

    fd = open(fn, flags, 0666);
    if (fd < 0) {
        fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
        exit(EXIT_FAILURE);
    }
    return fd;
}
/* Print the version and command-line help to stderr; always returns 1 so the
 * result can be used directly as the program's exit status. */
static int bgzip_main_usage(void)
{
    static const char *const option_help[] = {
        " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n",
        " -c, --stdout write on standard output, keep original files unchanged\n",
        " -d, --decompress decompress\n",
        " -f, --force overwrite files without asking\n",
        " -h, --help give this help\n",
        " -i, --index compress and create BGZF index\n",
        " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n",
        " -r, --reindex (re)index compressed file\n",
        " -s, --size INT decompress INT bytes (uncompressed size)\n",
    };
    size_t n;

    fputs("\n", stderr);
    fprintf(stderr, "Version: %s\n", hts_version());
    fputs("Usage: bgzip [OPTIONS] [FILE] ...\n", stderr);
    fputs("Options:\n", stderr);
    for (n = 0; n < sizeof(option_help) / sizeof(option_help[0]); ++n)
        fputs(option_help[n], stderr);
    fputs("\n", stderr);
    return 1;
}
int main(int argc, char **argv)
{
int c, compress, pstdout, is_forced, index = 0, reindex = 0;
BGZF *fp;
void *buffer;
long start, end, size;
char *index_fname = NULL;
static struct option loptions[] =
{
{"help",0,0,'h'},
{"offset",1,0,'b'},
{"stdout",0,0,'c'},
{"decompress",0,0,'d'},
{"force",0,0,'f'},
{"index",0,0,'i'},
{"index-name",1,0,'I'},
{"reindex",0,0,'r'},
{"size",1,0,'s'},
{0,0,0,0}
};
compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){
switch(c){
case 'd': compress = 0; break;
case 'c': pstdout = 1; break;
case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
case 's': size = atol(optarg); pstdout = 1; break;
case 'f': is_forced = 1; break;
case 'i': index = 1; break;
case 'I': index_fname = optarg; break;
case 'r': reindex = 1; compress = 0; break;
case 'h':
case '?': return bgzip_main_usage();
}
}
if (size >= 0) end = start + size;
if (end >= 0 && end < start) {
fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
return 1;
}
if (compress == 1) {
struct stat sbuf;
int f_src = fileno(stdin);
int f_dst = fileno(stdout);
if ( argc>optind )
{
if ( stat(argv[optind],&sbuf)<0 )
{
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}
if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}
if (pstdout)
f_dst = fileno(stdout);
else
{
char *name = malloc(strlen(argv[optind]) + 5);
strcpy(name, argv[optind]);
strcat(name, ".gz");
f_dst = write_open(name, is_forced);
free(name);
if (f_dst < 0) return 1;
}
}
else if (!pstdout && isatty(fileno((FILE *)stdout)) )
return bgzip_main_usage();
else if ( index && !index_fname )
{
fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
return 1;
}
fp = bgzf_fdopen(f_dst, "w");
if ( index ) bgzf_index_build_init(fp);
buffer = malloc(WINDOW_SIZE);
while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
// f_dst will be closed here
if ( index )
{
if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL);
else bgzf_index_dump(fp, argv[optind], ".gz.gzi");
}
if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
if (argc > optind && !pstdout) unlink(argv[optind]);
free(buffer);
close(f_src);
return 0;
}
else if ( reindex )
{
if ( argc>optind )
{
fp = bgzf_open(argv[optind], "r");
if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
}
else
{
if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
fp = bgzf_fdopen(fileno(stdin), "r");
if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
}
buffer = malloc(BGZF_BLOCK_SIZE);
bgzf_index_build_init(fp);
int ret;
while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
free(buffer);
if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
if ( index_fname )
bgzf_index_dump(fp, index_fname, NULL);
else
bgzf_index_dump(fp, argv[optind], ".gzi");
if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
return 0;
}
else
{
struct stat sbuf;
int f_dst;
if ( argc>optind )
{
if ( stat(argv[optind],&sbuf)<0 )
{
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}
char *name;
int len = strlen(argv[optind]);
if ( strcmp(argv[optind]+len-3,".gz") )
{
fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
return 1;
}
fp = bgzf_open(argv[optind], "r");
if (fp == NULL) {
fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
return 1;
}
if (pstdout) {
f_dst = fileno(stdout);
}
else {
name = strdup(argv[optind]);
name[strlen(name) - 3] = '\0';
f_dst = write_open(name, is_forced);
free(name);
}
}
else if (!pstdout && isatty(fileno((FILE *)stdin)) )
return bgzip_main_usage();
else
{
f_dst = fileno(stdout);
fp = bgzf_fdopen(fileno(stdin), "r");
if (fp == NULL) {
fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
return 1;
}
}
buffer = malloc(WINDOW_SIZE);
if ( start>0 )
{
if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
}
while (1) {
if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
if (c == 0) break;
if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
start += c;
if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
if (end >= 0 && start >= end) break;
}
free(buffer);
if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
if (!pstdout) unlink(argv[optind]);
return 0;
}
return 0;
}
htslib-1.2.1/config.mk.in 0000664 0000000 0000000 00000004736 12464172677 0015235 0 ustar 00root root 0000000 0000000 # Optional configure Makefile overrides for htslib.
#
# Copyright (C) 2015 Genome Research Ltd.
#
# Author: John Marshall
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# This is @configure_input@
#
# If you use configure, this file overrides variables and augments rules
# in the Makefile to reflect your configuration choices. If you don't run
# configure, the main Makefile contains suitable conservative defaults.
# Installation directories.  The @...@ placeholders are replaced when
# configure generates this file from config.mk.in.
prefix = @prefix@
exec_prefix = @exec_prefix@
bindir = @bindir@
includedir = @includedir@
libdir = @libdir@
datarootdir = @datarootdir@
mandir = @mandir@
# Toolchain programs and flags, likewise filled in by configure.
CC = @CC@
RANLIB = @RANLIB@
CFLAGS = @CFLAGS@
LDFLAGS = @LDFLAGS@
LDLIBS = @LIBS@
# ifeq/.../endif, +=, and target-specific variables are GNU Make-specific.
# If you don't have GNU Make, comment out this conditional and note that
# to enable iRODS you will need to implement the following elsewhere.
# The block below takes effect only when @irods@ was substituted with
# "enabled".
ifeq "iRODS-@irods@" "iRODS-enabled"
@define_IRODS_HOME@
# Include paths inside the iRODS tree, applied only to hfile_irods below.
EXTRA_CPPFLAGS_IRODS = \
-I$(IRODS_HOME)/lib/api/include \
-I$(IRODS_HOME)/lib/core/include \
-I$(IRODS_HOME)/lib/md5/include \
-I$(IRODS_HOME)/lib/sha1/include \
-I$(IRODS_HOME)/server/core/include \
-I$(IRODS_HOME)/server/drivers/include \
-I$(IRODS_HOME)/server/icat/include
# Link against the iRODS client library and add its backend object to libhts.
LDFLAGS += -L$(IRODS_HOME)/lib/core/obj
LDLIBS += -lRodsAPIs -lgssapi_krb5
LIBHTS_OBJS += hfile_irods.o
# Target-specific preprocessor flags for the two affected objects.
hfile.o hfile.pico: CPPFLAGS += -DHAVE_IRODS
hfile_irods.o hfile_irods.pico: CPPFLAGS += $(EXTRA_CPPFLAGS_IRODS)
endif
htslib-1.2.1/configure.ac 0000664 0000000 0000000 00000007364 12464172677 0015320 0 ustar 00root root 0000000 0000000 # Configure script for htslib, a C library for high-throughput sequencing data.
#
# Copyright (C) 2015 Genome Research Ltd.
#
# Author: John Marshall
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
dnl Process this file with autoconf to produce a configure script
dnl The package version is obtained by running "make print-version" when
dnl autoconf is run.
dnl NOTE(review): m4_esyscmd_s may need a newer Autoconf release than the
dnl AC_PREREQ version stated below - confirm against the Autoconf NEWS file.
AC_INIT([HTSlib], m4_esyscmd_s([make print-version]),
[samtools-help@lists.sourceforge.net], [], [http://www.htslib.org/])
AC_PREREQ(2.63) dnl This version introduced 4-argument AC_CHECK_HEADER
dnl Sanity check that configure is being run from the htslib source tree.
AC_CONFIG_SRCDIR(hts.c)
dnl Copyright notice to be copied into the generated configure script
AC_COPYRIGHT([Portions copyright (C) 2015 Genome Research Ltd.
This configure script is free software: you are free to change and
redistribute it. There is NO WARRANTY, to the extent permitted by law.])
dnl Locate the C compiler and ranlib; both are substituted into config.mk.
AC_PROG_CC
AC_PROG_RANLIB
dnl Optional iRODS support.  The shell variable irods ends up as either
dnl enabled or disabled; when a directory is given it is also stored in
dnl IRODS_HOME.  Omitting the option leaves iRODS disabled.
AC_ARG_WITH([irods],
[AS_HELP_STRING([[--with-irods[=DIR]]],
[use RodsAPIs library (in DIR) to support iRODS URLs])],
[case $withval in
no) irods=disabled ;;
yes) irods=enabled ;;
*) irods=enabled; IRODS_HOME=$withval ;;
esac],
[irods=disabled])
save_LIBS=$LIBS
zlib_devel=ok
dnl Set a trivial non-empty INCLUDES to avoid excess default includes tests
AC_CHECK_HEADER([zlib.h], [], [zlib_devel=missing], [;])
AC_CHECK_LIB(z, inflate, [], [zlib_devel=missing])
LIBS=$save_LIBS
if test $zlib_devel != ok; then
AC_MSG_ERROR([zlib development files not found
HTSlib uses compression routines from the zlib library <http://zlib.net>.
Building HTSlib requires zlib development files to be installed on the build
machine; you may need to ensure a package such as zlib1g-dev (on Debian or
Ubuntu Linux) or zlib-devel (on RPM-based Linux distributions) is installed.
FAILED. This error must be resolved in order to build HTSlib successfully.])
fi
# When iRODS support was requested verify that the RodsAPIs client library
# links from IRODS_HOME and record how config.mk should define IRODS_HOME.
# Otherwise emit a placeholder definition so config.mk stays well formed.
if test $irods = enabled; then
  # TODO Also test whether we require libgssapi_krb5 and AC_CHECK_LIB it
  # The extra library search path is only wanted while this check runs.
  save_LDFLAGS=$LDFLAGS
  LDFLAGS="$LDFLAGS -L$IRODS_HOME/lib/core/obj"
  AC_CHECK_LIB([RodsAPIs], [getRodsEnvFileName],
    [case $with_irods in
       yes) define_IRODS_HOME='# Uses $(IRODS_HOME) from the environment' ;;
       *)   define_IRODS_HOME="IRODS_HOME = $with_irods" ;;
     esac],
    [AC_MSG_ERROR([iRODS development files not found
Support for iRODS URLs requires the libRodsAPIs client library and headers.
Configure with --with-irods=DIR (or just --with-irods if \$IRODS_HOME has
been exported with a suitable value), where DIR is the base of an iRODS tree
such that the library is present as DIR/lib/core/obj/libRodsAPIs.* and headers
are present under DIR/lib/api/include and so on.])],
    [-lgssapi_krb5 -lpthread])
  LDFLAGS=$save_LDFLAGS
else
  define_IRODS_HOME='IRODS_HOME ?= /disabled'
fi
AC_SUBST([irods])
AC_SUBST([define_IRODS_HOME])
AC_CONFIG_FILES(config.mk)
AC_OUTPUT
htslib-1.2.1/cram/ 0000775 0000000 0000000 00000000000 12464172677 0013742 5 ustar 00root root 0000000 0000000 htslib-1.2.1/cram/cram.h 0000664 0000000 0000000 00000004367 12464172677 0015047 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* CRAM interface.
*
* Consider using the higher level hts_*() API for programs that wish to
* be file format agnostic (see htslib/hts.h).
*
* This API should be used for CRAM specific code. The specifics of the
* public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
* although these should not be included directly (use this file instead).
*/
#ifdef __cplusplus
extern "C" {
#endif
#ifndef _CRAM_H_
#define _CRAM_H_
#include "cram/cram_samtools.h"
#include "cram/sam_header.h"
#include "cram_structs.h"
#include "cram_io.h"
#include "cram_encode.h"
#include "cram_decode.h"
#include "cram_stats.h"
#include "cram_codecs.h"
#include "cram_index.h"
#endif
#ifdef __cplusplus
}
#endif
htslib-1.2.1/cram/cram_codecs.c 0000664 0000000 0000000 00000130206 12464172677 0016352 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* FIXME: add checking of cram_external_type to return NULL on unsupported
* {codec,type} tuples.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <limits.h>
#include "cram/cram.h"
/*
 * Returns a printable name for a cram_encoding value.  Values not listed
 * in the switch yield "(unknown)".  The result is a string literal and
 * must not be modified or freed.
 */
static char *codec2str(enum cram_encoding codec) {
    char *name = "(unknown)";

    switch (codec) {
    case E_NULL:            name = "NULL";            break;
    case E_EXTERNAL:        name = "EXTERNAL";        break;
    case E_GOLOMB:          name = "GOLOMB";          break;
    case E_HUFFMAN:         name = "HUFFMAN";         break;
    case E_BYTE_ARRAY_LEN:  name = "BYTE_ARRAY_LEN";  break;
    case E_BYTE_ARRAY_STOP: name = "BYTE_ARRAY_STOP"; break;
    case E_BETA:            name = "BETA";            break;
    case E_SUBEXP:          name = "SUBEXP";          break;
    case E_GOLOMB_RICE:     name = "GOLOMB_RICE";     break;
    case E_GAMMA:           name = "GAMMA";           break;
    }

    return name;
}
/*
* ---------------------------------------------------------------------------
* Block bit-level I/O functions.
* All defined static here to promote easy inlining by the compiler.
*/
#if 0
/*
 * Disabled (compiled out) reference implementation.
 *
 * Get a single bit, MSB first.
 *
 * Reads bit number block->bit of block->data[block->byte], then steps
 * block->bit down by one; after bit 0 it wraps back to bit 7 and advances
 * block->byte.
 *
 * Returns the bit (0 or 1), or -1 when block->byte is greater than
 * block->alloc.
 */
static signed int get_bit_MSB(cram_block *block) {
unsigned int val;
if (block->byte > block->alloc)
return -1;
val = block->data[block->byte] >> block->bit;
/* Move to the next bit, crossing into the next byte after bit 0 */
if (--block->bit == -1) {
block->bit = 7;
block->byte++;
//printf("(%02X)", block->data[block->byte]);
}
//printf("-B%d-", val&1);
return val & 1;
}
#endif
/*
* Count number of successive 0 and 1 bits
*/
/*
 * Consumes bits from the block, MSB first, up to and including the first
 * 0 bit.  Returns how many 1 bits preceded that terminating 0.
 * No bounds checking is performed on block->byte.
 */
static int get_one_bits_MSB(cram_block *block) {
    int ones = -1;
    int cur;

    for (;;) {
        cur = block->data[block->byte] >> block->bit;

        /* Step to the next bit, wrapping into the following byte */
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
        }

        ones++;
        if (!(cur & 1))
            break;
    }

    return ones;
}
/*
 * Consumes bits from the block, MSB first, up to and including the first
 * 1 bit.  Returns how many 0 bits preceded that terminating 1.
 * No bounds checking is performed on block->byte.
 */
static int get_zero_bits_MSB(cram_block *block) {
    int zeros = -1;
    int cur;

    for (;;) {
        cur = block->data[block->byte] >> block->bit;

        /* Step to the next bit, wrapping into the following byte */
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
        }

        zeros++;
        if (cur & 1)
            break;
    }

    return zeros;
}
#if 0
/*
 * Disabled (compiled out) reference implementation.
 *
 * Stores a single bit at position block->bit of block->data[block->byte],
 * growing the buffer first when block->byte has reached block->alloc
 * (doubling it, or starting at 1024 bytes).
 *
 * NOTE(review): the realloc() result is not checked, and the store to
 * data[byte] after byte++ is not covered by the growth test above it.
 */
static void store_bit_MSB(cram_block *block, unsigned int bit) {
if (block->byte >= block->alloc) {
block->alloc = block->alloc ? block->alloc*2 : 1024;
block->data = realloc(block->data, block->alloc);
}
if (bit)
block->data[block->byte] |= (1 << block->bit);
/* Advance to the next bit; zero the next byte when starting on it */
if (--block->bit == -1) {
block->bit = 7;
block->byte++;
block->data[block->byte] = 0;
}
}
#endif
#if 0
/*
 * Disabled (compiled out) reference implementation.
 *
 * Appends 'len' bytes to the block.
 * Rounds to the next whole byte boundary first, so a partially written
 * byte is left as it is and the copy starts at the following byte.
 *
 * NOTE(review): the realloc() result is not checked.
 */
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
if (block->bit != 7) {
block->bit = 7;
block->byte++;
}
/* Keep doubling (or start at 1024) until the new bytes fit */
while (block->byte + len >= block->alloc) {
block->alloc = block->alloc ? block->alloc*2 : 1024;
block->data = realloc(block->data, block->alloc);
}
memcpy(&block->data[block->byte], bytes, len);
block->byte += len;
}
#endif
/*
 * Local optimised copy for inlining.
 *
 * Reads 'nbits' bits from the block, most significant bit first, and
 * returns them right-justified in an unsigned int.
 *
 * block->byte and block->bit identify the next unread bit (bit 7 is the
 * first bit of a byte); both are advanced past the bits consumed.
 *
 * No bounds checking is done against the size of block->data; callers
 * must ensure enough data remains.
 */
static inline unsigned int get_bits_MSB(cram_block *block, int nbits) {
    unsigned int val = 0;
    int i;

    /* Fast path: the whole field lies within the current byte */
    if (nbits <= block->bit+1) {
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
        if ((block->bit -= nbits) == -1) {
            block->bit = 7;
            block->byte++;
        }
        return val;
    }

    /*
     * The field crosses a byte boundary: gather it one bit at a time.
     * GET_BIT_MSB is defined elsewhere; it is presumed to shift val left
     * and append the next bit from the block.  Widths up to 8 use an
     * unrolled fall-through chain; anything wider uses the loop.
     */
    switch (nbits) {
    case 8: GET_BIT_MSB(block, val); /* fallthrough */
    case 7: GET_BIT_MSB(block, val); /* fallthrough */
    case 6: GET_BIT_MSB(block, val); /* fallthrough */
    case 5: GET_BIT_MSB(block, val); /* fallthrough */
    case 4: GET_BIT_MSB(block, val); /* fallthrough */
    case 3: GET_BIT_MSB(block, val); /* fallthrough */
    case 2: GET_BIT_MSB(block, val); /* fallthrough */
    case 1: GET_BIT_MSB(block, val);
        break;

    default:
        for (i = 0; i < nbits; i++)
            GET_BIT_MSB(block, val);
    }

    return val;
}
/*
 * Appends the low 'nbits' bits of 'val' to the block, most significant
 * bit first.  'val' is expected to have no bits set above bit nbits-1.
 *
 * Can store up to 24-bits worth of data encoded in an integer value
 * Possibly we'd want to have a less optimal store_bits function when dealing
 * with nbits > 24, but for now we assume the codes generated are never
 * that big. (Given this is only possible with 121392 or more
 * characters with exactly the correct frequency distribution we check
 * for it elsewhere.)
 *
 * The buffer is grown when fewer than five bytes remain after block->byte.
 * If growing fails the block is left untouched: its existing buffer stays
 * valid and block->alloc is unchanged.
 *
 * Returns 0 on success;
 *        -1 on memory allocation failure.
 */
static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
    /*
     * Use slow mode until we tweak the huffman generator to never generate
     * codes longer than 24-bits.
     */
    unsigned int mask;

    if (block->byte+4 >= block->alloc) {
        /* Grow via a temporary so a failed realloc does not lose the buffer */
        void *grown;

        if (block->byte) {
            grown = realloc(block->data, block->alloc * 2 + 4);
            if (!grown)
                return -1;
            block->data = grown;
            block->alloc *= 2;
        } else {
            grown = realloc(block->data, 1024 + 4);
            if (!grown)
                return -1;
            block->data = grown;
            block->alloc = 1024;
            block->data[0] = 0; // initialise first byte of buffer
        }
    }

    /* fits in current bit-field */
    if (nbits <= block->bit+1) {
        block->data[block->byte] |= (val << (block->bit+1-nbits));
        if ((block->bit-=nbits) == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        return 0;
    }

    /* Fill what is left of the current byte with the top bits of val */
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
    block->bit = 7;
    block->byte++;
    block->data[block->byte] = 0;

    /* Emit the remaining nbits one at a time; unsigned shift avoids UB */
    mask = 1u<<(nbits-1);
    do {
        if (val & mask)
            block->data[block->byte] |= (1 << block->bit);
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        mask >>= 1;
    } while(--nbits);

    return 0;
}
/*
 * Returns the next 'size' bytes from a block, or NULL if insufficient
 * data is left (or 'size' is negative).  This is just a pointer into the
 * block data and not an allocated object, so do not free the result.
 *
 * On success b->idx is advanced by 'size'; on failure it is left as it was.
 */
static char *cram_extract_block(cram_block *b, int size) {
    char *cp;

    /* Written as a subtraction so that idx + size cannot overflow */
    if (size < 0 ||
        b->idx > b->uncomp_size ||
        size > b->uncomp_size - b->idx)
        return NULL;

    cp = (char *)b->data + b->idx;
    b->idx += size;

    return cp;
}
/*
* ---------------------------------------------------------------------------
* EXTERNAL
*/
/*
 * Decodes one ITF8 integer from the external block whose content id
 * matches this codec, storing it through 'out' (treated as int32_t *) and
 * setting *out_size to 1.  The 'in' block is not used.
 *
 * Returns 0 on success (also when the block is absent from block_by_id
 *           and *out_size is zero);
 *        -1 if the block cannot be found or does not hold a complete
 *           ITF8 value at its current position.
 */
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
                             cram_block *in, char *out, int *out_size) {
    /* Encoded size of an ITF8 value, indexed by the top 4 bits of byte 0 */
    static const unsigned char itf8_len[16] = {
        1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5
    };
    int i;
    char *cp;
    cram_block *b = NULL;

    /* Find the external block */
    if (slice->block_by_id) {
        if (!(b = slice->block_by_id[c->external.content_id]))
            return *out_size?-1:0;
    } else {
        for (i = 0; i < slice->hdr->num_blocks; i++) {
            b = slice->block[i];
            if (b && b->content_type == EXTERNAL &&
                b->content_id == c->external.content_id) {
                break;
            }
        }
        if (i == slice->hdr->num_blocks || !b)
            return -1;
    }

    /* Refuse to read an ITF8 value that would run off the end of the data */
    if (b->idx >= b->uncomp_size)
        return -1;
    cp = (char *)b->data + b->idx;
    if (itf8_len[(unsigned char)*cp >> 4] > b->uncomp_size - b->idx)
        return -1;

    // E_INT and E_LONG are guaranteed single item queries
    b->idx += itf8_get(cp, (int32_t *)out);
    *out_size = 1;

    return 0;
}
/*
 * Copies the next *out_size bytes of the external block whose content id
 * matches this codec into 'out'.  The 'in' block is not used.
 *
 * Returns 0 on success (also when the block is absent from block_by_id
 *           and *out_size is zero);
 *        -1 if the block cannot be found or holds too little data.
 */
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
                              cram_block *in, char *out,
                              int *out_size) {
    cram_block *blk = NULL;
    char *src;

    /* Locate the external block, by table if there is one, else by scan */
    if (!slice->block_by_id) {
        int n;

        for (n = 0; n < slice->hdr->num_blocks; n++) {
            blk = slice->block[n];
            if (blk && blk->content_type == EXTERNAL &&
                blk->content_id == c->external.content_id)
                break;
        }
        if (n == slice->hdr->num_blocks || !blk)
            return -1;
    } else {
        blk = slice->block_by_id[c->external.content_id];
        if (!blk)
            return *out_size ? -1 : 0;
    }

    src = cram_extract_block(blk, *out_size);
    if (src == NULL)
        return -1;

    memcpy(out, src, *out_size);
    return 0;
}
/*
 * Appends the next *out_size bytes of the external block whose content id
 * matches this codec onto the cram_block passed (cast) through 'out_'.
 * The 'in' block is not used.
 *
 * Returns 0 on success (also when the block is absent from block_by_id
 *           and *out_size is zero);
 *        -1 if the block cannot be found or holds too little data.
 */
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
                                      cram_block *in, char *out_,
                                      int *out_size) {
    cram_block *out = (cram_block *)out_;
    cram_block *blk = NULL;
    char *src;

    /* Locate the external block, by table if there is one, else by scan */
    if (!slice->block_by_id) {
        int n;

        for (n = 0; n < slice->hdr->num_blocks; n++) {
            blk = slice->block[n];
            if (blk && blk->content_type == EXTERNAL &&
                blk->content_id == c->external.content_id)
                break;
        }
        if (n == slice->hdr->num_blocks || !blk)
            return -1;
    } else {
        blk = slice->block_by_id[c->external.content_id];
        if (!blk)
            return *out_size ? -1 : 0;
    }

    src = cram_extract_block(blk, *out_size);
    if (src == NULL)
        return -1;

    BLOCK_APPEND(out, src, *out_size);
    return 0;
}
/* Releases an EXTERNAL decoder.  A NULL codec is accepted. */
void cram_external_decode_free(cram_codec *c) {
    /* free(NULL) is a no-op, so no explicit guard is required */
    free(c);
}
/*
 * Creates an EXTERNAL decoder from its serialised parameters: a single
 * ITF8 content id that must occupy exactly 'size' bytes of 'data'.
 * 'option' selects the integer, byte or block decode routine.
 * 'version' is not used.
 *
 * Returns the new codec, or NULL on allocation failure or when the
 * parameter length does not match 'size'.
 */
cram_codec *cram_external_decode_init(char *data, int size,
                                      enum cram_external_type option,
                                      int version) {
    cram_codec *codec = malloc(sizeof(*codec));
    int used;

    if (codec == NULL)
        return NULL;

    codec->codec = E_EXTERNAL;

    switch (option) {
    case E_INT:
    case E_LONG:
        codec->decode = cram_external_decode_int;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        codec->decode = cram_external_decode_char;
        break;

    default:
        codec->decode = cram_external_decode_block;
        break;
    }

    codec->free = cram_external_decode_free;

    used = itf8_get(data, &codec->external.content_id);
    if (used != size) {
        fprintf(stderr, "Malformed external header stream\n");
        free(codec);
        return NULL;
    }

    codec->external.type = option;

    return codec;
}
/*
 * Appends one integer, read from 'in' as a 32-bit value, to the codec's
 * output block in ITF8 form.  'slice' and 'in_size' are not used.
 * Always returns 0.
 */
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    uint32_t value = *(uint32_t *)in;

    itf8_put_blk(c->out, value);

    return 0;
}
/*
 * Appends 'in_size' raw bytes from 'in' to the codec's output block.
 * 'slice' is not used.  Always returns 0.
 */
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
                              char *in, int in_size)
{
    BLOCK_APPEND(c->out, in, in_size);

    return 0;
}
/* Releases an EXTERNAL encoder.  A NULL codec is accepted. */
void cram_external_encode_free(cram_codec *c) {
    if (c != NULL)
        free(c);
}
/*
 * Serialises an EXTERNAL codec description onto block 'b': the optional
 * 'prefix' string, the codec id, the parameter length, then the ITF8
 * content id.  'version' is not used.
 *
 * Returns the number of bytes accounted for.
 */
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
                               int version) {
    char params[99];
    char *end = params;
    int total = 0;

    if (prefix != NULL) {
        size_t plen = strlen(prefix);

        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    /* Build the parameter bytes first so their length is known */
    end += itf8_put(end, c->e_external.content_id);

    total += itf8_put_blk(b, c->codec);
    total += itf8_put_blk(b, end - params);
    BLOCK_APPEND(b, params, end - params);
    total += end - params;

    return total;
}
/*
 * Creates an EXTERNAL encoder.  'dat' carries the content id as an
 * integer cast to a pointer.  'st' and 'version' are not used.
 *
 * Returns the new codec, or NULL on allocation failure.  Aborts for any
 * 'option' other than the integer and byte types handled below.
 */
cram_codec *cram_external_encode_init(cram_stats *st,
                                      enum cram_external_type option,
                                      void *dat,
                                      int version) {
    cram_codec *codec = malloc(sizeof(*codec));

    if (codec == NULL)
        return NULL;

    codec->codec = E_EXTERNAL;
    codec->free = cram_external_encode_free;

    switch (option) {
    case E_INT:
    case E_LONG:
        codec->encode = cram_external_encode_int;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        codec->encode = cram_external_encode_char;
        break;

    default:
        abort();
    }

    codec->store = cram_external_encode_store;
    codec->e_external.content_id = (size_t)dat;

    return codec;
}
/*
* ---------------------------------------------------------------------------
* BETA
*/
/*
 * Decodes *out_size BETA (fixed width) values from 'in' into 'out',
 * treated as an int32_t array.  Each value is the nbits-wide field minus
 * the codec offset; with a width of zero no input is consumed and every
 * value is simply -offset.  'slice' is not used.  Always returns 0.
 */
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int j;

    if (!c->beta.nbits) {
        for (j = 0; j < count; j++)
            dst[j] = -c->beta.offset;
        return 0;
    }

    for (j = 0; j < count; j++)
        dst[j] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;

    return 0;
}
/*
 * Decodes *out_size BETA (fixed width) values from 'in' into the byte
 * array 'out'.  Each value is the nbits-wide field minus the codec
 * offset; with a width of zero no input is consumed and every value is
 * simply -offset.  'slice' is not used.  Always returns 0.
 */
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    const int count = *out_size;
    int j;

    if (!c->beta.nbits) {
        for (j = 0; j < count; j++)
            out[j] = -c->beta.offset;
        return 0;
    }

    for (j = 0; j < count; j++)
        out[j] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;

    return 0;
}
/* Releases a BETA decoder.  A NULL codec is accepted. */
void cram_beta_decode_free(cram_codec *c) {
    /* free(NULL) is a no-op, so no explicit guard is required */
    free(c);
}
/*
 * Creates a BETA (fixed width) decoder from its serialised parameters:
 * two ITF8 values, the offset followed by the bit width, which together
 * must occupy exactly 'size' bytes of 'data'.  'version' is not used.
 *
 * Returns the new codec, or NULL on allocation failure, on an
 * unsupported 'option', or when the parameters are malformed (wrong
 * length, or a bit width outside 0..bits-in-an-int).
 */
cram_codec *cram_beta_decode_init(char *data, int size,
                                  enum cram_external_type option,
                                  int version) {
    cram_codec *c;
    char *cp = data;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec = E_BETA;
    if (option == E_INT || option == E_LONG) {
        c->decode = cram_beta_decode_int;
    } else if (option == E_BYTE_ARRAY || option == E_BYTE) {
        c->decode = cram_beta_decode_char;
    } else {
        /* The type comes from the file, so fail cleanly rather than abort */
        fprintf(stderr, "Unsupported data type for beta codec\n");
        free(c);
        return NULL;
    }
    c->free = cram_beta_decode_free;

    cp += itf8_get(cp, &c->beta.offset);
    cp += itf8_get(cp, &c->beta.nbits);

    /* get_bits_MSB() accumulates into an unsigned int, so cap the width */
    if (cp - data != size ||
        c->beta.nbits < 0 || c->beta.nbits > 8 * (int)sizeof(int)) {
        fprintf(stderr, "Malformed beta header stream\n");
        free(c);
        return NULL;
    }

    return c;
}
/*
 * Serialises a BETA codec description onto block 'b': the optional
 * 'prefix' string, the codec id, the parameter length, then the offset
 * and bit width as ITF8 values.  'version' is not used.
 *
 * Returns the number of bytes accounted for.
 */
int cram_beta_encode_store(cram_codec *c, cram_block *b,
                           char *prefix, int version) {
    int total = 0;
    int param_len;

    if (prefix != NULL) {
        size_t plen = strlen(prefix);

        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    total += itf8_put_blk(b, c->codec);

    /* codec length: combined size of the two parameters that follow */
    param_len = itf8_size(c->e_beta.offset) + itf8_size(c->e_beta.nbits);
    total += itf8_put_blk(b, param_len);

    total += itf8_put_blk(b, c->e_beta.offset);
    total += itf8_put_blk(b, c->e_beta.nbits);

    return total;
}
/*
 * BETA-encodes 'in_size' values from 'in' (treated as an int array) onto
 * the codec's output block; each is stored as value + offset in nbits
 * bits.  'slice' is not used.
 *
 * Returns the OR of the store_bits_MSB() results (0 when all succeeded).
 */
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
                         char *in, int in_size) {
    int *values = (int *)in;
    int status = 0;
    int j;

    for (j = 0; j < in_size; j++) {
        status |= store_bits_MSB(c->out,
                                 values[j] + c->e_beta.offset,
                                 c->e_beta.nbits);
    }

    return status;
}
/*
 * BETA-encodes 'in_size' bytes from 'in' (read as unsigned char) onto the
 * codec's output block; each is stored as value + offset in nbits bits.
 * 'slice' is not used.
 *
 * Returns the OR of the store_bits_MSB() results (0 when all succeeded).
 */
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    unsigned char *values = (unsigned char *)in;
    int status = 0;
    int j;

    for (j = 0; j < in_size; j++) {
        status |= store_bits_MSB(c->out,
                                 values[j] + c->e_beta.offset,
                                 c->e_beta.nbits);
    }

    return status;
}
/* Releases a BETA encoder.  A NULL codec is accepted. */
void cram_beta_encode_free(cram_codec *c) {
    /* free(NULL) is a no-op, so no explicit guard is required */
    free(c);
}
/*
 * Creates a BETA (fixed width) encoder.
 *
 * When 'dat' is non-NULL it points at two ints giving the minimum and
 * maximum value.  Otherwise the range is found by scanning the 'st'
 * frequency array and, if present, its hash table.  The offset is set to
 * -min and the bit width to the number of bits needed for (max - min).
 * 'version' is not used.
 *
 * Returns the new codec, or NULL on allocation failure.
 */
cram_codec *cram_beta_encode_init(cram_stats *st,
                                  enum cram_external_type option,
                                  void *dat,
                                  int version) {
    cram_codec *c;
    int lo, hi, nbits = 0;

    c = malloc(sizeof(*c));
    if (c == NULL)
        return NULL;

    c->codec = E_BETA;
    c->free = cram_beta_encode_free;
    c->encode = (option == E_INT)
        ? cram_beta_encode_int
        : cram_beta_encode_char;
    c->store = cram_beta_encode_store;

    if (dat) {
        int *range = (int *)dat;

        lo = range[0];
        hi = range[1];
    } else {
        int v;

        lo = INT_MAX;
        hi = INT_MIN;

        /* Small values are counted in the flat frequency array */
        for (v = 0; v < MAX_STAT_VAL; v++) {
            if (st->freqs[v]) {
                if (v < lo)
                    lo = v;
                hi = v;
            }
        }

        /* Remaining values, if any, are keys of the hash table */
        if (st->h) {
            khint_t k;

            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
                if (kh_exist(st->h, k)) {
                    v = kh_key(st->h, k);
                    if (v < lo)
                        lo = v;
                    if (v > hi)
                        hi = v;
                }
            }
        }
    }

    assert(hi >= lo);
    c->e_beta.offset = -lo;

    /* Count the bits needed to represent the largest shifted value */
    for (hi -= lo; hi; hi >>= 1)
        nbits++;
    c->e_beta.nbits = nbits;

    return c;
}
/*
* ---------------------------------------------------------------------------
* SUBEXP
*/
/*
 * Decodes *out_size SUBEXP values from 'in' into 'out', treated as an
 * int32_t array.  'slice' is not used.  Always returns 0.
 *
 * Each value starts with a run of i one-bits ended by a zero bit:
 *   i > 0: value is 2^(k+i-1) plus the next k+i-1 bits
 *   i = 0: value is the next k bits
 * The codec offset is then subtracted.
 */
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int k = c->subexp.k;
    const int total = *out_size;
    int idx;

    for (idx = 0; idx < total; idx++) {
        /* Get number of 1s */
        int ones = get_one_bits_MSB(in);
        int nread = ones ? ones + k - 1 : k;
        int val = 0;

        for (; nread; nread--)
            GET_BIT_MSB(in, val);

        if (ones)
            val += 1 << (ones + k - 1);

        dst[idx] = val - c->subexp.offset;
    }

    return 0;
}
/* Releases a SUBEXP decoder.  A NULL codec is accepted. */
void cram_subexp_decode_free(cram_codec *c) {
    /* free(NULL) is a no-op, so no explicit guard is required */
    free(c);
}
/*
 * Creates a SUBEXP decoder from its serialised parameters: two ITF8
 * values, the offset followed by k, which together must occupy exactly
 * 'size' bytes of 'data'.  'option' and 'version' are not used.
 *
 * Returns the new codec, or NULL on allocation failure or when the
 * parameter length does not match 'size'.
 */
cram_codec *cram_subexp_decode_init(char *data, int size,
                                    enum cram_external_type option,
                                    int version) {
    cram_codec *codec = malloc(sizeof(*codec));
    char *pos = data;

    if (codec == NULL)
        return NULL;

    codec->codec  = E_SUBEXP;
    codec->decode = cram_subexp_decode;
    codec->free   = cram_subexp_decode_free;

    pos += itf8_get(pos, &codec->subexp.offset);
    pos += itf8_get(pos, &codec->subexp.k);

    if (pos - data == size)
        return codec;

    fprintf(stderr, "Malformed subexp header stream\n");
    free(codec);
    return NULL;
}
/*
* ---------------------------------------------------------------------------
* GAMMA
*/
/*
 * Decodes *out_size GAMMA values from 'in' into 'out', treated as an
 * int32_t array.  'slice' is not used.  Always returns 0.
 *
 * Each value is a run of nz zero-bits ended by a one-bit, followed by nz
 * further bits appended below an implicit leading 1.  The codec offset is
 * then subtracted.
 */
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int total = *out_size;
    int idx;

    for (idx = 0; idx < total; idx++) {
        int zeros = get_zero_bits_MSB(in);
        int val = 1;

        for (; zeros > 0; zeros--)
            GET_BIT_MSB(in, val);

        dst[idx] = val - c->gamma.offset;
    }

    return 0;
}
/* Releases a GAMMA decoder.  A NULL codec is accepted. */
void cram_gamma_decode_free(cram_codec *c) {
    /* free(NULL) is a no-op, so no explicit guard is required */
    free(c);
}
/*
 * Creates a GAMMA decoder from its serialised parameters: a single ITF8
 * offset that must occupy exactly 'size' bytes of 'data'.  'option' and
 * 'version' are not used.
 *
 * Returns the new codec, or NULL on allocation failure or when the
 * parameter length does not match 'size'.
 */
cram_codec *cram_gamma_decode_init(char *data, int size,
                                   enum cram_external_type option,
                                   int version) {
    cram_codec *codec = malloc(sizeof(*codec));
    int used;

    if (codec == NULL)
        return NULL;

    codec->codec  = E_GAMMA;
    codec->decode = cram_gamma_decode;
    codec->free   = cram_gamma_decode_free;

    used = itf8_get(data, &codec->gamma.offset);
    if (used == size)
        return codec;

    fprintf(stderr, "Malformed gamma header stream\n");
    free(codec);
    return NULL;
}
/*
* ---------------------------------------------------------------------------
* HUFFMAN
*/
/*
 * qsort() comparator for cram_huffman_code entries: orders by code length
 * first and by symbol value second, both ascending.
 *
 * Comparisons are used rather than subtraction so that symbols of
 * opposite sign and large magnitude cannot overflow and mis-order.
 */
static int code_sort(const void *vp1, const void *vp2) {
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;

    if (c1->len != c2->len)
        return (c1->len > c2->len) - (c1->len < c2->len);

    return (c1->symbol > c2->symbol) - (c1->symbol < c2->symbol);
}
/*
 * Releases a HUFFMAN decoder together with its code table.
 * A NULL codec is accepted.
 */
void cram_huffman_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    /* free(NULL) is a no-op, so the table pointer needs no guard */
    free(c->huffman.codes);
    free(c);
}
/*
 * Special case of 0 length codes: fills the *out_size bytes of 'out'
 * with the symbol of the first table entry, consuming no input.
 * 'slice' and 'in' are not used.  Always returns 0.
 */
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
                              cram_block *in, char *out, int *out_size) {
    const int count = *out_size;
    int j = 0;

    while (j < count) {
        out[j] = c->huffman.codes[0].symbol;
        j++;
    }

    return 0;
}
/*
 * Decodes *out_size huffman-coded symbols from 'in' into the byte array
 * 'out'.  'slice' is not used.
 *
 * Starting from the first table entry, enough bits are read to reach
 * that entry's code length; the bits read so far minus the entry's 'p'
 * value give the next candidate entry.  This repeats until an entry
 * matches both the code and its length.
 *
 * Returns 0 on success;
 *        -1 if the candidate code is not longer than the bits already
 *           read, the input runs short, or the candidate index falls
 *           outside the table.
 */
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
                             cram_block *in, char *out, int *out_size) {
    const cram_huffman_code * const table = c->huffman.codes;
    int ncodes = c->huffman.ncodes;
    int total = *out_size;
    int s;

    for (s = 0; s < total; s++) {
        int slot  = 0;  /* candidate entry in the table */
        int code  = 0;  /* bits gathered so far */
        int nbits = 0;  /* how many bits 'code' holds */

        for (;;) {
            int want = table[slot].len - nbits;

            if (want <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < want)
                return -1;

            nbits += want;
            while (want) {
                GET_BIT_MSB(in, code);
                want--;
            }

            slot = code - table[slot].p;
            if (slot >= ncodes || slot < 0)
                return -1;

            if (table[slot].code == code && table[slot].len == nbits) {
                out[s] = table[slot].symbol;
                break;
            }
        }
    }

    return 0;
}
/*
 * Special case of 0 length codes: fills the *out_size entries of 'out'
 * (treated as an int32_t array) with the symbol of the first table
 * entry, consuming no input.  'slice' and 'in' are not used.
 * Always returns 0.
 */
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
                             cram_block *in, char *out, int *out_size) {
    const cram_huffman_code * const table = c->huffman.codes;
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int j = 0;

    while (j < count) {
        dst[j] = table[0].symbol;
        j++;
    }

    return 0;
}
/*
 * Decodes *out_size huffman-coded symbols from 'in' into 'out', treated
 * as an int32_t array.  'slice' is not used.
 *
 * Starting from the first table entry, enough bits are read to reach
 * that entry's code length; the bits read so far minus the entry's 'p'
 * value give the next candidate entry.  This repeats until an entry
 * matches both the code and its length.
 *
 * Returns 0 on success;
 *        -1 if the candidate code is not longer than the bits already
 *           read, the input runs short, or the candidate index falls
 *           outside the table.
 */
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
                            cram_block *in, char *out, int *out_size) {
    const cram_huffman_code * const table = c->huffman.codes;
    int32_t *dst = (int32_t *)out;
    int ncodes = c->huffman.ncodes;
    int total = *out_size;
    int s;

    for (s = 0; s < total; s++) {
        int slot  = 0;  /* candidate entry in the table */
        int code  = 0;  /* bits gathered so far */
        int nbits = 0;  /* how many bits 'code' holds */

        for (;;) {
            int want = table[slot].len - nbits;

            if (want <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < want)
                return -1;

            nbits += want;
            while (want) {
                GET_BIT_MSB(in, code);
                want--;
            }

            slot = code - table[slot].p;
            if (slot >= ncodes || slot < 0)
                return -1;

            if (table[slot].code == code && table[slot].len == nbits) {
                dst[s] = table[slot].symbol;
                break;
            }
        }
    }

    return 0;
}
/*
 * Initialises a huffman decoder from an encoding data stream.
 *
 * The stream parsed here is, in order: an ITF8 symbol count, that many
 * ITF8 symbols, a second ITF8 count that must equal the first, and that
 * many ITF8 code lengths.  Canonical codes are then assigned after
 * sorting by (length, symbol).  'version' is not used.
 *
 * A count of zero yields a codec with no decode function set
 * ("NULL huffman stream").
 *
 * Returns the new codec, or NULL on allocation failure, a malformed
 * stream, or an unsupported 'option'.  All memory allocated here is
 * released on every failure path.
 */
cram_codec *cram_huffman_decode_init(char *data, int size,
                                     enum cram_external_type option,
                                     int version) {
    int32_t ncodes, i, j;
    char *cp = data, *data_end = &data[size];
    cram_codec *h;
    cram_huffman_code *codes;
    int32_t val, last_len, max_len = 0;

    cp += itf8_get(cp, &ncodes);

    /*
     * Every symbol and every length takes at least one byte, so a count
     * that is negative or exceeds the stream size cannot be valid.
     */
    if (ncodes < 0 || ncodes > size) {
        fprintf(stderr, "Malformed huffman header stream\n");
        return NULL;
    }

    h = calloc(1, sizeof(*h));
    if (!h)
        return NULL;

    h->free = cram_huffman_decode_free;
    h->huffman.ncodes = ncodes;

    /* Always allocate at least one entry: malloc(0) may return NULL */
    codes = h->huffman.codes = calloc(ncodes ? ncodes : 1, sizeof(*codes));
    if (!codes) {
        free(h);
        return NULL;
    }

    /* Read symbols and bit-lengths */
    for (i = 0; i < ncodes && cp < data_end; i++) {
        cp += itf8_get(cp, &codes[i].symbol);
    }

    if (cp >= data_end) {
        fprintf(stderr, "Malformed huffman header stream\n");
        cram_huffman_decode_free(h);
        return NULL;
    }

    cp += itf8_get(cp, &i);
    if (i != ncodes) {
        fprintf(stderr, "Malformed huffman header stream\n");
        cram_huffman_decode_free(h);
        return NULL;
    }

    if (ncodes == 0) {
        /* NULL huffman stream */
        return h;
    }

    for (i = 0; i < ncodes && cp < data_end; i++) {
        cp += itf8_get(cp, &codes[i].len);
        if (max_len < codes[i].len)
            max_len = codes[i].len;
    }

    /* i != ncodes catches a length table cut short at the end of data */
    if (i != ncodes || cp - data != size || max_len >= ncodes) {
        fprintf(stderr, "Malformed huffman header stream\n");
        cram_huffman_decode_free(h);
        return NULL;
    }

    /* Sort by bit length and then by symbol value */
    qsort(codes, ncodes, sizeof(*codes), code_sort);

    /* Assign canonical codes */
    val = -1, last_len = 0;
    for (i = 0; i < ncodes; i++) {
        val++;
        while (codes[i].len > last_len) {
            val <<= 1;
            last_len++;
        }
        codes[i].code = val;
    }

    /*
     * Compute the next starting point, offset by the i'th value.
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
     * codes[10..13].p = 30 - 10.
     */
    last_len = 0;
    for (i = j = 0; i < ncodes; i++) {
        if (codes[i].len > last_len) {
            j = codes[i].code - i;
            last_len = codes[i].len;
        }
        codes[i].p = j;
    }

    h->codec = E_HUFFMAN;
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
        if (h->huffman.codes[0].len == 0)
            h->decode = cram_huffman_decode_char0;
        else
            h->decode = cram_huffman_decode_char;
    } else if (option == E_BYTE_ARRAY_BLOCK) {
        /* The type comes from the file, so fail cleanly rather than abort */
        fprintf(stderr, "Unsupported data type for huffman codec\n");
        cram_huffman_decode_free(h);
        return NULL;
    } else {
        if (h->huffman.codes[0].len == 0)
            h->decode = cram_huffman_decode_int0;
        else
            h->decode = cram_huffman_decode_int;
    }

    return (cram_codec *)h;
}
/*
 * Encoder used for byte data when the single code has zero length:
 * nothing is written.  All arguments are unused.  Always returns 0.
 */
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
                              char *in, int in_size)
{
    (void)slice;
    (void)c;
    (void)in;
    (void)in_size;

    return 0;
}
/*
 * Huffman-encodes 'in_size' bytes from 'in' (read as unsigned char) onto
 * the codec's output block.  'slice' is not used.
 *
 * Symbols below MAX_HUFF are found through the val2code lookup table;
 * anything else falls back to a linear search of the code table.
 * An 'in_size' of zero or less writes nothing and returns 0.
 *
 * Returns the OR of the store_bits_MSB() results (0 when all succeeded);
 *        -1 if a symbol has no code.
 */
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int i, code, len, r = 0;
    unsigned char *syms = (unsigned char *)in;

    /* Test before reading so that empty input is never dereferenced */
    while (in_size-- > 0) {
        int sym = *syms++;

        if (sym >= -1 && sym < MAX_HUFF) {
            i = c->e_huffman.val2code[sym+1];
            assert(c->e_huffman.codes[i].symbol == sym);
            code = c->e_huffman.codes[i].code;
            len  = c->e_huffman.codes[i].len;
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
            for (i = 0; i < c->e_huffman.nvals; i++) {
                if (c->e_huffman.codes[i].symbol == sym)
                    break;
            }
            if (i == c->e_huffman.nvals)
                return -1;

            code = c->e_huffman.codes[i].code;
            len  = c->e_huffman.codes[i].len;
        }

        r |= store_bits_MSB(c->out, code, len);
    }

    return r;
}
/*
 * Encoder used for integer data when the single code has zero length:
 * nothing is written.  All arguments are unused.  Always returns 0.
 */
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
                             char *in, int in_size)
{
    (void)slice;
    (void)c;
    (void)in;
    (void)in_size;

    return 0;
}
/*
 * Huffman-encodes 'in_size' values from 'in' (treated as an int array)
 * onto the codec's output block.  'slice' is not used.
 *
 * Symbols from -1 up to MAX_HUFF-1 are found through the val2code lookup
 * table; anything else falls back to a linear search of the code table.
 * An 'in_size' of zero or less writes nothing and returns 0.
 *
 * Returns the OR of the store_bits_MSB() results (0 when all succeeded);
 *        -1 if a symbol has no code.
 */
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    int i, code, len, r = 0;
    int *syms = (int *)in;

    /* Test before reading so that empty input is never dereferenced */
    while (in_size-- > 0) {
        int sym = *syms++;

        if (sym >= -1 && sym < MAX_HUFF) {
            i = c->e_huffman.val2code[sym+1];
            assert(c->e_huffman.codes[i].symbol == sym);
            code = c->e_huffman.codes[i].code;
            len  = c->e_huffman.codes[i].len;
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
            for (i = 0; i < c->e_huffman.nvals; i++) {
                if (c->e_huffman.codes[i].symbol == sym)
                    break;
            }
            if (i == c->e_huffman.nvals)
                return -1;

            code = c->e_huffman.codes[i].code;
            len  = c->e_huffman.codes[i].len;
        }

        r |= store_bits_MSB(c->out, code, len);
    }

    return r;
}
/*
 * Releases a HUFFMAN encoder together with its code table.
 * A NULL codec is accepted.
 */
void cram_huffman_encode_free(cram_codec *c) {
    if (c == NULL)
        return;

    /* free(NULL) is a no-op, so the table pointer needs no guard */
    free(c->e_huffman.codes);
    free(c);
}
/*
 * Encodes a huffman tree onto block 'b': the optional 'prefix' string,
 * the codec id, the parameter length, then the parameters themselves
 * (symbol count, symbols, count again, code lengths; all ITF8).
 * 'version' is not used.
 *
 * Returns number of bytes written, or -1 on memory allocation failure.
 */
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
                              int version) {
    cram_huffman_code *codes = c->e_huffman.codes;
    char *params, *end;
    int j, total = 0;

    /*
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
     * case huffman tree needs symbols with freqs matching the Fibonacci
     * series). So guaranteed 1 byte per code.
     *
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
     *
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
     */
    params = malloc(6*c->e_huffman.nvals+16);
    if (params == NULL)
        return -1;
    end = params;

    if (prefix != NULL) {
        size_t plen = strlen(prefix);

        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    /* Symbol table */
    end += itf8_put(end, c->e_huffman.nvals);
    for (j = 0; j < c->e_huffman.nvals; j++)
        end += itf8_put(end, codes[j].symbol);

    /* Matching table of code lengths */
    end += itf8_put(end, c->e_huffman.nvals);
    for (j = 0; j < c->e_huffman.nvals; j++)
        end += itf8_put(end, codes[j].len);

    total += itf8_put_blk(b, c->codec);
    total += itf8_put_blk(b, end - params);
    BLOCK_APPEND(b, params, end - params);
    total += end - params;

    free(params);

    return total;
}
/*
 * Makes sure vals[] and freqs[] can hold entry number 'used', doubling the
 * capacity (starting at 1024) when they are full.
 *
 * Returns 0 on success
 *        -1 on allocation failure; both arrays remain valid and owned by
 *           the caller, who must still free them.
 */
static int huffman_stats_reserve(int **vals, int **freqs,
                                 int *alloc, int used) {
    int new_alloc;
    int *tmp;

    if (used < *alloc)
        return 0;

    new_alloc = *alloc ? *alloc*2 : 1024;

    if (!(tmp = realloc(*vals, new_alloc * sizeof(int))))
        return -1;
    *vals = tmp;

    if (!(tmp = realloc(*freqs, new_alloc * sizeof(int))))
        return -1;
    *freqs = tmp;

    *alloc = new_alloc;
    return 0;
}

/*
 * Builds a canonical huffman encoder from the symbol statistics in st.
 *
 * Symbols are gathered from st->freqs[] and, when present, the st->h hash.
 * Code lengths are derived by repeatedly merging the two least frequent
 * nodes; codes are then assigned canonically in code_sort() order.
 * 'option' selects the byte or integer encode function; 'dat' and
 * 'version' are not used here.
 *
 * Returns the new codec on success
 *         NULL on memory allocation failure (nothing is leaked)
 */
cram_codec *cram_huffman_encode_init(cram_stats *st,
                                     enum cram_external_type option,
                                     void *dat,
                                     int version) {
    int *vals = NULL, *freqs = NULL, *lens = NULL, *tmp;
    int vals_alloc = 0, nvals = 0;
    int i, k, code, len;
    cram_codec *c;
    cram_huffman_code *codes = NULL;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_HUFFMAN;

    /* Count number of unique symbols */
    for (i = 0; i < MAX_STAT_VAL; i++) {
        if (!st->freqs[i])
            continue;
        if (huffman_stats_reserve(&vals, &freqs, &vals_alloc, nvals) < 0)
            goto err;
        vals[nvals] = i;
        freqs[nvals] = st->freqs[i];
        assert(st->freqs[i] > 0);
        nvals++;
    }
    if (st->h) {
        khint_t hk;

        for (hk = kh_begin(st->h); hk != kh_end(st->h); hk++) {
            if (!kh_exist(st->h, hk))
                continue;
            if (huffman_stats_reserve(&vals, &freqs, &vals_alloc, nvals) < 0)
                goto err;
            vals[nvals]= kh_key(st->h, hk);
            freqs[nvals] = kh_val(st->h, hk);
            assert(freqs[nvals] > 0);
            nvals++;
        }
    }

    assert(nvals > 0);

    /* Room for the leaves plus the internal nodes created below */
    if (!(tmp = realloc(freqs, 2*nvals*sizeof(*freqs))))
        goto err;
    freqs = tmp;
    lens = calloc(2*nvals, sizeof(*lens));
    if (!lens)
        goto err;

    /* Inefficient, use pointers to form chain so we can insert and maintain
     * a sorted list? This is currently O(nvals^2) complexity.
     */
    for (;;) {
        int low1 = INT_MAX, low2 = INT_MAX;
        int ind1 = 0, ind2 = 0;

        /* Find the two lowest-frequency nodes not yet merged */
        for (i = 0; i < nvals; i++) {
            if (freqs[i] < 0)
                continue;
            if (low1 > freqs[i])
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
            else if (low2 > freqs[i])
                low2 = freqs[i], ind2 = i;
        }
        if (low2 == INT_MAX)
            break;

        /* New parent node; lens[] temporarily holds the parent index and
         * a negated frequency marks a node as already merged. */
        freqs[nvals] = low1 + low2;
        lens[ind1] = nvals;
        lens[ind2] = nvals;
        freqs[ind1] *= -1;
        freqs[ind2] *= -1;
        nvals++;
    }
    nvals = nvals/2+1;   /* back to the number of leaf symbols */

    /* Assign lengths */
    for (i = 0; i < nvals; i++) {
        int code_len = 0;
        for (k = lens[i]; k; k = lens[k])
            code_len++;
        lens[i] = code_len;
        freqs[i] *= -1;
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
    }

    /* Sort, need in a struct */
    if (!(codes = malloc(nvals * sizeof(*codes))))
        goto err;
    for (i = 0; i < nvals; i++) {
        codes[i].symbol = vals[i];
        codes[i].len = lens[i];
    }
    qsort(codes, nvals, sizeof(*codes), code_sort);

    /*
     * Generate canonical codes from lengths.
     * Sort by length.
     * Start with 0.
     * Every new code of same length is +1.
     * Every new code of new length is +1 then <<1 per extra length.
     *
     * /\
     * a/\
     * /\/\
     * bcd/\
     *    ef
     *
     * a 1  0
     * b 3  4 (0+1)<<2
     * c 3  5
     * d 3  6
     * e 4  14  (6+1)<<1
     * f 5  15
     */
    code = 0; len = codes[0].len;
    for (i = 0; i < nvals; i++) {
        while (len != codes[i].len) {
            code<<=1;
            len++;
        }
        codes[i].code = code++;

        /* Small symbols get a direct symbol -> table index lookup */
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
            c->e_huffman.val2code[codes[i].symbol+1] = i;

        //fprintf(stderr, "sym %d, code %d, len %d\n",
        //        codes[i].symbol, codes[i].code, codes[i].len);
    }

    free(lens);
    free(vals);
    free(freqs);

    c->e_huffman.codes = codes;
    c->e_huffman.nvals = nvals;

    c->free = cram_huffman_encode_free;
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
        if (c->e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_char0;
        else
            c->encode = cram_huffman_encode_char;
    } else {
        if (c->e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_int0;
        else
            c->encode = cram_huffman_encode_int;
    }
    c->store = cram_huffman_encode_store;

    return c;

 err:
    free(vals);
    free(freqs);
    free(lens);
    free(codes);
    free(c);
    return NULL;
}
/*
* ---------------------------------------------------------------------------
* BYTE_ARRAY_LEN
*/
/*
 * Decodes one BYTE_ARRAY_LEN item: the length is read with len_codec and
 * then that many values are read with value_codec into 'out'.
 *
 * On success *out_size is set to the decoded length.
 *
 * Returns 0 on success
 *         -1 if a sub-codec is missing or the decoded length is negative
 *         otherwise the non-zero status of the failing sub-codec
 */
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
                               cram_block *in, char *out,
                               int *out_size) {
    cram_codec *len_codec = c->byte_array_len.len_codec;
    cram_codec *val_codec = c->byte_array_len.value_codec;
    int32_t len = 0, one = 1;
    int r;

    if (!len_codec || !val_codec)
        return -1;

    /* Fetch length */
    r = len_codec->decode(slice, len_codec, in, (char *)&len, &one);
    if (r != 0)
        return r;
    //printf("ByteArray Len=%d\n", len);

    /* A negative count can only come from corrupt input */
    if (len < 0)
        return -1;

    r = val_codec->decode(slice, val_codec, in, out, &len);
    if (r != 0)
        return r;

    *out_size = len;

    return 0;
}
/*
 * Frees a BYTE_ARRAY_LEN decoder together with whichever of its two
 * sub-codecs are set. NULL is accepted.
 */
void cram_byte_array_len_decode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->byte_array_len.len_codec;
    if (sub)
        sub->free(sub);

    sub = c->byte_array_len.value_codec;
    if (sub)
        sub->free(sub);

    free(c);
}
/*
 * Builds a BYTE_ARRAY_LEN decoder from its serialised parameters.
 *
 * 'data' holds two consecutive (encoding, size, payload) records: the first
 * describes the length codec (decoded as E_INT), the second the value codec
 * (decoded with 'option'). Together they must consume exactly 'size' bytes.
 *
 * Returns the codec on success
 *         NULL on allocation failure, if either sub-codec cannot be
 *         created, or if the header is malformed. Any sub-codec already
 *         created is freed.
 */
cram_codec *cram_byte_array_len_decode_init(char *data, int size,
                                            enum cram_external_type option,
                                            int version) {
    cram_codec *c;
    char *cp = data;
    int32_t encoding;
    int32_t sub_size;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_BYTE_ARRAY_LEN;
    c->decode = cram_byte_array_len_decode;
    c->free   = cram_byte_array_len_decode_free;
    /* So the error path can free a partially built codec */
    c->byte_array_len.len_codec   = NULL;
    c->byte_array_len.value_codec = NULL;

    /* Length codec */
    cp += itf8_get(cp, &encoding);
    cp += itf8_get(cp, &sub_size);
    if (sub_size < 0 || sub_size > size - (cp - data))
        goto malformed;
    c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size,
                                                    E_INT, version);
    if (!c->byte_array_len.len_codec)
        goto fail;
    cp += sub_size;

    /* Value codec */
    cp += itf8_get(cp, &encoding);
    cp += itf8_get(cp, &sub_size);
    if (sub_size < 0 || sub_size > size - (cp - data))
        goto malformed;
    c->byte_array_len.value_codec = cram_decoder_init(encoding, cp, sub_size,
                                                      option, version);
    if (!c->byte_array_len.value_codec)
        goto fail;
    cp += sub_size;

    if (cp - data != size)
        goto malformed;

    return c;

 malformed:
    fprintf(stderr, "Malformed byte_array_len header stream\n");
 fail:
    cram_byte_array_len_decode_free(c);
    return NULL;
}
/*
 * Encodes a byte array as its length (via len_codec, as one 32-bit int)
 * followed by its in_size bytes (via val_codec).
 *
 * Returns the OR of the two sub-encoder results.
 */
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
                               char *in, int in_size) {
    cram_codec *len_enc = c->e_byte_array_len.len_codec;
    cram_codec *val_enc = c->e_byte_array_len.val_codec;
    int32_t nbytes = in_size;
    int status;

    status  = len_enc->encode(slice, len_enc, (char *)&nbytes, 1);
    status |= val_enc->encode(slice, val_enc, in, in_size);

    return status;
}
/*
 * Frees a BYTE_ARRAY_LEN encoder together with whichever of its length and
 * value sub-encoders are set. NULL is accepted.
 */
void cram_byte_array_len_encode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->e_byte_array_len.len_codec;
    if (sub)
        sub->free(sub);

    sub = c->e_byte_array_len.val_codec;
    if (sub)
        sub->free(sub);

    free(c);
}
/*
 * Serialises a BYTE_ARRAY_LEN encoder into block b: optional prefix, codec
 * id, combined size of the two sub-codec descriptions, then the length
 * codec description followed by the value codec description.
 *
 * The sub-codecs are serialised into temporary blocks first, so b is left
 * untouched if anything fails.
 *
 * Returns number of bytes written on success
 *         -1 on failure (block allocation or sub-codec store failure)
 */
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
                                     char *prefix, int version) {
    int len = 0, len2, len3, ret = -1;
    cram_codec *tc;
    cram_block *b_len, *b_val;

    b_len = cram_new_block(0, 0);
    b_val = cram_new_block(0, 0);
    if (!b_len || !b_val)
        goto out;

    tc = c->e_byte_array_len.len_codec;
    len2 = tc->store(tc, b_len, NULL, version);
    if (len2 < 0)
        goto out;

    tc = c->e_byte_array_len.val_codec;
    len3 = tc->store(tc, b_val, NULL, version);
    if (len3 < 0)
        goto out;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    len += itf8_put_blk(b, c->codec);
    len += itf8_put_blk(b, len2+len3);
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));

    ret = len + len2 + len3;

 out:
    if (b_len)
        cram_free_block(b_len);
    if (b_val)
        cram_free_block(b_val);

    return ret;
}
/*
 * Creates a BYTE_ARRAY_LEN encoder.
 *
 * 'dat' points to a cram_byte_array_len_encoder naming the encodings and
 * parameters for the length sub-encoder (initialised as E_INT) and the
 * value sub-encoder (initialised as E_BYTE_ARRAY). 'st' and 'option' are
 * not used here.
 *
 * Returns the codec on success
 *         NULL on allocation failure or if either sub-encoder could not be
 *         created (anything already created is freed)
 */
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
                                            enum cram_external_type option,
                                            void *dat,
                                            int version) {
    cram_codec *c;
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_BYTE_ARRAY_LEN;
    c->free = cram_byte_array_len_encode_free;
    c->encode = cram_byte_array_len_encode;
    c->store = cram_byte_array_len_encode_store;

    c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
                                                      NULL, E_INT,
                                                      e->len_dat,
                                                      version);
    c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
                                                      NULL, E_BYTE_ARRAY,
                                                      e->val_dat,
                                                      version);

    /* encode() and store() dereference both sub-encoders unconditionally */
    if (!c->e_byte_array_len.len_codec || !c->e_byte_array_len.val_codec) {
        cram_byte_array_len_encode_free(c);
        return NULL;
    }

    return c;
}
/*
* ---------------------------------------------------------------------------
* BYTE_ARRAY_STOP
*/
/*
 * Decodes one BYTE_ARRAY_STOP item from the external block whose content_id
 * matches the codec, copying bytes to 'out' up to (not including) the stop
 * byte and advancing the block's read index past it.
 *
 * On success *out_size is set to the number of bytes copied.
 *
 * Returns 0 on success (also when the block is absent from block_by_id and
 *           *out_size is zero)
 *         -1 if the block cannot be found, is already exhausted, or ends
 *            before a stop byte is seen
 */
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
                                            cram_block *in, char *out,
                                            int *out_size) {
    int i;
    cram_block *b = NULL;
    char *cp, *cp_end, ch;

    if (slice->block_by_id) {
        if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
            return *out_size?-1:0;
    } else {
        for (i = 0; i < slice->hdr->num_blocks; i++) {
            b = slice->block[i];
            if (b && b->content_type == EXTERNAL &&
                b->content_id == c->byte_array_stop.content_id) {
                break;
            }
        }
        if (i == slice->hdr->num_blocks || !b)
            return -1;
    }

    if (b->idx >= b->uncomp_size)
        return -1;

    cp     = (char *)b->data + b->idx;
    cp_end = (char *)b->data + b->uncomp_size;

    for (;;) {
        /* Bounds test comes first so we never read past the block */
        if (cp >= cp_end)
            return -1;
        ch = *cp;
        if (ch == (char)c->byte_array_stop.stop)
            break;
        *out++ = ch;
        cp++;
    }

    *out_size = cp - (char *)(b->data + b->idx);
    b->idx = cp - (char *)b->data + 1;   /* skip the stop byte */

    return 0;
}
/*
 * Variant of the BYTE_ARRAY_STOP decoder whose destination ('out_') is a
 * cram_block rather than a raw buffer. Bytes up to the stop byte, or the
 * end of the source block, are appended to that block and the source
 * block's read index is advanced one past where scanning stopped.
 *
 * On success *out_size is set to the number of bytes appended.
 *
 * Returns 0 on success (also when the block is absent from block_by_id and
 *           *out_size is zero)
 *         -1 if the source block cannot be found or is already exhausted
 */
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
                                      cram_block *in, char *out_,
                                      int *out_size) {
    cram_block *b = NULL;
    cram_block *out = (cram_block *)out_;
    char *cp, *out_cp, *cp_end;
    char stop;

    if (slice->block_by_id) {
        if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
            return *out_size?-1:0;
    } else {
        int i;

        for (i = 0; i < slice->hdr->num_blocks; i++) {
            b = slice->block[i];
            if (b && b->content_type == EXTERNAL &&
                b->content_id == c->byte_array_stop.content_id) {
                break;
            }
        }
        if (i == slice->hdr->num_blocks || !b)
            return -1;
    }

    if (b->idx >= b->uncomp_size)
        return -1;

    cp = (char *)b->data + b->idx;
    cp_end = (char *)b->data + b->uncomp_size;
    out_cp = (char *)BLOCK_END(out);

    stop = c->byte_array_stop.stop;
    if (cp_end - cp < out->alloc - out->byte) {
        /* Everything left fits in the spare capacity: copy directly.
         * cp is tested against cp_end before being dereferenced. */
        while (cp != cp_end && *cp != stop)
            *out_cp++ = *cp++;
        BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
    } else {
        /* Find the extent first, then append it in one go */
        char *cp_start;
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
            ;
        BLOCK_APPEND(out, cp_start, cp - cp_start);
        BLOCK_GROW(out, cp - cp_start);
    }

    *out_size = cp - (char *)(b->data + b->idx);
    b->idx = cp - (char *)b->data + 1;

    return 0;
}
/*
 * Frees a BYTE_ARRAY_STOP decoder. It owns no other allocations, and NULL
 * is accepted.
 */
void cram_byte_array_stop_decode_free(cram_codec *c) {
    free(c);   /* free(NULL) is a no-op */
}
/*
 * Builds a BYTE_ARRAY_STOP decoder from its serialised parameters: one stop
 * byte followed by the content id, stored as a 4-byte little-endian integer
 * for CRAM major version 1 and as itf8 otherwise.
 *
 * 'option' selects the block-appending decoder (E_BYTE_ARRAY_BLOCK) or the
 * plain char-buffer decoder (anything else).
 *
 * Returns the codec on success
 *         NULL on allocation failure or if the parameters do not occupy
 *         exactly 'size' bytes
 */
cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
                                             enum cram_external_type option,
                                             int version) {
    cram_codec *c;
    unsigned char *cp = (unsigned char *)data;
    /* Stop byte plus 4 raw bytes (v1) or at least one itf8 byte */
    int min_size = (CRAM_MAJOR_VERS(version) == 1) ? 5 : 2;

    if (size < min_size) {
        fprintf(stderr, "Malformed byte_array_stop header stream\n");
        return NULL;
    }

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_BYTE_ARRAY_STOP;
    c->decode = (option == E_BYTE_ARRAY_BLOCK)
        ? cram_byte_array_stop_decode_block
        : cram_byte_array_stop_decode_char;
    c->free   = cram_byte_array_stop_decode_free;

    c->byte_array_stop.stop = *cp++;
    if (CRAM_MAJOR_VERS(version) == 1) {
        /* Top byte is widened to unsigned so the shift cannot overflow int */
        c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
            + ((unsigned int)cp[3]<<24);
        cp += 4;
    } else {
        cp += itf8_get(cp, &c->byte_array_stop.content_id);
    }

    if ((char *)cp - data != size) {
        fprintf(stderr, "Malformed byte_array_stop header stream\n");
        free(c);
        return NULL;
    }

    return c;
}
/*
 * Appends in_size bytes from 'in' to the codec's output block, followed by
 * the configured stop byte.
 *
 * Always returns 0.
 */
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
                                char *in, int in_size) {
    /* Payload first ... */
    BLOCK_APPEND(c->out, in, in_size);

    /* ... then the terminator */
    BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop);

    return 0;
}
/*
 * Frees a BYTE_ARRAY_STOP encoder. It owns no other allocations, and NULL
 * is accepted.
 */
void cram_byte_array_stop_encode_free(cram_codec *c) {
    free(c);   /* free(NULL) is a no-op */
}
/*
 * Serialises a BYTE_ARRAY_STOP encoder into block b: optional prefix, codec
 * id (itf8), parameter size (itf8), stop byte, then the content id as 4
 * little-endian bytes for CRAM major version 1 or as itf8 otherwise.
 *
 * Returns number of bytes written.
 */
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
                                      char *prefix, int version) {
    char hdr[20];
    char *wp = hdr;
    int total = 0;

    if (prefix) {
        size_t plen = strlen(prefix);
        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    wp += itf8_put(wp, c->codec);

    if (CRAM_MAJOR_VERS(version) != 1) {
        /* Newer layout: size, stop byte, itf8 content id */
        wp += itf8_put(wp, 1 + itf8_size(c->e_byte_array_stop.content_id));
        *wp++ = c->e_byte_array_stop.stop;
        wp += itf8_put(wp, c->e_byte_array_stop.content_id);
    } else {
        /* Version 1 layout: fixed size of 5, stop byte, 32-bit LE id */
        int shift;

        wp += itf8_put(wp, 5);
        *wp++ = c->e_byte_array_stop.stop;
        for (shift = 0; shift <= 24; shift += 8)
            *wp++ = (c->e_byte_array_stop.content_id >> shift) & 0xff;
    }

    BLOCK_APPEND(b, hdr, wp-hdr);
    total += wp-hdr;

    return total;
}
/*
 * Creates a BYTE_ARRAY_STOP encoder.
 *
 * 'dat' is read as an int array: element 0 is the stop value and element 1
 * the content id. 'st', 'option' and 'version' are not used here.
 *
 * Returns the codec on success
 *         NULL on allocation failure
 */
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
                                             enum cram_external_type option,
                                             void *dat,
                                             int version) {
    cram_codec *enc = malloc(sizeof(*enc));

    if (enc == NULL)
        return NULL;

    enc->codec  = E_BYTE_ARRAY_STOP;
    enc->free   = cram_byte_array_stop_encode_free;
    enc->encode = cram_byte_array_stop_encode;
    enc->store  = cram_byte_array_stop_encode_store;

    enc->e_byte_array_stop.stop       = ((int *)dat)[0];
    enc->e_byte_array_stop.content_id = ((int *)dat)[1];

    return enc;
}
/*
* ---------------------------------------------------------------------------
*/
char *cram_encoding2str(enum cram_encoding t) {
switch (t) {
case E_NULL: return "NULL";
case E_EXTERNAL: return "EXTERNAL";
case E_GOLOMB: return "GOLOMB";
case E_HUFFMAN: return "HUFFMAN";
case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN";
case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
case E_BETA: return "BETA";
case E_SUBEXP: return "SUBEXP";
case E_GOLOMB_RICE: return "GOLOMB_RICE";
case E_GAMMA: return "GAMMA";
}
return "?";
}
/*
 * Decoder constructors, indexed by the enum cram_encoding value that is
 * passed to cram_decoder_init(). Each entry takes the serialised codec
 * parameters (data, size), the external type option and the CRAM version.
 *
 * A NULL slot means there is no decoder for that encoding value;
 * cram_decoder_init() reports it as unimplemented and returns NULL.
 */
static cram_codec *(*decode_init[])(char *data,
int size,
enum cram_external_type option,
int version) = {
NULL,
cram_external_decode_init,
NULL,
cram_huffman_decode_init,
cram_byte_array_len_decode_init,
cram_byte_array_stop_decode_init,
cram_beta_decode_init,
cram_subexp_decode_init,
NULL,
cram_gamma_decode_init,
};
/*
 * Creates a decoder for 'codec' from its serialised parameters.
 *
 * The codec id is validated against the decode_init[] table before it is
 * used as an index, so an id outside the table is rejected rather than
 * read out of bounds.
 *
 * Returns the codec on success
 *         NULL if the codec id is out of range, has no decoder, or its
 *         constructor fails
 */
cram_codec *cram_decoder_init(enum cram_encoding codec,
                              char *data, int size,
                              enum cram_external_type option,
                              int version) {
    const unsigned int ncodecs = sizeof(decode_init) / sizeof(decode_init[0]);

    /* Unsigned compare also rejects negative ids */
    if ((unsigned int)codec >= ncodecs) {
        fprintf(stderr, "Unknown codec type %d\n", (int)codec);
        return NULL;
    }

    if (decode_init[codec]) {
        return decode_init[codec](data, size, option, version);
    } else {
        fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec));
        return NULL;
    }
}
/*
 * Encoder constructors, indexed by the enum cram_encoding value that is
 * passed to cram_encoder_init(). Each entry takes the symbol statistics,
 * the external type option, codec-specific parameters and the CRAM version.
 *
 * A NULL slot means there is no encoder for that encoding value;
 * cram_encoder_init() prints a message and aborts when asked for one.
 */
static cram_codec *(*encode_init[])(cram_stats *stx,
enum cram_external_type option,
void *opt,
int version) = {
NULL,
cram_external_encode_init,
NULL,
cram_huffman_encode_init,
cram_byte_array_len_encode_init,
cram_byte_array_stop_encode_init,
cram_beta_encode_init,
NULL, //cram_subexp_encode_init,
NULL,
NULL, //cram_gamma_encode_init,
};
/*
 * Creates an encoder for 'codec'.
 *
 * If statistics are supplied but contain no values there is nothing to
 * encode and NULL is returned. On success the new codec's output block is
 * initialised to NULL.
 *
 * The codec id is validated against the encode_init[] table before it is
 * used as an index. Requesting a codec that is out of range or has no
 * encoder is treated as a programming error: a message is printed and the
 * process aborts.
 *
 * Returns the codec on success
 *         NULL if st has no values or the constructor fails
 */
cram_codec *cram_encoder_init(enum cram_encoding codec,
                              cram_stats *st,
                              enum cram_external_type option,
                              void *dat,
                              int version) {
    const unsigned int ncodecs = sizeof(encode_init) / sizeof(encode_init[0]);
    cram_codec *r;

    if (st && !st->nvals)
        return NULL;

    /* Unsigned compare also rejects negative ids */
    if ((unsigned int)codec >= ncodecs) {
        fprintf(stderr, "Unknown codec type %d\n", (int)codec);
        abort();
    }

    if (!encode_init[codec]) {
        fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec));
        abort();
    }

    if ((r = encode_init[codec](st, option, dat, version)))
        r->out = NULL;

    return r;
}
/*
 * Reports which block(s) a codec takes its data from.
 *
 * The return value is the content_id for EXTERNAL and BYTE_ARRAY_STOP,
 * -1 for codecs that use the CORE block, and -2 when no block is needed
 * (NULL codec, or HUFFMAN with a single code). An unrecognised codec type
 * is reported on stderr and treated as CORE.
 *
 * BYTE_ARRAY_LEN uses two codecs: the length codec's id is returned and
 * the value codec's id is stored in *id2. For every other codec *id2 is
 * set to -2. id2 may be NULL.
 */
int cram_codec_to_id(cram_codec *c, int *id2) {
    int primary;
    int secondary = -2;

    switch (c->codec) {
    case E_EXTERNAL:
        primary = c->external.content_id;
        break;

    case E_BYTE_ARRAY_STOP:
        primary = c->byte_array_stop.content_id;
        break;

    case E_BYTE_ARRAY_LEN:
        primary   = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
        secondary = cram_codec_to_id(c->byte_array_len.value_codec, NULL);
        break;

    case E_HUFFMAN:
        if (c->huffman.ncodes == 1)
            primary = -2;
        else
            primary = -1;
        break;

    case E_NULL:
        primary = -2;
        break;

    case E_GOLOMB:
    case E_BETA:
    case E_SUBEXP:
    case E_GOLOMB_RICE:
    case E_GAMMA:
        primary = -1;
        break;

    default:
        fprintf(stderr, "Unknown codec type %d\n", c->codec);
        primary = -1;
        break;
    }

    if (id2 != NULL)
        *id2 = secondary;

    return primary;
}
htslib-1.2.1/cram/cram_codecs.h 0000664 0000000 0000000 00000012105 12464172677 0016354 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CRAM_ENCODINGS_H_
#define _CRAM_ENCODINGS_H_
#ifdef __cplusplus
extern "C" {
#endif
#include
struct cram_codec;
/*
 * Slow but simple huffman decoder to start with.
 * Read a bit at a time, keeping track of {length, value}
 * eg. 1 1 0 1 => {1,1}, {2,3}, {3,6}, {4,13}
 *
 * Keep track of this through the huffman code table.
 * For fast scanning we have an index of where the first code of length X
 * appears.
 *
 * cram_huffman_code is one entry of that table. The same structure is used
 * by the encoder, which fills in symbol, len and code.
 */
typedef struct {
int32_t symbol; // value this code stands for
int32_t p; // next code start value, minus index to codes[]
int32_t code; // the code's bit pattern
int32_t len; // code length in bits; 0 means nothing is emitted
} cram_huffman_code;
/* Huffman decoder state: the code table and its size. */
typedef struct {
int ncodes; // number of entries in codes[]
cram_huffman_code *codes;
} cram_huffman_decoder;
/* Symbols in [-1, MAX_HUFF) get a direct lookup slot in the encoder. */
#define MAX_HUFF 128
/* Huffman encoder state. */
typedef struct {
cram_huffman_code *codes; // code table; freed along with the encoder
int nvals; // number of entries in codes[]
int val2code[MAX_HUFF+1]; // value to code lookup for small values
// (indexed by symbol+1, yields an index into codes[])
} cram_huffman_encoder;
/* Parameters of the BETA codec. */
typedef struct {
int32_t offset;
int32_t nbits;
} cram_beta_decoder;
/* Parameters of the GAMMA codec. */
typedef struct {
int32_t offset;
} cram_gamma_decoder;
/* Parameters of the SUBEXP codec. */
typedef struct {
int32_t offset;
int32_t k;
} cram_subexp_decoder;
/* Parameters of the EXTERNAL codec. */
typedef struct {
int32_t content_id; // id of the block holding the data
enum cram_external_type type;
} cram_external_decoder;
/*
 * BYTE_ARRAY_LEN decoder: one codec for the array length and one for the
 * array contents. Both are owned and freed with the decoder.
 */
typedef struct {
struct cram_codec *len_codec;
struct cram_codec *value_codec;
} cram_byte_array_len_decoder;
/* BYTE_ARRAY_STOP parameters; also used for the encoder side. */
typedef struct {
unsigned char stop; // byte value that terminates each array
int32_t content_id; // id of the external block holding the data
} cram_byte_array_stop_decoder;
/*
 * BYTE_ARRAY_LEN encoder. The *_encoding and *_dat members describe the
 * sub-encoders to create (they are what cram_encoder_init() is given);
 * the *_codec members hold the sub-encoders themselves.
 */
typedef struct {
enum cram_encoding len_encoding;
enum cram_encoding val_encoding;
void *len_dat;
void *val_dat;
struct cram_codec *len_codec;
struct cram_codec *val_codec;
} cram_byte_array_len_encoder;
/*
 * A generic codec structure.
 *
 * 'codec' identifies the encoding and therefore which union member is in
 * use. The function pointers are filled in by the codec's init function;
 * a decoder sets decode and free, an encoder sets encode, store and free.
 */
typedef struct cram_codec {
enum cram_encoding codec;
cram_block *out; // encoder output block; set to NULL by cram_encoder_init()
void (*free)(struct cram_codec *codec);
int (*decode)(cram_slice *slice, struct cram_codec *codec,
cram_block *in, char *out, int *out_size);
int (*encode)(cram_slice *slice, struct cram_codec *codec,
char *in, int in_size);
int (*store)(struct cram_codec *codec, cram_block *b, char *prefix,
int version);
union {
// Decoder state
cram_huffman_decoder huffman;
cram_external_decoder external;
cram_beta_decoder beta;
cram_gamma_decoder gamma;
cram_subexp_decoder subexp;
cram_byte_array_len_decoder byte_array_len;
cram_byte_array_stop_decoder byte_array_stop;
// Encoder state (e_ prefix); several reuse the decoder parameter structs
cram_huffman_encoder e_huffman;
cram_external_decoder e_external;
cram_byte_array_stop_decoder e_byte_array_stop;
cram_byte_array_len_encoder e_byte_array_len;
cram_beta_decoder e_beta;
};
} cram_codec;
/*
 * Returns the printable name of an encoding, or "?" if it is not known.
 * The result is a static string and must not be freed.
 */
char *cram_encoding2str(enum cram_encoding t);
/*
 * Creates a decoder for 'codec' from 'size' bytes of serialised parameters
 * at 'data'.
 *
 * Returns the codec on success
 *         NULL on failure, including when the codec has no decoder
 */
cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size,
enum cram_external_type option,
int version);
/*
 * Creates an encoder for 'codec' using statistics 'st' (may be NULL) and
 * codec-specific parameters 'dat'.
 *
 * Returns the codec on success
 *         NULL if st is given but holds no values, or on failure
 * Aborts the process if the codec has no encoder.
 */
cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st,
enum cram_external_type option, void *dat,
int version);
//int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size);
//void cram_decoder_free(void *codes);
//#define GET_BIT_MSB(b,v) (void)(v<<=1, v|=(b->data[b->byte] >> b->bit)&1, (--b->bit == -1) && (b->bit = 7, b->byte++))
/*
 * Shifts one bit from block b into the low end of v, most significant bit
 * first. b must point to an object with 'data', 'byte' and 'bit' members:
 * the bit is taken from data[byte] at position 'bit', then 'bit' counts
 * down and wraps from 0 to 7 while 'byte' advances.
 *
 * Both arguments are parenthesised so expressions such as &blk are safe to
 * pass, but each is still evaluated several times.
 */
#define GET_BIT_MSB(b,v) (void)((v)<<=1, (v)|=((b)->data[(b)->byte] >> (b)->bit)&1, (b)->byte += (--(b)->bit<0), (b)->bit&=7)
/*
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
 * Returns -1 for the CORE block and -2 for unneeded.
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs; for all
 * other codecs it is set to -2. id2 may be NULL if it is not wanted.
 */
int cram_codec_to_id(cram_codec *c, int *id2);
#ifdef __cplusplus
}
#endif
#endif /* _CRAM_ENCODINGS_H_ */
htslib-1.2.1/cram/cram_decode.c 0000664 0000000 0000000 00000214224 12464172677 0016340 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* - In-memory decoding of CRAM data structures.
* - Iterator for reading CRAM record by record.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "cram/cram.h"
#include "cram/os.h"
#include "cram/md5.h"
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
//#define USE_X
/* ----------------------------------------------------------------------
* CRAM compression headers
*/
/*
 * Decodes the Tag Dictionary record in the preservation map
 * Updates the cram compression header.
 *
 * The record is an itf8 byte count followed by that many bytes holding
 * nul-separated entries. The bytes are copied into a new block stored in
 * h->TD_blk, and h->TL[] / h->nTL are set up to index the start of each
 * entry. An empty dictionary leaves h->TD_blk and h->TL as NULL.
 *
 * Returns number of bytes decoded on success
 *        -1 on failure. If h->TD_blk is non-NULL at that point it still
 *           belongs to the header and is left for the caller to release.
 */
int cram_decode_TD(char *cp, cram_block_compression_hdr *h) {
    char *op = cp;
    unsigned char *dat;
    cram_block *b;
    int32_t blk_size;
    int nTL, i, sz;

    if (!(b = cram_new_block(0, 0)))
        return -1;

    /* Decode */
    cp += itf8_get(cp, &blk_size);
    if (blk_size <= 0) {
        /* Nothing to keep: do not leave the header pointing at a block
         * we are about to free. A negative size is corrupt input. */
        h->nTL = 0;
        h->TL = NULL;
        h->TD_blk = NULL;
        cram_free_block(b);
        return blk_size < 0 ? -1 : cp - op;
    }

    h->TD_blk = b;

    BLOCK_APPEND(b, cp, blk_size);
    cp += blk_size;
    sz = cp - op;

    // Force nul termination if missing
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
        BLOCK_APPEND_CHAR(b, '\0');

    /* Set up TL lookup table */
    dat = BLOCK_DATA(b);

    // Count
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
        nTL++;
        while (dat[i])
            i++;
    }

    // Copy
    h->nTL = nTL;
    if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *))))
        return -1;
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
        h->TL[nTL++] = &dat[i];
        while (dat[i])
            i++;
    }

    return sz;
}
/*
* Decodes a CRAM block compression header.
* Returns header ptr on success
* NULL on failure
*/
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
cram_block *b) {
char *cp, *cp_copy;
cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
int i;
int32_t map_size, map_count;
if (!hdr)
return NULL;
if (b->method != RAW) {
if (cram_uncompress_block(b)) {
free(hdr);
return NULL;
}
}
cp = (char *)b->data;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
cp += itf8_get(cp, &hdr->ref_seq_id);
cp += itf8_get(cp, &hdr->ref_seq_start);
cp += itf8_get(cp, &hdr->ref_seq_span);
cp += itf8_get(cp, &hdr->num_records);
cp += itf8_get(cp, &hdr->num_landmarks);
if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
free(hdr);
return NULL;
}
for (i = 0; i < hdr->num_landmarks; i++) {
cp += itf8_get(cp, &hdr->landmark[i]);
}
}
hdr->preservation_map = kh_init(map);
memset(hdr->rec_encoding_map, 0,
CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
memset(hdr->tag_encoding_map, 0,
CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
if (!hdr->preservation_map) {
cram_free_compression_header(hdr);
return NULL;
}
/* Initialise defaults for preservation map */
hdr->mapped_qs_included = 0;
hdr->unmapped_qs_included = 0;
hdr->unmapped_placed = 0;
hdr->qs_included = 0;
hdr->read_names_included = 0;
hdr->AP_delta = 1;
memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
/* Preservation map */
cp += itf8_get(cp, &map_size); cp_copy = cp;
cp += itf8_get(cp, &map_count);
for (i = 0; i < map_count; i++) {
pmap_t hd;
khint_t k;
int r;
cp += 2;
switch(CRAM_KEY(cp[-2],cp[-1])) {
case CRAM_KEY('M','I'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "MI", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
hdr->mapped_qs_included = hd.i;
break;
case CRAM_KEY('U','I'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "UI", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
hdr->unmapped_qs_included = hd.i;
break;
case CRAM_KEY('P','I'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "PI", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
hdr->unmapped_placed = hd.i;
break;
case CRAM_KEY('R','N'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "RN", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
hdr->read_names_included = hd.i;
break;
case CRAM_KEY('A','P'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "AP", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
hdr->AP_delta = hd.i;
break;
case CRAM_KEY('R','R'):
hd.i = *cp++;
k = kh_put(map, hdr->preservation_map, "RR", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
fd->no_ref = !hd.i;
break;
case CRAM_KEY('S','M'):
hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
hd.p = cp;
cp += 5;
k = kh_put(map, hdr->preservation_map, "SM", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
break;
case CRAM_KEY('T','D'): {
int sz = cram_decode_TD(cp, hdr); // tag dictionary
if (sz < 0) {
cram_free_compression_header(hdr);
return NULL;
}
hd.p = cp;
cp += sz;
k = kh_put(map, hdr->preservation_map, "TD", &r);
if (-1 == r) {
cram_free_compression_header(hdr);
return NULL;
}
kh_val(hdr->preservation_map, k) = hd;
break;
}
default:
fprintf(stderr, "Unrecognised preservation map key %c%c\n",
cp[-2], cp[-1]);
// guess byte;
cp++;
break;
}
}
if (cp - cp_copy != map_size) {
cram_free_compression_header(hdr);
return NULL;
}
/* Record encoding map */
cp += itf8_get(cp, &map_size); cp_copy = cp;
cp += itf8_get(cp, &map_count);
for (i = 0; i < map_count; i++) {
char *key = cp;
int32_t encoding;
int32_t size;
cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
if (!m) {
cram_free_compression_header(hdr);
return NULL;
}
cp += 2;
cp += itf8_get(cp, &encoding);
cp += itf8_get(cp, &size);
// Fill out cram_map purely for cram_dump to dump out.
m->key = (key[0]<<8)|key[1];
m->encoding = encoding;
m->size = size;
m->offset = cp - (char *)b->data;
m->codec = NULL;
if (m->encoding == E_NULL)
continue;
//printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
/*
* For CRAM1.0 CF and BF are Byte and not Int.
* Practically speaking it makes no difference unless we have a
* 1.0 format file that stores these in EXTERNAL as only then
* does Byte vs Int matter.
*
* Neither this C code nor Java reference implementations did this,
* so we gloss over it and treat them as int.
*/
if (key[0] == 'B' && key[1] == 'F') {
if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'C' && key[1] == 'F') {
if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'I') {
if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'L') {
if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'A' && key[1] == 'P') {
if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'G') {
if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'M' && key[1] == 'F') {
if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'S') {
if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'P') {
if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'S') {
if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'N' && key[1] == 'F') {
if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'C') {
if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
E_BYTE,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'N') {
if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'N') {
if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'C') {
if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
E_BYTE,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'F' && key[1] == 'P') {
if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'B' && key[1] == 'S') {
if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
E_BYTE,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'I' && key[1] == 'N') {
if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'S' && key[1] == 'C') {
if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'D' && key[1] == 'L') {
if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'B' && key[1] == 'A') {
if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
E_BYTE,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'B' && key[1] == 'B') {
if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'S') {
if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'P' && key[1] == 'D') {
if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'H' && key[1] == 'C') {
if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'M' && key[1] == 'Q') {
if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'R' && key[1] == 'N') {
if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY_BLOCK,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'Q' && key[1] == 'S') {
if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
E_BYTE,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'Q' && key[1] == 'Q') {
if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'L') {
if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
E_INT,
fd->version))) {
cram_free_compression_header(hdr);
return NULL;
}
} else if (key[0] == 'T' && key[1] == 'M') {
} else if (key[0] == 'T' && key[1] == 'V') {
} else
fprintf(stderr, "Unrecognised key: %.2s\n", key);
cp += size;
m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
}
if (cp - cp_copy != map_size) {
cram_free_compression_header(hdr);
return NULL;
}
/* Tag encoding map */
cp += itf8_get(cp, &map_size); cp_copy = cp;
cp += itf8_get(cp, &map_count);
for (i = 0; i < map_count; i++) {
int32_t encoding;
int32_t size;
cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
char *key = cp+1;
if (!m) {
cram_free_compression_header(hdr);
return NULL;
}
m->key = (key[0]<<16)|(key[1]<<8)|key[2];
cp += 4; // Strictly ITF8, but this suffices
cp += itf8_get(cp, &encoding);
cp += itf8_get(cp, &size);
m->encoding = encoding;
m->size = size;
m->offset = cp - (char *)b->data;
if (!(m->codec = cram_decoder_init(encoding, cp, size,
E_BYTE_ARRAY_BLOCK, fd->version))) {
cram_free_compression_header(hdr);
free(m);
return NULL;
}
cp += size;
m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
}
if (cp - cp_copy != map_size) {
cram_free_compression_header(hdr);
return NULL;
}
return hdr;
}
/*
* Works out which data series are needed to satisfy fd->required_fields
* for slice s, records the answer as a bit field in hdr->data_series and
* uncompresses the blocks that are found to be in use.
*
* Note we also need to scan through the record encoding map to
* see which data series share the same block, either external or
* CORE. For example if we need the BF data series but MQ and CF
* are also encoded in the same block then we need to add those in
* as a dependency in order to correctly decode BF.
*
* When fd->required_fields is 0 or INT_MAX every data series is selected
* (CRAM_ALL) and every block of the slice is uncompressed.
*
* Side effect: fd->decode_md is cleared when a subset of fields is
* requested and that subset does not include SAM_AUX.
*
* Returns 0 on success
* -1 on failure (a block failed to uncompress, or out of memory)
*/
int cram_dependent_data_series(cram_fd *fd,
cram_block_compression_hdr *hdr,
cram_slice *s) {
// block_used[j] is set once slice block j is known to be needed.
int *block_used;
// Set when a needed codec reports block id -1 (see "case -1" below).
int core_used = 0;
int i;
// Data series ids, in the order they are looked up in hdr->codecs[] by
// the scanning loops below.
static int i_to_id[] = {
DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
DS_HC, DS_SC, DS_BB, DS_QQ,
};
// Snapshot of hdr->data_series used to detect when a pass adds nothing.
uint32_t orig_ds;
/*
* Set the data_series bit field based on fd->required_fields
* contents.
*/
if (fd->required_fields && fd->required_fields != INT_MAX) {
hdr->data_series = 0;
if (fd->required_fields & SAM_QNAME)
hdr->data_series |= CRAM_RN;
if (fd->required_fields & SAM_FLAG)
hdr->data_series |= CRAM_BF;
if (fd->required_fields & SAM_RNAME)
hdr->data_series |= CRAM_RI | CRAM_BF;
if (fd->required_fields & SAM_POS)
hdr->data_series |= CRAM_AP | CRAM_BF;
if (fd->required_fields & SAM_MAPQ)
hdr->data_series |= CRAM_MQ;
if (fd->required_fields & SAM_CIGAR)
hdr->data_series |= CRAM_CIGAR;
if (fd->required_fields & SAM_RNEXT)
hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
if (fd->required_fields & SAM_PNEXT)
hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
if (fd->required_fields & SAM_TLEN)
hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
if (fd->required_fields & SAM_SEQ)
hdr->data_series |= CRAM_SEQ;
if (!(fd->required_fields & SAM_AUX))
// No easy way to get MD/NM without other tags at present
fd->decode_md = 0;
// NOTE(review): SAM_QUAL selects CRAM_SEQ, the same mask as SAM_SEQ
// above -- confirm that this is intended.
if (fd->required_fields & SAM_QUAL)
hdr->data_series |= CRAM_SEQ;
if (fd->required_fields & SAM_AUX)
hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
if (fd->required_fields & SAM_RGAUX)
hdr->data_series |= CRAM_RG | CRAM_BF;
// Always uncompress CORE block
if (cram_uncompress_block(s->block[0]))
return -1;
} else {
// Everything is wanted: uncompress every block and skip the
// dependency analysis altogether.
hdr->data_series = CRAM_ALL;
for (i = 0; i < s->hdr->num_blocks; i++) {
if (cram_uncompress_block(s->block[i]))
return -1;
}
return 0;
}
// Zero-initialised, one entry per slice block (plus one spare).
block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
if (!block_used)
return -1;
// Repeat until a complete pass leaves hdr->data_series unchanged.
do {
/*
* Also set data_series based on code prerequisites. Eg if we need
* CRAM_QS then we also need to know CRAM_RL so we know how long it
* is, or if we need FC/FP then we also need FN (number of features).
*
* It's not reciprocal though. We may be needing to decode FN
* but have no need to decode FC, FP and cigar ops.
*/
if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
// cram_decode_seq() needs seq[] array
if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
hdr->data_series |= CRAM_CF;
if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
hdr->data_series |= CRAM_CF | CRAM_NF;
if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
// Remember the set as it stands now; compared again in the loop
// condition at the bottom.
orig_ds = hdr->data_series;
// Find which blocks are in use.
for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
int bnum1, bnum2, j;
cram_codec *c = hdr->codecs[i_to_id[i]];
if (!(hdr->data_series & (1<hdr->num_blocks; j++) {
if (s->block[j]->content_type == EXTERNAL &&
s->block[j]->content_id == bnum1) {
block_used[j] = 1;
if (cram_uncompress_block(s->block[j])) {
free(block_used);
return -1;
}
}
}
break;
}
// A codec can report a second block id through bnum2; -2 or a
// repeat of bnum1 means there is nothing further to examine.
if (bnum2 == -2 || bnum1 == bnum2)
break;
bnum1 = bnum2; // 2nd pass
}
}
// Tags too
if ((fd->required_fields & SAM_AUX) ||
(hdr->data_series & CRAM_aux)) {
for (i = 0; i < CRAM_MAP_HASH; i++) {
int bnum1, bnum2, j;
cram_map *m = hdr->tag_encoding_map[i];
while (m) {
cram_codec *c = m->codec;
// NOTE(review): this 'continue' re-tests the same m without
// advancing to m->next, so an entry with a NULL codec looks
// like it would loop forever -- confirm.
if (!c)
continue;
bnum1 = cram_codec_to_id(c, &bnum2);
for (;;) {
switch (bnum1) {
case -2:
break;
case -1:
core_used = 1;
break;
default:
// Mark and uncompress every EXTERNAL block whose content id
// matches the id reported by the codec.
for (j = 0; j < s->hdr->num_blocks; j++) {
if (s->block[j]->content_type == EXTERNAL &&
s->block[j]->content_id == bnum1) {
block_used[j] = 1;
if (cram_uncompress_block(s->block[j])) {
free(block_used);
return -1;
}
}
}
break;
}
if (bnum2 == -2 || bnum1 == bnum2)
break;
bnum1 = bnum2; // 2nd pass
}
m = m->next;
}
}
}
// We now know which blocks are in use, so repeat and find
// which other data series need to be added.
for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
int bnum1, bnum2, j;
cram_codec *c = hdr->codecs[i_to_id[i]];
if (!c)
continue;
bnum1 = cram_codec_to_id(c, &bnum2);
for (;;) {
switch (bnum1) {
case -2:
break;
case -1:
if (core_used) {
//printf(" + data series %08x:\n", 1<data_series |= 1<hdr->num_blocks; j++) {
if (s->block[j]->content_type == EXTERNAL &&
s->block[j]->content_id == bnum1) {
if (block_used[j]) {
//printf(" + data series %08x:\n", 1<data_series |= 1<tag_encoding_map[i];
while (m) {
cram_codec *c = m->codec;
// NOTE(review): as in the earlier tag loop, 'continue' does not
// advance m, so a NULL codec looks like an endless loop -- confirm.
if (!c)
continue;
bnum1 = cram_codec_to_id(c, &bnum2);
for (;;) {
switch (bnum1) {
case -2:
break;
case -1:
//printf(" + data series %08x:\n", CRAM_aux);
hdr->data_series |= CRAM_aux;
break;
default:
for (j = 0; j < s->hdr->num_blocks; j++) {
// NOTE(review): the other loops test content_type == EXTERNAL;
// here any non-zero content_type passes -- confirm intent.
if (s->block[j]->content_type &&
s->block[j]->content_id == bnum1) {
if (block_used[j]) {
//printf(" + data series %08x:\n",
// CRAM_aux);
hdr->data_series |= CRAM_aux;
}
}
}
break;
}
if (bnum2 == -2 || bnum1 == bnum2)
break;
bnum1 = bnum2; // 2nd pass
}
m = m->next;
}
}
// Go round again if this pass added any data series.
} while (orig_ds != hdr->data_series);
free(block_used);
return 0;
}
/* ----------------------------------------------------------------------
* CRAM slices
*/
/*
 * Decodes a CRAM (un)mapped slice header block.
 *
 * fd  supplies the CRAM version.  The version decides how the record
 *     counter is stored (not present in version 1, ITF8 in version 2,
 *     LTF8 from version 3) and whether a 16 byte MD5 follows the other
 *     fields (all versions except 1; zeroed for version 1).
 * b   block holding the header.  Its content_type must be MAPPED_SLICE
 *     or UNMAPPED_SLICE; the reference fields are only present for
 *     MAPPED_SLICE.
 *
 * The header and its block_content_ids array are heap allocated.
 *
 * Returns slice header ptr on success
 *         NULL on failure (wrong block type, out of memory, negative
 *         content id count, or data running past the end of the block)
 */
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
    cram_block_slice_hdr *hdr;
    char *cp = (char *)b->data;
    /* NOTE(review): assumes b->uncomp_size is the number of valid bytes
     * at b->data for a slice header block -- confirm against the block
     * reader. */
    char *cp_end = cp + b->uncomp_size;
    int i;

    if (b->content_type != MAPPED_SLICE &&
        b->content_type != UNMAPPED_SLICE)
        return NULL;

    if (!(hdr = calloc(1, sizeof(*hdr))))
        return NULL;

    hdr->content_type = b->content_type;

    if (b->content_type == MAPPED_SLICE) {
        cp += itf8_get(cp, &hdr->ref_seq_id);
        cp += itf8_get(cp, &hdr->ref_seq_start);
        cp += itf8_get(cp, &hdr->ref_seq_span);
    }
    cp += itf8_get(cp, &hdr->num_records);

    hdr->record_counter = 0;
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
        int32_t i32;
        cp += itf8_get(cp, &i32);
        hdr->record_counter = i32;
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        cp += ltf8_get(cp, &hdr->record_counter);
    }

    cp += itf8_get(cp, &hdr->num_blocks);
    cp += itf8_get(cp, &hdr->num_content_ids);

    /* The count comes straight from the file; a negative value would turn
     * into an enormous allocation request. */
    if (hdr->num_content_ids < 0) {
        free(hdr);
        return NULL;
    }

    /* calloc() checks the size multiplication for overflow.  Always ask
     * for at least one element so that an empty list does not depend on
     * what the platform returns for a zero byte allocation. */
    hdr->block_content_ids = calloc(hdr->num_content_ids
                                    ? hdr->num_content_ids : 1,
                                    sizeof(int32_t));
    if (!hdr->block_content_ids) {
        free(hdr);
        return NULL;
    }

    for (i = 0; i < hdr->num_content_ids; i++) {
        /* Every id takes at least one byte, so stop as soon as the block
         * is exhausted rather than reading whatever follows it. */
        if (cp >= cp_end) {
            free(hdr->block_content_ids);
            free(hdr);
            return NULL;
        }
        cp += itf8_get(cp, &hdr->block_content_ids[i]);
    }

    if (b->content_type == MAPPED_SLICE) {
        cp += itf8_get(cp, &hdr->ref_base_id);
    }

    if (CRAM_MAJOR_VERS(fd->version) != 1) {
        /* The MD5 needs 16 bytes; refuse a truncated header. */
        if (cp_end - cp < 16) {
            free(hdr->block_content_ids);
            free(hdr);
            return NULL;
        }
        memcpy(hdr->md5, cp, 16);
    } else {
        memset(hdr->md5, 0, 16);
    }

    return hdr;
}
#if 0
/*
 * Returns the number of bits needed to represent v, i.e. the 1-based
 * position of its highest set bit.  Both 0 and 1 give 1.
 */
static int nbits(int v) {
    static const int debruijn_bit_pos[32] = {
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
    };
    int shift;

    /* Smear the top set bit into every lower position. */
    for (shift = 1; shift <= 16; shift <<= 1)
        v |= v >> shift;

    /* DeBruijn multiply: the top five bits of the product index the table. */
    return debruijn_bit_pos[(uint32_t)(v * 0x07C4ACDDU) >> 27];
}
#endif
#if 0
/*
 * qsort() style comparison of two ints, ascending.
 *
 * Returns a negative value, zero or a positive value when the int at vp1
 * is less than, equal to or greater than the int at vp2.  The result is
 * built from comparisons rather than a subtraction, which would overflow
 * for operands that are far apart (e.g. INT_MAX and -1).
 */
static int sort_freqs(const void *vp1, const void *vp2) {
    const int i1 = *(const int *)vp1;
    const int i2 = *(const int *)vp2;
    return (i1 > i2) - (i1 < i2);
}
#endif
/* ----------------------------------------------------------------------
* Primary CRAM sequence decoder
*/
/*
 * Doubles the capacity of the cigar array being built (1024 entries the
 * first time).  On success the new pointer and capacity are stored both in
 * the caller's locals and in the slice, so that s->cigar never refers to
 * memory that realloc() has released, even if the caller bails out later.
 *
 * Returns 0 on success
 *        -1 on allocation failure (the existing array is left untouched)
 */
static int cram_grow_cigar(cram_slice *s, uint32_t **cigar,
                           uint32_t *cigar_alloc) {
    uint32_t new_alloc = *cigar_alloc ? *cigar_alloc * 2 : 1024;
    uint32_t *tmp = realloc(*cigar, new_alloc * sizeof(**cigar));

    if (!tmp)
        return -1;

    *cigar = tmp;
    *cigar_alloc = new_alloc;
    s->cigar = tmp;
    s->cigar_alloc = new_alloc;
    return 0;
}

/*
 * Internal part of cram_decode_slice().
 * Generates the sequence, quality and cigar components of one record by
 * replaying its read features.
 *
 * fd    CRAM version, the decode_md option and the L1 base lookup table.
 * c     container; c->comp_hdr holds the codecs and the data_series bit
 *       field that says which series are to be decoded.
 * s     slice being decoded; its cigar array, reference (s->ref) and aux
 *       block are read and updated.
 * blk   block passed to every codec decode() call.
 * cr    record being filled in; cr->apos and cr->len are read, and
 *       cr->cigar, cr->ncigar, cr->aend and cr->mqual are written (plus
 *       cr->aux_size when MD/NM tags are generated).
 * bfd   SAM header; bfd->ref[].len is used to bound reference access.
 * cf    CRAM flags of this record.
 * seq   output buffer for bases.
 * qual  output buffer for qualities.
 *
 * Returns 0 on success
 *         non-zero on failure (-1, or the OR of the codec return values)
 */
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
                           cram_block *blk, cram_record *cr, SAM_hdr *bfd,
                           int cf, char *seq, char *qual) {
    int prev_pos = 0, f, r = 0, out_sz = 1;
    int seq_pos = 1;
    int cig_len = 0, ref_pos = cr->apos;
    int32_t fn = 0, i32 = 0;
    enum cigar_op cig_op = BAM_CMATCH;
    uint32_t *cigar = s->cigar;
    uint32_t ncigar = s->ncigar;
    uint32_t cigar_alloc = s->cigar_alloc;
    uint32_t nm = 0, md_dist = 0;
    int orig_aux = 0;
    int decode_md = fd->decode_md && s->ref;
    uint32_t ds = c->comp_hdr->data_series;

    /* Qualities wanted but not stored verbatim: start from a default and
     * let the Q/B/q features overwrite individual positions. */
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
        memset(qual, 30, cr->len);
    }

    if (decode_md) {
        orig_aux = BLOCK_SIZE(s->aux_blk);
        BLOCK_APPEND(s->aux_blk, "MDZ", 3);
    }

    if (ds & CRAM_FN) {
        if (!c->comp_hdr->codecs[DS_FN]) return -1;
        r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
                                                blk, (char *)&fn, &out_sz);
    } else {
        fn = 0;
    }

    ref_pos--; // count from 0
    cr->cigar = ncigar;

    if (!(ds & (CRAM_FC | CRAM_FP)))
        goto skip_cigar;

    for (f = 0; f < fn; f++) {
        int32_t pos = 0;
        char op = 0;

        /* One feature appends at most two cigar entries. */
        if (ncigar+2 >= cigar_alloc) {
            if (cram_grow_cigar(s, &cigar, &cigar_alloc) < 0)
                return -1;
        }

        if (ds & CRAM_FC) {
            if (!c->comp_hdr->codecs[DS_FC]) return -1;
            r |= c->comp_hdr->codecs[DS_FC]->decode(s,
                                                    c->comp_hdr->codecs[DS_FC],
                                                    blk,
                                                    &op, &out_sz);
        }

        if (!(ds & CRAM_FP))
            continue;

        if (!c->comp_hdr->codecs[DS_FP]) return -1;
        r |= c->comp_hdr->codecs[DS_FP]->decode(s,
                                                c->comp_hdr->codecs[DS_FP],
                                                blk,
                                                (char *)&pos, &out_sz);
        pos += prev_pos;

        /* Positions are 1-based and index seq[pos-1] / qual[pos-1] below,
         * so anything smaller would write in front of the buffers. */
        if (pos <= 0) {
            fprintf(stderr, "Feature position %d before start of read\n",
                    (int)pos);
            return -1;
        }

        if (pos > seq_pos) {
            if (pos > cr->len+1)
                return -1;

            /* Bases between the previous feature and this one match the
             * reference; copy them over when a reference is available. */
            if (s->ref && cr->ref_id >= 0) {
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
                    static int whinged = 0;
                    if (!whinged)
                        fprintf(stderr, "Ref pos outside of ref "
                                "sequence boundary\n");
                    whinged = 1;
                } else {
                    memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
                           pos - seq_pos);
                }
            }
#ifdef USE_X
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            cig_op = BAM_CBASE_MATCH;
#else
            if (cig_len && cig_op != BAM_CMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            cig_op = BAM_CMATCH;
#endif
            cig_len += pos - seq_pos;
            ref_pos += pos - seq_pos;
            md_dist += pos - seq_pos;
            seq_pos = pos;
        }

        prev_pos = pos;

        /* Without feature codes the individual features cannot be
         * interpreted, so stop here. */
        if (!(ds & CRAM_FC))
            goto skip_cigar;

        switch(op) {
        case 'S': { // soft clip: IN
            int32_t out_sz2 = 1;

            if (cig_len) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_IN) {
                /* Version 1 takes the clipped bases from IN, later
                 * versions from SC; a missing codec yields one 'N'. */
                switch (CRAM_MAJOR_VERS(fd->version)) {
                case 1:
                    r |= c->comp_hdr->codecs[DS_IN]
                        ? c->comp_hdr->codecs[DS_IN]
                              ->decode(s, c->comp_hdr->codecs[DS_IN],
                                       blk, &seq[pos-1], &out_sz2)
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
                    break;
                case 2:
                default:
                    r |= c->comp_hdr->codecs[DS_SC]
                        ? c->comp_hdr->codecs[DS_SC]
                              ->decode(s, c->comp_hdr->codecs[DS_SC],
                                       blk, &seq[pos-1], &out_sz2)
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
                    break;
                }
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
                cig_op = BAM_CSOFT_CLIP;
                seq_pos += out_sz2;
            }
            break;
        }

        case 'X': { // Substitution; BS
            unsigned char base = 0;
#ifdef USE_X
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_BS) {
                if (!c->comp_hdr->codecs[DS_BS]) return -1;
                r |= c->comp_hdr->codecs[DS_BS]
                    ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
                             (char *)&base, &out_sz);
                seq[pos-1] = 'N'; // FIXME look up BS=base value
            }
            cig_op = BAM_CBASE_MISMATCH;
#else
            int ref_base;
            if (cig_len && cig_op != BAM_CMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_BS) {
                if (!c->comp_hdr->codecs[DS_BS]) return -1;
                r |= c->comp_hdr->codecs[DS_BS]
                    ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
                             (char *)&base, &out_sz);
                /* Test for a usable reference before indexing bfd->ref[]
                 * so a negative ref_id is never used as an index. */
                if (!s->ref || cr->ref_id < 0 ||
                    ref_pos >= bfd->ref[cr->ref_id].len) {
                    seq[pos-1] = 'N';
                } else {
                    ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
                    seq[pos-1] = c->comp_hdr->
                        substitution_matrix[ref_base][base];
                    if (decode_md) {
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
                        BLOCK_APPEND_CHAR(s->aux_blk,
                                          s->ref[ref_pos-s->ref_start +1]);
                        md_dist = 0;
                    }
                }
            }
            cig_op = BAM_CMATCH;
#endif
            nm++;
            cig_len++;
            seq_pos++;
            ref_pos++;
            break;
        }

        case 'D': { // Deletion; DL
            if (cig_len && cig_op != BAM_CDEL) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_DL) {
                if (!c->comp_hdr->codecs[DS_DL]) return -1;
                r |= c->comp_hdr->codecs[DS_DL]
                    ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
                             (char *)&i32, &out_sz);
                if (decode_md) {
                    BLOCK_APPEND_UINT(s->aux_blk, md_dist);
                    BLOCK_APPEND_CHAR(s->aux_blk, '^');
                    BLOCK_APPEND(s->aux_blk,
                                 &s->ref[ref_pos - s->ref_start +1],
                                 i32);
                    md_dist = 0;
                }
                cig_op = BAM_CDEL;
                cig_len += i32;
                ref_pos += i32;
                nm += i32;
            }
            break;
        }

        case 'I': { // Insertion (several bases); IN
            int32_t out_sz2 = 1;

            if (cig_len && cig_op != BAM_CINS) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_IN) {
                if (!c->comp_hdr->codecs[DS_IN]) return -1;
                r |= c->comp_hdr->codecs[DS_IN]
                    ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
                             &seq[pos-1], &out_sz2);
                cig_op = BAM_CINS;
                cig_len += out_sz2;
                seq_pos += out_sz2;
                nm += out_sz2;
            }
            break;
        }

        case 'i': { // Insertion (single base); BA
            if (cig_len && cig_op != BAM_CINS) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_BA) {
                if (!c->comp_hdr->codecs[DS_BA]) return -1;
                r |= c->comp_hdr->codecs[DS_BA]
                    ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
                             (char *)&seq[pos-1], &out_sz);
            }
            cig_op = BAM_CINS;
            cig_len++;
            seq_pos++;
            nm++;
            break;
        }

        case 'b': { // Several bases
            int32_t len = 1;

            if (cig_len && cig_op != BAM_CMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_BB) {
                if (!c->comp_hdr->codecs[DS_BB]) return -1;
                r |= c->comp_hdr->codecs[DS_BB]
                    ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
                             (char *)&seq[pos-1], &len);
            }
            cig_op = BAM_CMATCH;
            cig_len+=len;
            seq_pos+=len;
            ref_pos+=len;
            break;
        }

        case 'q': { // Several quality values
            int32_t len = 1;

            if (cig_len && cig_op != BAM_CMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_QQ) {
                if (!c->comp_hdr->codecs[DS_QQ]) return -1;
                r |= c->comp_hdr->codecs[DS_QQ]
                    ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
                             (char *)&qual[pos-1], &len);
            }
            cig_op = BAM_CMATCH;
            cig_len+=len;
            seq_pos+=len;
            ref_pos+=len;
            break;
        }

        case 'B': { // Read base; BA, QS
#ifdef USE_X
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
#else
            if (cig_len && cig_op != BAM_CMATCH) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
#endif
            if (ds & CRAM_BA) {
                if (!c->comp_hdr->codecs[DS_BA]) return -1;
                r |= c->comp_hdr->codecs[DS_BA]
                    ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
                             (char *)&seq[pos-1], &out_sz);
            }
            if (ds & CRAM_QS) {
                if (!c->comp_hdr->codecs[DS_QS]) return -1;
                r |= c->comp_hdr->codecs[DS_QS]
                    ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
                             (char *)&qual[pos-1], &out_sz);
            }
#ifdef USE_X
            cig_op = BAM_CBASE_MISMATCH;
#else
            cig_op = BAM_CMATCH;
#endif
            cig_len++;
            seq_pos++;
            ref_pos++;
            break;
        }

        case 'Q': { // Quality score; QS
            if (ds & CRAM_QS) {
                if (!c->comp_hdr->codecs[DS_QS]) return -1;
                r |= c->comp_hdr->codecs[DS_QS]
                    ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
                             (char *)&qual[pos-1], &out_sz);
            }
            break;
        }

        case 'H': { // hard clip; HC
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_HC) {
                if (!c->comp_hdr->codecs[DS_HC]) return -1;
                r |= c->comp_hdr->codecs[DS_HC]
                    ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
                             (char *)&i32, &out_sz);
                cig_op = BAM_CHARD_CLIP;
                cig_len += i32;
                nm += i32;
            }
            break;
        }

        case 'P': { // padding; PD
            if (cig_len && cig_op != BAM_CPAD) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_PD) {
                if (!c->comp_hdr->codecs[DS_PD]) return -1;
                r |= c->comp_hdr->codecs[DS_PD]
                    ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
                             (char *)&i32, &out_sz);
                cig_op = BAM_CPAD;
                cig_len += i32;
                nm += i32;
            }
            break;
        }

        case 'N': { // Ref skip; RS
            if (cig_len && cig_op != BAM_CREF_SKIP) {
                cigar[ncigar++] = (cig_len<<4) + cig_op;
                cig_len = 0;
            }
            if (ds & CRAM_RS) {
                if (!c->comp_hdr->codecs[DS_RS]) return -1;
                r |= c->comp_hdr->codecs[DS_RS]
                    ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
                             (char *)&i32, &out_sz);
                cig_op = BAM_CREF_SKIP;
                cig_len += i32;
                ref_pos += i32;
                nm += i32;
            }
            break;
        }

        default:
            /* The code comes from the file: report it as a decode error
             * instead of terminating the whole process. */
            fprintf(stderr, "Unknown read feature code 0x%02x\n",
                    (unsigned char)op);
            return -1;
        }
    }

    if (!(ds & CRAM_FC))
        goto skip_cigar;

    /* An implicit match op for any unaccounted for bases */
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
        if (s->ref && cr->ref_id >= 0) {
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
                static int whinged = 0;
                if (!whinged)
                    fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
                whinged = 1;
            } else {
                memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
                       cr->len - seq_pos + 1);
                ref_pos += cr->len - seq_pos + 1;
                md_dist += cr->len - seq_pos + 1;
            }
        }

        if (ncigar+1 >= cigar_alloc) {
            if (cram_grow_cigar(s, &cigar, &cigar_alloc) < 0)
                return -1;
        }
#ifdef USE_X
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
            cigar[ncigar++] = (cig_len<<4) + cig_op;
            cig_len = 0;
        }
        cig_op = BAM_CBASE_MATCH;
#else
        if (cig_len && cig_op != BAM_CMATCH) {
            cigar[ncigar++] = (cig_len<<4) + cig_op;
            cig_len = 0;
        }
        cig_op = BAM_CMATCH;
#endif
        cig_len += cr->len - seq_pos+1;
    }

 skip_cigar:

    if ((ds & CRAM_FN) && decode_md) {
        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
    }

    /* Flush the cigar operation still being accumulated. */
    if (cig_len) {
        if (ncigar >= cigar_alloc) {
            if (cram_grow_cigar(s, &cigar, &cigar_alloc) < 0)
                return -1;
        }
        cigar[ncigar++] = (cig_len<<4) + cig_op;
    }

    cr->ncigar = ncigar - cr->cigar;
    cr->aend = ref_pos;

    if (ds & CRAM_MQ) {
        if (!c->comp_hdr->codecs[DS_MQ]) return -1;
        r |= c->comp_hdr->codecs[DS_MQ]
            ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
                     (char *)&cr->mqual, &out_sz);
    } else {
        cr->mqual = 40;
    }

    /* Qualities stored verbatim: decode the whole array in one call. */
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
        int32_t out_sz2 = cr->len;

        if (!c->comp_hdr->codecs[DS_QS]) return -1;
        r |= c->comp_hdr->codecs[DS_QS]
            ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
                     qual, &out_sz2);
    }

    s->cigar = cigar;
    s->cigar_alloc = cigar_alloc;
    s->ncigar = ncigar;

    if (decode_md) {
        char buf[7];

        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
        cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;

        /* NM tag: 'I' type code followed by the value, low byte first. */
        buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
        buf[3] = (nm>> 0) & 0xff;
        buf[4] = (nm>> 8) & 0xff;
        buf[5] = (nm>>16) & 0xff;
        buf[6] = (nm>>24) & 0xff;
        BLOCK_APPEND(s->aux_blk, buf, 7);
        cr->aux_size += 7;
    }

    return r;
}
/*
 * Looks up an entry in a cram_map hash table.
 *
 * The bucket is selected from the first two bytes of 'key' with CRAM_MAP();
 * the chain hanging off that bucket is then walked until an entry whose
 * ->key equals 'id' turns up.
 *
 * Returns the matching entry, or NULL when the chain holds none.
 */
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
    cram_map *entry;

    for (entry = map[CRAM_MAP(key[0],key[1])]; entry; entry = entry->next) {
        if (entry->key == id)
            return entry;
    }

    return NULL;
}
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
/*
 * Decodes the auxiliary tags of one record, reading the tag count from the
 * TC data series and each tag id from the TN data series (the "_1_0"
 * variant; cram_decode_aux() is the TL based one).
 *
 * For every tag the three id bytes are appended to s->aux_blk and the
 * tag's own codec, found through the tag encoding map, is asked to append
 * the value.  cr->ntags, cr->aux (offset into s->aux_blk) and cr->aux_size
 * are filled in.
 *
 * Returns 0 on success
 *        -1 on failure (missing codec, decode error or unknown tag)
 */
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
                               cram_block *blk, cram_record *cr) {
    int i, r = 0, out_sz = 1;
    unsigned char ntags = 0;

    cr->ntags = 0;
    cr->aux_size = 0;
    cr->aux = BLOCK_SIZE(s->aux_blk);

    if (!c->comp_hdr->codecs[DS_TC]) return -1;
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
                                            (char *)&ntags, &out_sz);
    /* Do not trust ntags unless the decoder actually produced it. */
    if (r)
        return -1;
    cr->ntags = ntags;

    for (i = 0; i < cr->ntags; i++) {
        int32_t id = 0, tag_sz = 1;
        unsigned char tag_data[3];
        cram_map *m;

        if (!c->comp_hdr->codecs[DS_TN]) return -1;
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
                                                blk, (char *)&id, &tag_sz);
        if (r)
            return -1;

        if (tag_sz == 3) {
            /* The decoder wrote three raw bytes into id. */
            tag_data[0] = ((char *)&id)[0];
            tag_data[1] = ((char *)&id)[1];
            tag_data[2] = ((char *)&id)[2];
        } else {
            /* The decoder produced an integer; unpack it high byte first. */
            tag_data[0] = (id>>16) & 0xff;
            tag_data[1] = (id>>8)  & 0xff;
            tag_data[2] = id       & 0xff;
        }

        /* NOTE(review): in the three-byte case id is still the raw byte
         * image, which only equals a (b0<<16)|(b1<<8)|b2 style key on a
         * big-endian host -- confirm which form m->key is compared to. */
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
        if (!m)
            return -1;

        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);

        if (!m->codec) return -1;
        /* The tag codec receives the aux block itself as its output. */
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &tag_sz);
        if (r)
            return -1;

        cr->aux_size += tag_sz + 3;
    }

    return r;
}
/*
 * Decodes the auxiliary tags of one record using the TL data series.
 *
 * The TL value selects one entry of c->comp_hdr->TL[], a string made of
 * three-byte tag ids.  When CRAM_aux is part of the requested data series
 * each id is appended to s->aux_blk and the tag's own codec, found through
 * the tag encoding map, appends the value.  cr->ntags, cr->aux and
 * cr->aux_size are filled in; when neither CRAM_TL nor CRAM_aux is
 * requested only cr->aux and cr->aux_size are zeroed.
 *
 * Returns 0 on success
 *         non-zero on failure
 */
static int cram_decode_aux(cram_container *c, cram_slice *s,
                           cram_block *blk, cram_record *cr) {
    const uint32_t ds = c->comp_hdr->data_series;
    unsigned char *tag_list;
    int32_t list_idx = 0;
    int tag_no, status = 0, n_out = 1;

    /* Neither the tag list nor the tag values were asked for. */
    if ((ds & (CRAM_TL|CRAM_aux)) == 0) {
        cr->aux_size = 0;
        cr->aux = 0;
        return 0;
    }

    if (c->comp_hdr->codecs[DS_TL] == NULL)
        return -1;

    status = c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL],
                                                blk, (char *)&list_idx,
                                                &n_out);
    if (status != 0 || list_idx < 0 || list_idx >= c->comp_hdr->nTL)
        return -1;

    tag_list = c->comp_hdr->TL[list_idx];

    /* Three bytes per tag id. */
    cr->ntags = strlen((char *)tag_list)/3;
    cr->aux_size = 0;
    cr->aux = BLOCK_SIZE(s->aux_blk);

    /* Only the tag list was wanted, not the values. */
    if ((ds & CRAM_aux) == 0)
        return 0;

    for (tag_no = 0; tag_no < cr->ntags; tag_no++, tag_list += 3) {
        unsigned char tag[3];
        int32_t key, val_sz = 1;
        cram_map *entry;

        tag[0] = tag_list[0];
        tag[1] = tag_list[1];
        tag[2] = tag_list[2];
        key = (tag[0]<<16) | (tag[1]<<8) | tag[2];

        entry = map_find(c->comp_hdr->tag_encoding_map, tag, key);
        if (entry == NULL)
            return -1;

        BLOCK_APPEND(s->aux_blk, (char *)tag, 3);

        if (entry->codec == NULL)
            return -1;

        /* The tag codec receives the aux block itself as its output. */
        status |= entry->codec->decode(s, entry->codec, blk,
                                       (char *)s->aux_blk, &val_sz);

        cr->aux_size += val_sz + 3;
    }

    return status;
}
/*
 * Fills in tlen for every record on the mate_line chain starting at 'rec'.
 *
 * It's not perfect as we have one slice per reference so we cannot detect
 * when TLEN should be zero due to seqs that map to multiple references.
 * We also cannot set tlen correctly when a template spans slices, which
 * may make tlen too small.
 */
static void cram_xref_set_tlen(cram_slice *s, int rec) {
    cram_record *first = &s->crecs[rec];
    int leftmost = first->apos, rightmost = first->aend;
    int ref = first->ref_id;
    int n_leftmost = 0; /* number of segments starting at 'leftmost' */
    int span, cur;

    /* Pass 1: measure the extent of the chain, and note whether all of
     * its records share one reference (ref becomes -1 if they do not). */
    cur = rec;
    do {
        cram_record *seg = &s->crecs[cur];

        if (leftmost > seg->apos) {
            leftmost = seg->apos;
            n_leftmost = 1;
        } else if (leftmost == seg->apos) {
            n_leftmost++;
        }

        if (rightmost < seg->aend)
            rightmost = seg->aend;

        if (seg->mate_line == -1) {
            /* End of the chain: link it back to the first record. */
            seg->mate_line = rec;
            break;
        }

        assert(seg->mate_line > cur);
        cur = seg->mate_line;

        if (s->crecs[cur].ref_id != ref)
            ref = -1;
    } while (cur != rec);

    /* A chain touching several references gets tlen 0 throughout; a span
     * of 0 makes both +span and -span below come out as 0. */
    span = (ref != -1) ? rightmost - leftmost + 1 : 0;

    /*
     * Pass 2: assign +/- span.  The leftmost segment gets the positive
     * value; when several segments start at the same leftmost position the
     * READ1 flag is the tie breaker.
     */
    cur = rec;
    do {
        cram_record *seg = &s->crecs[cur];

        if (seg->apos == leftmost &&
            (n_leftmost == 1 || (seg->flags & BAM_FREAD1)))
            seg->tlen = span;
        else
            seg->tlen = -span;

        cur = seg->mate_line;
    } while (cur != rec);
}

/* Resolve mate pair cross-references between recs within this slice */
static void cram_decode_slice_xref(cram_slice *s, int required_fields) {
    int rec;

    /* None of the mate fields were requested: blank them and stop. */
    if ((required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN)) == 0) {
        for (rec = 0; rec < s->hdr->num_records; rec++) {
            s->crecs[rec].tlen = 0;
            s->crecs[rec].mate_pos = 0;
            s->crecs[rec].mate_ref_id = -1;
        }
        return;
    }

    for (rec = 0; rec < s->hdr->num_records; rec++) {
        cram_record *cr = &s->crecs[rec];

        if (cr->mate_line < 0) {
            /* No mate within this slice: rely on the stored mate flags. */
            if (cr->mate_flags & CRAM_M_REVERSE) {
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
            }
            if (cr->mate_flags & CRAM_M_UNMAP) {
                cr->flags |= BAM_FMUNMAP;
            }
            if (!(cr->flags & BAM_FPAIRED))
                cr->mate_ref_id = -1;
        } else if (cr->mate_line >= s->hdr->num_records) {
            fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
                    cr->mate_line, s->hdr->num_records-1);
        } else {
            cram_record *mate;

            /* INT_MIN marks a tlen that has not been computed yet. */
            if (cr->tlen == INT_MIN)
                cram_xref_set_tlen(s, rec);

            mate = &s->crecs[cr->mate_line];
            cr->mate_pos = mate->apos;
            cr->mate_ref_id = mate->ref_id;

            // paired
            cr->flags |= BAM_FPAIRED;

            // set mate unmapped if needed
            if (mate->flags & BAM_FUNMAP) {
                cr->flags |= BAM_FMUNMAP;
                cr->tlen = 0;
            }
            if (cr->flags & BAM_FUNMAP) {
                cr->tlen = 0;
            }

            // set mate reversed if needed
            if (mate->flags & BAM_FREVERSE)
                cr->flags |= BAM_FMREVERSE;

            /* FIXME: construct read names here too if needed */
        }

        if (cr->tlen == INT_MIN)
            cr->tlen = 0; // Just in case
    }
}
/*
 * Formats a 16 byte MD5 digest as 32 lower case hexadecimal characters
 * followed by a terminating NUL.
 *
 * md5  the 16 digest bytes.
 * out  destination buffer; 33 bytes are written.
 *
 * Returns out.
 */
static char *md5_print(unsigned char *md5, char *out) {
    static const char hex_digits[] = "0123456789abcdef";
    char *dst = out;
    int i;

    for (i = 0; i < 16; i++) {
        *dst++ = hex_digits[md5[i] >> 4];
        *dst++ = hex_digits[md5[i] & 15];
    }
    *dst = 0;

    return out;
}
/*
* Decode an entire slice from container blocks. Fills out s->crecs[] array.
* Returns 0 on success
* -1 on failure
*/
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
SAM_hdr *bfd) {
cram_block *blk = s->block[0];
int32_t bf, ref_id;
unsigned char cf;
int out_sz, r = 0;
int rec;
char *seq = NULL, *qual = NULL;
int unknown_rg = -1;
int embed_ref;
char **refs = NULL;
uint32_t ds;
if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
return -1;
ds = c->comp_hdr->data_series;
blk->bit = 7; // MSB first
/* Look for unknown RG, added as last by Java CRAM? */
if (bfd->nrg > 0 &&
!strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
unknown_rg = bfd->nrg-1;
if (blk->content_type != CORE)
return -1;
if (s->crecs)
free(s->crecs);
if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
return -1;
ref_id = s->hdr->ref_seq_id;
embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
if (ref_id >= 0) {
if (embed_ref) {
cram_block *b;
if (s->hdr->ref_base_id < 0) {
fprintf(stderr, "No reference specified and "
"no embedded reference is available.\n");
return -1;
}
if (!s->block_by_id ||
!(b = s->block_by_id[s->hdr->ref_base_id]))
return -1;
cram_uncompress_block(b);
s->ref = (char *)BLOCK_DATA(b);
s->ref_start = s->hdr->ref_seq_start;
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
} else if (!fd->no_ref) {
//// Avoid Java cramtools bug by loading entire reference seq
//s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
//s->ref_start = 1;
if (fd->required_fields & SAM_SEQ)
s->ref =
cram_get_ref(fd, s->hdr->ref_seq_id,
s->hdr->ref_seq_start,
s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
s->ref_start = s->hdr->ref_seq_start;
s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
/* Sanity check */
if (s->ref_start < 0) {
fprintf(stderr, "Slice starts before base 1.\n");
s->ref_start = 0;
}
pthread_mutex_lock(&fd->ref_lock);
pthread_mutex_lock(&fd->refs->lock);
if ((fd->required_fields & SAM_SEQ) &&
s->ref_end > fd->refs->ref_id[ref_id]->length) {
fprintf(stderr, "Slice ends beyond reference end.\n");
s->ref_end = fd->refs->ref_id[ref_id]->length;
}
pthread_mutex_unlock(&fd->refs->lock);
pthread_mutex_unlock(&fd->ref_lock);
}
}
if ((fd->required_fields & SAM_SEQ) &&
s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
s->hdr->ref_seq_id, s->hdr->ref_seq_start,
s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
return -1;
}
if (CRAM_MAJOR_VERS(fd->version) != 1
&& (fd->required_fields & SAM_SEQ)
&& s->hdr->ref_seq_id >= 0
&& !fd->ignore_md5
&& memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
MD5_CTX md5;
unsigned char digest[16];
if (s->ref && s->hdr->ref_seq_id >= 0) {
int start, len;
if (s->hdr->ref_seq_start >= s->ref_start) {
start = s->hdr->ref_seq_start - s->ref_start;
} else {
fprintf(stderr, "Slice starts before base 1.\n");
start = 0;
}
if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
len = s->hdr->ref_seq_span;
} else {
fprintf(stderr, "Slice ends beyond reference end.\n");
len = s->ref_end - s->ref_start + 1;
}
MD5_Init(&md5);
if (start + len > s->ref_end - s->ref_start + 1)
len = s->ref_end - s->ref_start + 1 - start;
if (len >= 0)
MD5_Update(&md5, s->ref + start, len);
MD5_Final(digest, &md5);
} else if (!s->ref && s->hdr->ref_base_id >= 0) {
cram_block *b;
if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) {
MD5_Init(&md5);
MD5_Update(&md5, b->data, b->uncomp_size);
MD5_Final(digest, &md5);
}
}
if ((!s->ref && s->hdr->ref_base_id < 0)
|| memcmp(digest, s->hdr->md5, 16) != 0) {
char M[33];
fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
"%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
return -1;
}
}
if (ref_id == -2) {
pthread_mutex_lock(&fd->ref_lock);
pthread_mutex_lock(&fd->refs->lock);
refs = calloc(fd->refs->nref, sizeof(char *));
pthread_mutex_unlock(&fd->refs->lock);
pthread_mutex_unlock(&fd->ref_lock);
if (!refs)
return -1;
}
for (rec = 0; rec < s->hdr->num_records; rec++) {
cram_record *cr = &s->crecs[rec];
//fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
cr->s = s;
out_sz = 1; /* decode 1 item */
if (ds & CRAM_BF) {
if (!c->comp_hdr->codecs[DS_BF]) return -1;
r |= c->comp_hdr->codecs[DS_BF]
->decode(s, c->comp_hdr->codecs[DS_BF], blk,
(char *)&bf, &out_sz);
if (bf < 0 ||
bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
return -1;
bf = fd->bam_flag_swap[bf];
cr->flags = bf;
} else {
cr->flags = bf = 0x4; // unmapped
}
if (ds & CRAM_CF) {
if (CRAM_MAJOR_VERS(fd->version) == 1) {
/* CF is byte in 1.0, int32 in 2.0 */
if (!c->comp_hdr->codecs[DS_CF]) return -1;
r |= c->comp_hdr->codecs[DS_CF]
->decode(s, c->comp_hdr->codecs[DS_CF], blk,
(char *)&cf, &out_sz);
cr->cram_flags = cf;
} else {
if (!c->comp_hdr->codecs[DS_CF]) return -1;
r |= c->comp_hdr->codecs[DS_CF]
->decode(s, c->comp_hdr->codecs[DS_CF], blk,
(char *)&cr->cram_flags,
&out_sz);
cf = cr->cram_flags;
}
}
if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
if (ds & CRAM_RI) {
if (!c->comp_hdr->codecs[DS_RI]) return -1;
r |= c->comp_hdr->codecs[DS_RI]
->decode(s, c->comp_hdr->codecs[DS_RI], blk,
(char *)&cr->ref_id, &out_sz);
if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
&& cr->ref_id >= 0) {
if (!fd->no_ref) {
if (!refs[cr->ref_id])
refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
1, 0);
s->ref = refs[cr->ref_id];
}
s->ref_start = 1;
pthread_mutex_lock(&fd->ref_lock);
pthread_mutex_lock(&fd->refs->lock);
s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
pthread_mutex_unlock(&fd->refs->lock);
pthread_mutex_unlock(&fd->ref_lock);
}
} else {
cr->ref_id = 0;
}
} else {
cr->ref_id = ref_id; // Forced constant in CRAM 1.0
}
if (ds & CRAM_RL) {
if (!c->comp_hdr->codecs[DS_RL]) return -1;
r |= c->comp_hdr->codecs[DS_RL]
->decode(s, c->comp_hdr->codecs[DS_RL], blk,
(char *)&cr->len, &out_sz);
}
if (ds & CRAM_AP) {
if (!c->comp_hdr->codecs[DS_AP]) return -1;
r |= c->comp_hdr->codecs[DS_AP]
->decode(s, c->comp_hdr->codecs[DS_AP], blk,
(char *)&cr->apos, &out_sz);
if (c->comp_hdr->AP_delta)
cr->apos += s->last_apos;
s->last_apos= cr->apos;
} else {
cr->apos = c->ref_seq_start;
}
if (ds & CRAM_RG) {
if (!c->comp_hdr->codecs[DS_RG]) return -1;
r |= c->comp_hdr->codecs[DS_RG]
->decode(s, c->comp_hdr->codecs[DS_RG], blk,
(char *)&cr->rg, &out_sz);
if (cr->rg == unknown_rg)
cr->rg = -1;
} else {
cr->rg = -1;
}
cr->name_len = 0;
if (c->comp_hdr->read_names_included) {
int32_t out_sz2 = 1;
// Read directly into name cram_block
cr->name = BLOCK_SIZE(s->name_blk);
if (ds & CRAM_RN) {
if (!c->comp_hdr->codecs[DS_RN]) return -1;
r |= c->comp_hdr->codecs[DS_RN]
->decode(s, c->comp_hdr->codecs[DS_RN], blk,
(char *)s->name_blk, &out_sz2);
cr->name_len = out_sz2;
}
}
cr->mate_pos = 0;
cr->mate_line = -1;
cr->mate_ref_id = -1;
if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
if (ds & CRAM_MF) {
if (CRAM_MAJOR_VERS(fd->version) == 1) {
/* MF is byte in 1.0, int32 in 2.0 */
unsigned char mf;
if (!c->comp_hdr->codecs[DS_MF]) return -1;
r |= c->comp_hdr->codecs[DS_MF]
->decode(s, c->comp_hdr->codecs[DS_MF],
blk, (char *)&mf, &out_sz);
cr->mate_flags = mf;
} else {
if (!c->comp_hdr->codecs[DS_MF]) return -1;
r |= c->comp_hdr->codecs[DS_MF]
->decode(s, c->comp_hdr->codecs[DS_MF],
blk,
(char *)&cr->mate_flags,
&out_sz);
}
} else {
cr->mate_flags = 0;
}
if (!c->comp_hdr->read_names_included) {
int32_t out_sz2 = 1;
// Read directly into name cram_block
cr->name = BLOCK_SIZE(s->name_blk);
if (ds & CRAM_RN) {
if (!c->comp_hdr->codecs[DS_RN]) return -1;
r |= c->comp_hdr->codecs[DS_RN]
->decode(s, c->comp_hdr->codecs[DS_RN],
blk, (char *)s->name_blk,
&out_sz2);
cr->name_len = out_sz2;
}
}
if (ds & CRAM_NS) {
if (!c->comp_hdr->codecs[DS_NS]) return -1;
r |= c->comp_hdr->codecs[DS_NS]
->decode(s, c->comp_hdr->codecs[DS_NS], blk,
(char *)&cr->mate_ref_id, &out_sz);
}
// Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
// if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
// /* Paired, but unmapped */
// cr->flags |= BAM_FMUNMAP;
// }
if (ds & CRAM_NP) {
if (!c->comp_hdr->codecs[DS_NP]) return -1;
r |= c->comp_hdr->codecs[DS_NP]
->decode(s, c->comp_hdr->codecs[DS_NP], blk,
(char *)&cr->mate_pos, &out_sz);
}
if (ds & CRAM_TS) {
if (!c->comp_hdr->codecs[DS_TS]) return -1;
r |= c->comp_hdr->codecs[DS_TS]
->decode(s, c->comp_hdr->codecs[DS_TS], blk,
(char *)&cr->tlen, &out_sz);
} else {
cr->tlen = INT_MIN;
}
} else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
if (ds & CRAM_NF) {
if (!c->comp_hdr->codecs[DS_NF]) return -1;
r |= c->comp_hdr->codecs[DS_NF]
->decode(s, c->comp_hdr->codecs[DS_NF], blk,
(char *)&cr->mate_line, &out_sz);
cr->mate_line += rec + 1;
//cr->name_len = sprintf(name, "%d", name_id++);
//cr->name = DSTRING_LEN(name_ds);
//dstring_nappend(name_ds, name, cr->name_len);
cr->mate_ref_id = -1;
cr->tlen = INT_MIN;
cr->mate_pos = 0;
} else {
cr->mate_flags = 0;
cr->tlen = INT_MIN;
}
} else {
cr->mate_flags = 0;
cr->tlen = INT_MIN;
}
/*
else if (!name[0]) {
//name[0] = '?'; name[1] = 0;
//cr->name_len = 1;
//cr->name= DSTRING_LEN(s->name_ds);
//dstring_nappend(s->name_ds, "?", 1);
cr->mate_ref_id = -1;
cr->tlen = 0;
cr->mate_pos = 0;
}
*/
/* Auxiliary tags */
if (CRAM_MAJOR_VERS(fd->version) == 1)
r |= cram_decode_aux_1_0(c, s, blk, cr);
else
r |= cram_decode_aux(c, s, blk, cr);
/* Fake up dynamic string growth and appending */
if (ds & CRAM_RL) {
cr->seq = BLOCK_SIZE(s->seqs_blk);
BLOCK_GROW(s->seqs_blk, cr->len);
seq = (char *)BLOCK_END(s->seqs_blk);
BLOCK_SIZE(s->seqs_blk) += cr->len;
if (!seq)
return -1;
cr->qual = BLOCK_SIZE(s->qual_blk);
BLOCK_GROW(s->qual_blk, cr->len);
qual = (char *)BLOCK_END(s->qual_blk);
BLOCK_SIZE(s->qual_blk) += cr->len;
if (!s->ref)
memset(seq, '=', cr->len);
}
if (!(bf & BAM_FUNMAP)) {
/* Decode sequence and generate CIGAR */
if (ds & (CRAM_SEQ | CRAM_MQ)) {
r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
} else {
cr->cigar = 0;
cr->ncigar = 0;
cr->aend = cr->apos;
cr->mqual = 0;
}
} else {
int out_sz2 = cr->len;
//puts("Unmapped");
cr->cigar = 0;
cr->ncigar = 0;
cr->aend = cr->apos;
cr->mqual = 0;
if (ds & CRAM_BA) {
if (!c->comp_hdr->codecs[DS_BA]) return -1;
r |= c->comp_hdr->codecs[DS_BA]
->decode(s, c->comp_hdr->codecs[DS_BA], blk,
(char *)seq, &out_sz2);
}
if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
out_sz2 = cr->len;
if (ds & CRAM_QS) {
if (!c->comp_hdr->codecs[DS_QS]) return -1;
r |= c->comp_hdr->codecs[DS_QS]
->decode(s, c->comp_hdr->codecs[DS_QS],
blk, qual, &out_sz2);
}
} else {
if (ds & CRAM_RL)
memset(qual, 30, cr->len);
}
}
}
pthread_mutex_lock(&fd->ref_lock);
if (refs) {
int i;
for (i = 0; i < fd->refs->nref; i++) {
if (refs[i])
cram_ref_decr(fd->refs, i);
}
free(refs);
} else if (ref_id >= 0 && s->ref != fd->ref_free) {
cram_ref_decr(fd->refs, ref_id);
}
pthread_mutex_unlock(&fd->ref_lock);
/* Resolve mate pair cross-references between recs within this slice */
cram_decode_slice_xref(s, fd->required_fields);
return r;
}
/*
 * Work item describing one slice-decode request.  It is filled in by
 * cram_decode_slice_mt(), consumed by cram_decode_slice_thread() and the
 * same pointer is handed back as that thread function's return value.
 */
typedef struct {
/* The four arguments forwarded verbatim to cram_decode_slice() */
cram_fd *fd;
cram_container *c;
cram_slice *s;
SAM_hdr *h;
/* Return value of cram_decode_slice() for this job */
int exit_code;
} cram_decode_job;
/*
 * Thread-pool entry point: decodes the slice described by a
 * cram_decode_job.
 *
 * The status returned by cram_decode_slice() is recorded in the job's
 * exit_code field and the job pointer itself is returned.
 */
void *cram_decode_slice_thread(void *arg) {
    cram_decode_job *job = arg;
    int status;

    status = cram_decode_slice(job->fd, job->c, job->s, job->h);
    job->exit_code = status;

    return job;
}
/*
 * Multi-threaded front end to cram_decode_slice().
 *
 * Without a thread pool (fd->pool is NULL) the slice is decoded
 * synchronously and the result of cram_decode_slice() is returned.
 *
 * With a pool, a cram_decode_job is allocated and dispatched to the pool.
 * The dispatch is non-blocking whenever t_pool_results_queue_sz() reports
 * a non-zero value for fd->rqueue; if the dispatch returns -1 the job is
 * parked in fd->job_pending, otherwise fd->job_pending is cleared.
 *
 * Returns 0 once the job has been dispatched or parked (pool mode);
 *        -1 if the job could not be allocated.
 */
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
                         SAM_hdr *bfd) {
    cram_decode_job *job;
    int nonblock;
    int rc;

    /* No pool: plain in-line decode */
    if (fd->pool == NULL)
        return cram_decode_slice(fd, c, s, bfd);

    job = malloc(sizeof(*job));
    if (job == NULL)
        return -1;

    job->fd = fd;
    job->c  = c;
    job->s  = s;
    job->h  = bfd;

    nonblock = (t_pool_results_queue_sz(fd->rqueue) != 0);

    rc = t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
                          job, nonblock);
    if (rc == -1) {
        /* Would block: remember the job so it can be retried later */
        fd->job_pending = job;
    } else {
        fd->job_pending = NULL;
    }

    // flush too
    return 0;
}
/* ----------------------------------------------------------------------
* CRAM sequence iterators.
*/
/*
 * Converts a cram in-memory record into a bam in-memory record. We
 * pass a pointer to a bam_seq_t pointer, which bam_construct_seq() is
 * given so that it can build the record in it.
 *
 * If the record has no stored name and SAM_QNAME is required, a name of
 * the form "<fd->prefix>:<number>" is generated, where the number is
 * derived from the slice's record counter and either this record's index
 * or that of its earlier mate (so both ends share a name).  The prefix is
 * truncated if necessary so that the generated name always fits in the
 * local name buffer.
 *
 * Side effect: when neither SAM_SEQ nor SAM_QUAL is required, cr->len is
 * overwritten with 1 and the sequence is emitted as "*".
 *
 * Returns the used size of the bam record on success
 *         -1 on failure (read group out of range, missing sequence or
 *            quality block, or bam_construct_seq() failure).
 */
static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
                       cram_record *cr, int rec, bam_seq_t **bam) {
    int bam_idx, rg_len;
    char name_a[1024], *name;
    int name_len;
    char *aux, *aux_orig;
    char *seq, *qual;

    /* Assign names if not explicitly set */
    if (fd->required_fields & SAM_QNAME) {
        if (cr->name_len) {
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
            name_len = cr->name_len;
        } else {
            /*
             * Room that must remain after the prefix: ':' plus up to 20
             * decimal digits for a 64-bit counter, plus one spare byte in
             * case append_uint64() also writes a terminator.
             */
            const size_t suffix_room = 1 + 20 + 1;
            size_t prefix_len = strlen(fd->prefix);

            /* Never let an over-long prefix run off the end of name_a[] */
            if (prefix_len > sizeof(name_a) - suffix_room)
                prefix_len = sizeof(name_a) - suffix_room;

            name = name_a;
            memcpy(name, fd->prefix, prefix_len);
            name += prefix_len;
            *name++ = ':';

            /* Use the earlier mate's number so both reads share a name */
            if (cr->mate_line >= 0 && cr->mate_line < rec)
                name = (char *)append_uint64((unsigned char *)name,
                                             s->hdr->record_counter +
                                             cr->mate_line + 1);
            else
                name = (char *)append_uint64((unsigned char *)name,
                                             s->hdr->record_counter +
                                             rec + 1);

            name_len = name - name_a;
            name = name_a;
        }
    } else {
        name = "?";
        name_len = 1;
    }

    /* Generate BAM record */
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
        return -1;

    /* "RGZ" + read-group name + NUL */
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;

    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
        if (!BLOCK_DATA(s->seqs_blk))
            return -1;
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
    } else {
        seq = "*";
        cr->len = 1;
    }

    if (fd->required_fields & SAM_QUAL) {
        if (!BLOCK_DATA(s->qual_blk))
            return -1;
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
    } else {
        qual = NULL;
    }

    bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
                                name, name_len,
                                cr->flags,
                                cr->ref_id,
                                cr->apos,
                                cr->aend,
                                cr->mqual,
                                cr->ncigar, &s->cigar[cr->cigar],
                                cr->mate_ref_id,
                                cr->mate_pos,
                                cr->tlen,
                                cr->len,
                                seq,
                                qual);
    if (bam_idx == -1)
        return -1;

    aux = aux_orig = (char *)bam_aux(*bam);

    /* Auxiliary strings */
    if (cr->aux_size != 0) {
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
        aux += cr->aux_size;
    }

    /* RG:Z: */
    if (cr->rg != -1) {
        int len = bfd->rg[cr->rg].name_len;
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
        memcpy(aux, bfd->rg[cr->rg].name, len);
        aux += len;
        *aux++ = 0;
    }

    return bam_idx + (aux - aux_orig);
}
/*
 * Here be dragons!  The multi-threading code in this is crufty beyond belief.
 *
 * Obtains the next slice to be consumed and stores its container in *cp.
 *
 * - If no container is loaded yet (fd->ctr is NULL) the first one is read,
 *   skipping forward over containers that lie before the requested range,
 *   and its compression header is decoded.
 * - Any slice still attached to the current container is freed, as is the
 *   container itself once all of its slices have been read.
 * - The main loop then either resumes a job parked in fd->job_pending or
 *   reads the next container/slice from the file, applies the range
 *   filter, and hands the slice to cram_decode_slice_mt().
 * - With a thread pool, the loop keeps reading ahead until a dispatch
 *   would block or the result queue is long enough, and the slice finally
 *   returned is the next completed job taken from fd->rqueue.
 *
 * Returns the slice on success (with *cp set to its container);
 *         NULL on end of data, out-of-range data or failure.
 */
static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
    cram_container *c;
    cram_slice *s = NULL;

    if (!(c = fd->ctr)) {
        // Load first container.
        do {
            if (!(c = fd->ctr = cram_read_container(fd)))
                return NULL;
        } while (c->length == 0);

        /*
         * The first container may be a result of a sub-range query.
         * In which case it may still not be the optimal starting point
         * due to skipped containers/slices in the index.
         */
        if (fd->range.refid != -2) {
            while (c->ref_seq_id != -2 &&
                   (c->ref_seq_id < fd->range.refid ||
                    c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
                if (0 != cram_seek(fd, c->length, SEEK_CUR))
                    return NULL;
                cram_free_container(fd->ctr);
                do {
                    if (!(c = fd->ctr = cram_read_container(fd)))
                        return NULL;
                } while (c->length == 0);
            }

            if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
                return NULL;
        }

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            return NULL;
        if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
            return NULL;

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            return NULL;
        if (!c->comp_hdr->AP_delta) {
            pthread_mutex_lock(&fd->ref_lock);
            fd->unsorted = 1;
            pthread_mutex_unlock(&fd->ref_lock);
        }
    }

    /* Release the slice handed out by the previous call, if any */
    if ((s = c->slice)) {
        c->slice = NULL;
        cram_free_slice(s);
        s = NULL;
    }

    /* All slices of this container have been read: drop it */
    if (c->curr_slice == c->max_slice) {
        cram_free_container(c);
        c = NULL;
    }

    /* Sorry this is so contorted! */
    for (;;) {
        if (fd->job_pending) {
            /* Retry the job whose dispatch previously would have blocked */
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
            c = j->c;
            s = j->s;
            free(fd->job_pending);
            fd->job_pending = NULL;
        } else if (!fd->ooc) {
        empty_container:
            if (!c || c->curr_slice == c->max_slice) {
                // new container
                do {
                    if (!(c = fd->ctr = cram_read_container(fd))) {
                        if (fd->pool) {
                            /* Still need to drain queued results below */
                            fd->ooc = 1;
                            break;
                        }

                        return NULL;
                    }
                } while (c->length == 0);
                if (fd->ooc)
                    break;

                /* Skip containers not yet spanning our range */
                if (fd->range.refid != -2 && c->ref_seq_id != -2) {
                    fd->required_fields |= SAM_POS;

                    if (c->ref_seq_id != fd->range.refid) {
                        cram_free_container(c);
                        c = NULL;  /* do not leave a dangling pointer */
                        fd->ctr = NULL;
                        fd->ooc = 1;
                        fd->eof = 1;
                        break;
                    }

                    if (c->ref_seq_start > fd->range.end) {
                        cram_free_container(c);
                        c = NULL;  /* do not leave a dangling pointer */
                        fd->ctr = NULL;
                        fd->ooc = 1;
                        fd->eof = 1;
                        break;
                    }

                    if (c->ref_seq_start + c->ref_seq_span-1 <
                        fd->range.start) {
                        c->curr_rec   = c->max_rec;
                        c->curr_slice = c->max_slice;
                        /*
                         * A failed seek would leave us parsing container
                         * headers from the wrong offset; give up instead,
                         * as the first-container loop above does.
                         */
                        if (0 != cram_seek(fd, c->length, SEEK_CUR))
                            return NULL;
                        cram_free_container(c);
                        c = NULL;
                        continue;
                    }
                }

                if (!(c->comp_hdr_block = cram_read_block(fd)))
                    return NULL;
                if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
                    return NULL;

                c->comp_hdr =
                    cram_decode_compression_header(fd, c->comp_hdr_block);
                if (!c->comp_hdr)
                    return NULL;

                if (!c->comp_hdr->AP_delta) {
                    pthread_mutex_lock(&fd->ref_lock);
                    fd->unsorted = 1;
                    pthread_mutex_unlock(&fd->ref_lock);
                }
            }

            if (c->num_records == 0) {
                cram_free_container(c); c = NULL;
                goto empty_container;
            }

            if (!(s = c->slice = cram_read_slice(fd)))
                return NULL;
            c->curr_slice++;
            c->curr_rec = 0;
            c->max_rec = s->hdr->num_records;

            s->last_apos = s->hdr->ref_seq_start;

            /* Skip slices not yet spanning our range */
            if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
                if (s->hdr->ref_seq_id != fd->range.refid) {
                    fd->eof = 1;
                    cram_free_slice(s);
                    c->slice = NULL;
                    return NULL;
                }

                if (s->hdr->ref_seq_start > fd->range.end) {
                    fd->eof = 1;
                    cram_free_slice(s);
                    c->slice = NULL;
                    return NULL;
                }

                if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
                    fd->range.start) {
                    cram_free_slice(s);
                    c->slice = NULL;
                    cram_free_container(c);
                    c = NULL;
                    /*
                     * Forget the freed slice too: if the next iteration
                     * breaks out of the loop (e.g. the following container
                     * is out of range) we must return NULL rather than
                     * this stale pointer.
                     */
                    s = NULL;
                    continue;
                }
            }
        }

        /* Test decoding of 1st seq */
        if (!c || !s)
            break;

        if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
            fprintf(stderr, "Failure to decode slice\n");
            cram_free_slice(s);
            c->slice = NULL;
            return NULL;
        }

        /* Single-threaded, or the pool is saturated: stop reading ahead */
        if (!fd->pool || fd->job_pending)
            break;

        // Push it a bit far, to qsize in queue rather than pending arrival,
        // as cram tends to be a bit bursty in decode timings.
        if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
            break;
    }

    if (fd->pool) {
        t_pool_result *res;
        cram_decode_job *j;

        /* Nothing left to read and nothing left in flight */
        if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
            return NULL;

        res = t_pool_next_result_wait(fd->rqueue);

        if (!res || !res->data) {
            fprintf(stderr, "t_pool_next_result failure\n");
            return NULL;
        }

        /* Hand back the container/slice of the completed job */
        j = (cram_decode_job *)res->data;
        c = j->c;
        s = j->s;

        fd->ctr = c;

        t_pool_delete_result(res, 1);
    }

    *cp = c;
    return s;
}
/*
 * Read the next cram record and return it.
 * Note that to decode cram_record the caller will need to look up some data
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
 * the next call to cram_get_seq (which may invalidate it).
 *
 * When a range is set (fd->range.refid != -2), records before the range
 * are skipped and the first record beyond it ends iteration with
 * fd->eof set.
 *
 * Returns record pointer on success (do not free)
 *         NULL on failure or when no further records are available
 */
cram_record *cram_get_seq(cram_fd *fd) {
    cram_container *c;
    cram_slice *s;

    for (;;) {
        c = fd->ctr;
        if (c && c->slice && c->curr_rec < c->max_rec) {
            s = c->slice;
        } else {
            if (!(s = cram_next_slice(fd, &c)))
                return NULL;

            /*
             * Make the container describe the slice we were actually
             * given.  With read-ahead the container's slice pointer and
             * record cursor may refer to a later slice, and the loop
             * below re-reads them from fd->ctr after every "continue".
             */
            fd->ctr = c;
            c->slice = s;
            c->curr_rec = 0;
            c->max_rec = s->hdr->num_records;

            /* A slice with no records has nothing to index; fetch another */
            if (c->curr_rec >= c->max_rec)
                continue;
        }

        if (fd->range.refid != -2) {
            /* Not yet on the requested reference */
            if (s->crecs[c->curr_rec].ref_id < fd->range.refid) {
                c->curr_rec++;
                continue;
            }

            /* Gone past the requested reference */
            if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
                fd->eof = 1;
                cram_free_slice(s);
                c->slice = NULL;
                return NULL;
            }

            /* Starts after the end of the range */
            if (s->crecs[c->curr_rec].apos > fd->range.end) {
                fd->eof = 1;
                cram_free_slice(s);
                c->slice = NULL;
                return NULL;
            }

            /* Ends before the start of the range */
            if (s->crecs[c->curr_rec].aend < fd->range.start) {
                c->curr_rec++;
                continue;
            }
        }

        break;
    }

    fd->ctr = c;
    c->slice = s;
    return &s->crecs[c->curr_rec++];
}
/*
 * Fetch the next cram record via cram_get_seq() and convert it into a
 * bam_seq_t using cram_to_bam().
 *
 * Returns the result of cram_to_bam() on success (the used size of the
 *         bam record);
 *         -1 on EOF or failure (check fd->err)
 */
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
    cram_record *rec;
    cram_container *ctr;
    int rec_idx;

    rec = cram_get_seq(fd);
    if (rec == NULL)
        return -1;

    /* cram_get_seq() has already advanced curr_rec past this record */
    ctr = fd->ctr;
    rec_idx = ctr->curr_rec - 1;

    return cram_to_bam(fd->header, fd, ctr->slice, rec, rec_idx, bam);
}
htslib-1.2.1/cram/cram_decode.h 0000664 0000000 0000000 00000006673 12464172677 0016354 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* Include cram.h instead.
*
* This is an internal part of the CRAM system and is automatically included
* when you #include cram.h.
*
* Implements the decoding portion of CRAM I/O. Also see
* cram_codecs.[ch] for the actual encoding functions themselves.
*/
/* Include guard for the CRAM decoding declarations */
#ifndef _CRAM_READ_H_
#define _CRAM_READ_H_
/* Give the declarations C linkage when included from C++ */
#ifdef __cplusplus
extern "C" {
#endif
/* ----------------------------------------------------------------------
* CRAM sequence iterators.
*/
/*! Read the next cram record and return it as a cram_record.
*
* Note that to decode cram_record the caller will need to look up some data
* in the current slice, pointed to by fd->ctr->slice. This is valid until
* the next call to cram_get_seq (which may invalidate it).
*
* @return
* Returns record pointer on success (do not free);
* NULL on failure
*/
cram_record *cram_get_seq(cram_fd *fd);
/*! Read the next cram record and convert it to a bam_seq_t struct.
*
* @return
* Returns 0 on success;
* -1 on EOF or failure (check fd->err)
*
* NOTE(review): the implementation returns the result of its internal
* cram-to-bam conversion, which is documented as the used size of the
* bam record, so callers should presumably test for -1 rather than for
* a value of exactly 0 -- confirm.
*/
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam);
/* ----------------------------------------------------------------------
* Internal functions
*/
/*! INTERNAL:
* Decodes a CRAM block compression header.
*
* @return
* Returns header ptr on success;
* NULL on failure
*/
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
cram_block *b);
/*! INTERNAL:
* Decodes a CRAM (un)mapped slice header block.
*
* @return
* Returns slice header ptr on success;
* NULL on failure
*/
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
/*! INTERNAL:
* Decode an entire slice from container blocks. Fills out s->crecs[] array.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
SAM_hdr *hdr);
/* Close the extern "C" block opened above */
#ifdef __cplusplus
}
#endif
/* End of include guard _CRAM_READ_H_ */
#endif
htslib-1.2.1/cram/cram_encode.c 0000664 0000000 0000000 00000250640 12464172677 0016354 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "cram/cram.h"
#include "cram/os.h"
#include "cram/md5.h"
#define Z_CRAM_STRAT Z_FILTERED
//#define Z_CRAM_STRAT Z_RLE
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
/*
 * Forward declaration; the definition is not in this part of the file.
 * NOTE(review): presumably converts the BAM record 'b' into the cram
 * record 'cr' (record number 'rnum') of slice 's' -- confirm against
 * the definition.
 */
static int process_one_read(cram_fd *fd, cram_container *c,
cram_slice *s, cram_record *cr,
bam_seq_t *b, int rnum);
/*
 * Returns the index of the first occurrence of val in the NUL-terminated
 * string key.
 *
 * Unlike strchr(key, val)-key, a value that is not present does not
 * produce a NULL-derived result: the length of key is returned instead
 * (one past the last valid index), so callers packing the result into a
 * fixed number of bits must ensure val is present.
 */
static int sub_idx(const char *key, char val) {
    int i = 0;

    while (key[i] != '\0' && key[i] != val)
        i++;

    return i;
}
/*
* Encodes a compression header block into a generic cram_block structure.
*
* Returns cram_block ptr on success
* NULL on failure
*/
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
cram_block_compression_hdr *h) {
cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
int i, mc;
if (!cb || !map)
return NULL;
/*
* This is a concatenation of several blocks of data:
* header + landmarks, preservation map, read encoding map, and the tag
* encoding map.
* All 4 are variable sized and we need to know how large these are
* before creating the compression header itself as this starts with
* the total size (stored as a variable length string).
*/
// Duplicated from container itself, and removed in 1.1
if (CRAM_MAJOR_VERS(fd->version) == 1) {
itf8_put_blk(cb, h->ref_seq_id);
itf8_put_blk(cb, h->ref_seq_start);
itf8_put_blk(cb, h->ref_seq_span);
itf8_put_blk(cb, h->num_records);
itf8_put_blk(cb, h->num_landmarks);
for (i = 0; i < h->num_landmarks; i++) {
itf8_put_blk(cb, h->landmark[i]);
}
}
/* Create in-memory preservation map */
/* FIXME: should create this when we create the container */
{
khint_t k;
int r;
if (!(h->preservation_map = kh_init(map)))
return NULL;
k = kh_put(map, h->preservation_map, "RN", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
if (CRAM_MAJOR_VERS(fd->version) == 1) {
k = kh_put(map, h->preservation_map, "PI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "UI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
k = kh_put(map, h->preservation_map, "MI", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 1;
} else {
// Technically SM was in 1.0, but wasn't in Java impl.
k = kh_put(map, h->preservation_map, "SM", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "TD", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
k = kh_put(map, h->preservation_map, "AP", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = c->pos_sorted;
if (fd->no_ref || fd->embed_ref) {
// Reference Required == No
k = kh_put(map, h->preservation_map, "RR", &r);
if (-1 == r) return NULL;
kh_val(h->preservation_map, k).i = 0;
}
}
}
/* Encode preservation map; could collapse this and above into one */
mc = 0;
BLOCK_SIZE(map) = 0;
if (h->preservation_map) {
khint_t k;
for (k = kh_begin(h->preservation_map);
k != kh_end(h->preservation_map);
k++) {
const char *key;
khash_t(map) *pmap = h->preservation_map;
if (!kh_exist(pmap, k))
continue;
key = kh_key(pmap, k);
BLOCK_APPEND(map, key, 2);
switch(CRAM_KEY(key[0], key[1])) {
case CRAM_KEY('M','I'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('U','I'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('P','I'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('A','P'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('R','N'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('R','R'):
BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
break;
case CRAM_KEY('S','M'): {
char smat[5], *mp = smat;
*mp++ =
(sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
(sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
(sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
(sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
*mp++ =
(sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
(sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
(sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
(sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
*mp++ =
(sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
(sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
(sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
(sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
*mp++ =
(sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
(sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
(sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
(sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
*mp++ =
(sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
(sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
(sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
(sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
BLOCK_APPEND(map, smat, 5);
break;
}
case CRAM_KEY('T','D'): {
itf8_put_blk(map, BLOCK_SIZE(h->TD_blk));
BLOCK_APPEND(map,
BLOCK_DATA(h->TD_blk),
BLOCK_SIZE(h->TD_blk));
break;
}
default:
fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
break;
}
mc++;
}
}
itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
itf8_put_blk(cb, mc);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
/* rec encoding map */
mc = 0;
BLOCK_SIZE(map) = 0;
if (h->codecs[DS_BF]) {
if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_CF]) {
if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RL]) {
if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_AP]) {
if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RG]) {
if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_MF]) {
if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NS]) {
if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NP]) {
if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TS]) {
if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_NF]) {
if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TC]) {
if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TN]) {
if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TL]) {
if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FN]) {
if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FC]) {
if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_FP]) {
if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BS]) {
if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_IN]) {
if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_DL]) {
if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BA]) {
if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_BB]) {
if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_MQ]) {
if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RN]) {
if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_QS]) {
if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_QQ]) {
if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RI]) {
if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
fd->version))
return NULL;
mc++;
}
if (CRAM_MAJOR_VERS(fd->version) != 1) {
if (h->codecs[DS_SC]) {
if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_RS]) {
if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_PD]) {
if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_HC]) {
if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
fd->version))
return NULL;
mc++;
}
}
if (h->codecs[DS_TM]) {
if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
fd->version))
return NULL;
mc++;
}
if (h->codecs[DS_TV]) {
if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
fd->version))
return NULL;
mc++;
}
itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
itf8_put_blk(cb, mc);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
/* tag encoding map */
#if 0
mp = map; mc = 0;
if (h->tag_encoding_map) {
HashItem *hi;
HashIter *iter = HashTableIterCreate();
if (!iter)
return NULL;
while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
cram_map *m = hi->data.p;
int sz;
mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
return NULL;
mp += sz;
mc++;
}
HashTableIterDestroy(iter);
}
#else
mc = 0;
BLOCK_SIZE(map) = 0;
if (c->tags_used) {
khint_t k;
#define TAG_ID(a) ((#a[0]<<8)+#a[1])
for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
int key;
if (!kh_exist(c->tags_used, k))
continue;
mc++;
itf8_put_blk(map, kh_key(c->tags_used, k));
// use block content id 4
switch((key = kh_key(c->tags_used, k)) & 0xff) {
case 'Z': case 'H':
// string as byte_array_stop
if (CRAM_MAJOR_VERS(fd->version) == 1) {
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\005" // len
"\t" // stop-byte is also SAM separator
DS_aux_S "\000\000\000",
7);
} else {
if (key>>8 == TAG_ID(OQ))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_OQ_S,
4);
else if (key>>8 == TAG_ID(BQ))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_BQ_S,
4);
else if (key>>8 == TAG_ID(BD))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_BD_S,
4);
else if (key>>8 == TAG_ID(BI))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_BI_S,
4);
else if ((key>>8 == TAG_ID(Q2)) ||
(key>>8 == TAG_ID(U2)) ||
(key>>8 == TAG_ID(QT)) ||
(key>>8 == TAG_ID(CQ)))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_oq_S,
4);
else if ((key>>8 == TAG_ID(R2)) ||
(key>>8 == TAG_ID(E2)) ||
(key>>8 == TAG_ID(CS)) ||
(key>>8 == TAG_ID(BC)) ||
(key>>8 == TAG_ID(RT)))
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_os_S,
4);
else
BLOCK_APPEND(map,
"\005" // BYTE_ARRAY_STOP
"\002" // len
"\t" // stop-byte is also SAM separator
DS_aux_oz_S,
4);
}
break;
case 'A': case 'c': case 'C':
// byte array len, 1 byte
BLOCK_APPEND(map,
"\004" // BYTE_ARRAY_LEN
"\011" // length
"\003" // HUFFMAN (len)
"\004" // huffman-len
"\001" // 1 symbol
"\001" // symbol=1 byte value
"\001" // 1 length
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
DS_aux_S,// content-id
11);
break;
case 's': case 'S':
// byte array len, 2 byte
BLOCK_APPEND(map,
"\004" // BYTE_ARRAY_LEN
"\011" // length
"\003" // HUFFMAN (len)
"\004" // huffman-len
"\001" // 1 symbol
"\002" // symbol=2 byte value
"\001" // 1 length
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
DS_aux_S,// content-id
11);
break;
case 'i': case 'I': case 'f':
// byte array len, 4 byte
BLOCK_APPEND(map,
"\004" // BYTE_ARRAY_LEN
"\011" // length
"\003" // HUFFMAN (len)
"\004" // huffman-len
"\001" // 1 symbol
"\004" // symbol=4 byte value
"\001" // 1 length
"\000" // length=0
"\001" // EXTERNAL (val)
"\001" // external-len
DS_aux_S,// content-id
11);
break;
case 'B':
// Byte array of variable size, but we generate our tag
// byte stream at the wrong stage (during reading and not
// after slice header construction). So we use
// BYTE_ARRAY_LEN with the length codec being external
// too.
if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM)))
BLOCK_APPEND(map,
"\004" // BYTE_ARRAY_LEN
"\006" // length
"\001" // EXTERNAL (len)
"\001" // external-len
DS_aux_FZ_S // content-id
"\001" // EXTERNAL (val)
"\001" // external-len
DS_aux_FZ_S,// content-id
8);
else
BLOCK_APPEND(map,
"\004" // BYTE_ARRAY_LEN
"\006" // length
"\001" // EXTERNAL (len)
"\001" // external-len
DS_aux_S // content-id
"\001" // EXTERNAL (val)
"\001" // external-len
DS_aux_S,// content-id
8);
break;
default:
fprintf(stderr, "Unsupported SAM aux type '%c'\n",
kh_key(c->tags_used, k) & 0xff);
}
//mp += m->codec->store(m->codec, mp, NULL, fd->version);
}
}
#endif
itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
itf8_put_blk(cb, mc);
BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
if (fd->verbose)
fprintf(stderr, "Wrote compression block header in %d bytes\n",
(int)BLOCK_SIZE(cb));
BLOCK_UPLEN(cb);
cram_free_block(map);
return cb;
}
/*
 * Serialises the header of a slice into a newly allocated block.
 *
 * Fields are emitted in this order: reference id, reference start,
 * reference span, record count, record counter (ITF8 for major version 2,
 * LTF8 for major version 3 and above, omitted otherwise), block count,
 * content-id count followed by each content id, the reference base id
 * (only when the header's content type is MAPPED_SLICE) and, for every
 * major version other than 1, the 16 byte MD5.
 *
 * Returns cram_block on success
 *         NULL on failure
 */
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
    const int major = CRAM_MAJOR_VERS(fd->version);
    /* Size of the scratch buffer; re-checked by the assert below. */
    const int capacity = 16 + 5 * (8 + s->hdr->num_blocks);
    cram_block *blk;
    char *start, *out;
    int idx;

    blk = cram_new_block(MAPPED_SLICE, 0);
    if (blk == NULL)
        return NULL;

    start = malloc(capacity);
    if (start == NULL) {
        cram_free_block(blk);
        return NULL;
    }
    out = start;

    out += itf8_put(out, s->hdr->ref_seq_id);
    out += itf8_put(out, s->hdr->ref_seq_start);
    out += itf8_put(out, s->hdr->ref_seq_span);
    out += itf8_put(out, s->hdr->num_records);

    /* The record counter has a version dependent width. */
    if (major >= 3) {
        out += ltf8_put(out, s->hdr->record_counter);
    } else if (major == 2) {
        out += itf8_put(out, s->hdr->record_counter);
    }

    out += itf8_put(out, s->hdr->num_blocks);
    out += itf8_put(out, s->hdr->num_content_ids);
    for (idx = 0; idx < s->hdr->num_content_ids; idx++)
        out += itf8_put(out, s->hdr->block_content_ids[idx]);

    if (s->hdr->content_type == MAPPED_SLICE)
        out += itf8_put(out, s->hdr->ref_base_id);

    if (major != 1) {
        memcpy(out, s->hdr->md5, 16);
        out += 16;
    }

    assert(out - start <= capacity);

    /* The block takes over the scratch buffer. */
    blk->data = (unsigned char *)start;
    blk->uncomp_size = out - start;
    blk->comp_size = blk->uncomp_size;

    return blk;
}
/*
 * Encodes a single read by pushing each of its fields through the codec
 * registered in the compression header for the matching data series.
 *
 * last_pos is read and updated only when the container is position sorted,
 * in which case the alignment position is stored as a delta from it.
 *
 * Returns 0 on success
 *        -1 on failure (any codec reported an error, or an unknown
 *           feature code was met)
 */
static int cram_encode_slice_read(cram_fd *fd,
                                  cram_container *c,
                                  cram_block_compression_hdr *h,
                                  cram_slice *s,
                                  cram_record *cr,
                                  int *last_pos) {
/* Push n items starting at ptr through the codec of data series ds. */
#define PUT_SERIES(ds, ptr, n) \
    (h->codecs[ds]->encode(s, h->codecs[ds], (char *)(ptr), (n)))

    const int is_v1 = (CRAM_MAJOR_VERS(fd->version) == 1);
    int status = 0;
    int32_t val32;
    unsigned char byte;
    int k, prev_pos;

    /* BAM flags, remapped through the per-file swap table. */
    val32 = fd->cram_flag_swap[cr->flags & 0xfff];
    status |= PUT_SERIES(DS_BF, &val32, 1);

    val32 = cr->cram_flags;
    status |= PUT_SERIES(DS_CF, &val32, 1);

    /* Per-read reference id, only for slices flagged with ref_seq_id -2. */
    if (!is_v1 && s->hdr->ref_seq_id == -2)
        status |= PUT_SERIES(DS_RI, &cr->ref_id, 1);

    status |= PUT_SERIES(DS_RL, &cr->len, 1);

    /* Position: delta against the previous read when sorted, else absolute. */
    val32 = c->pos_sorted ? cr->apos - *last_pos : cr->apos;
    status |= PUT_SERIES(DS_AP, &val32, 1);
    if (c->pos_sorted)
        *last_pos = cr->apos;

    status |= PUT_SERIES(DS_RG, &cr->rg, 1);

    /* Read names (RN) are not emitted here: already stored in their block. */

    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
        val32 = cr->mate_flags;
        status |= PUT_SERIES(DS_MF, &val32, 1);
        status |= PUT_SERIES(DS_NS, &cr->mate_ref_id, 1);
        status |= PUT_SERIES(DS_NP, &cr->mate_pos, 1);
        status |= PUT_SERIES(DS_TS, &cr->tlen, 1);
    } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
        status |= PUT_SERIES(DS_NF, &cr->mate_line, 1);
    }

    /* Aux tags: count plus ids for version 1, a single TL value otherwise. */
    if (is_v1) {
        byte = cr->ntags;
        status |= PUT_SERIES(DS_TC, &byte, 1);
        for (k = 0; k < cr->ntags; k++) {
            uint32_t tag_id = s->TN[cr->TN_idx + k];
            status |= PUT_SERIES(DS_TN, &tag_id, 1);
        }
    } else {
        status |= PUT_SERIES(DS_TL, &cr->TL, 1);
    }

    /* Qualities (QS) are not emitted here: already stored in their block. */

    /* Unmapped reads carry their bases verbatim and have no features. */
    if (cr->flags & BAM_FUNMAP) {
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
        status |= PUT_SERIES(DS_BA, seq, cr->len);
        return status ? -1 : 0;
    }

    /* Mapped reads: feature count, then each feature, then mapping quality. */
    status |= PUT_SERIES(DS_FN, &cr->nfeature, 1);

    prev_pos = 0;
    for (k = 0; k < cr->nfeature; k++) {
        cram_feature *feat = &s->features[cr->feature + k];

        byte = feat->X.code;
        status |= PUT_SERIES(DS_FC, &byte, 1);

        /* Feature positions are stored relative to the previous feature. */
        val32 = feat->X.pos - prev_pos;
        status |= PUT_SERIES(DS_FP, &val32, 1);
        prev_pos = feat->X.pos;

        switch (feat->X.code) {
        case 'X':   /* substitution code */
            byte = feat->X.base;
            status |= PUT_SERIES(DS_BS, &byte, 1);
            break;

        case 'S':   /* soft clip: bases already appended to their block */
        case 'I':   /* multi-base insertion: likewise already stored */
        case 'Q':   /* quality: already appended to the quality block */
            break;

        case 'i':   /* single base insertion */
            byte = feat->i.base;
            status |= PUT_SERIES(DS_BA, &byte, 1);
            break;

        case 'D':   /* deletion length */
            val32 = feat->D.len;
            status |= PUT_SERIES(DS_DL, &val32, 1);
            break;

        case 'B':
            /*
             * Used when we try to store a non ACGTN base or an N that
             * aligns against a non ACGTN reference.  Only the base goes
             * through a codec here; the quality was already added.
             */
            byte = feat->B.base;
            status |= PUT_SERIES(DS_BA, &byte, 1);
            break;

        case 'b':   /* string of bases taken from the sequence block */
            status |= PUT_SERIES(DS_BB,
                                 (char *)BLOCK_DATA(s->seqs_blk)
                                     + feat->b.seq_idx,
                                 feat->b.len);
            break;

        case 'N':   /* reference skip length */
            val32 = feat->N.len;
            status |= PUT_SERIES(DS_RS, &val32, 1);
            break;

        case 'P':   /* padding length */
            val32 = feat->P.len;
            status |= PUT_SERIES(DS_PD, &val32, 1);
            break;

        case 'H':   /* hard clip length */
            val32 = feat->H.len;
            status |= PUT_SERIES(DS_HC, &val32, 1);
            break;

        default:
            fprintf(stderr, "unhandled feature code %c\n",
                    feat->X.code);
            return -1;
        }
    }

    status |= PUT_SERIES(DS_MQ, &cr->mqual, 1);

    return status ? -1 : 0;
}
#undef PUT_SERIES
/*
 * Applies various compression methods to specific blocks, depending on
 * known observations of how data series compress.
 *
 * The set of candidate methods is kept as a bit mask ("method"); a second
 * mask ("methodF") is used for the quality block at level 1 and for the
 * final sweep over blocks that are still RAW.
 *
 * NOTE(review): several lines of this function (the "method" initialiser,
 * the use_bz2 / use_rans / use_lzma updates, the "methodF" computation and
 * the DS_RN call) have visibly lost text between a '<' and a later '>' -
 * presumably "<<" shift expressions and the statements between them. The
 * code below is reproduced exactly as found and will not compile until
 * those lines are restored from the upstream source.
 *
 * Returns 0 on success
 * -1 on failure
 */
static int cram_compress_slice(cram_fd *fd, cram_slice *s) {
int level = fd->level, i;
// NOTE(review): mangled line - looks like the method-mask initialiser fused
// with the condition that guards compression of the CORE block (block[0]).
int method = 1< 5 && s->block[0]->uncomp_size > 500)
cram_compress_block(fd, s->block[0], NULL, GZIP, 1);
if (fd->use_bz2)
// NOTE(review): the next four lines are mangled in the same way; they appear
// to add optional methods to the mask and then derive methodF from it.
method |= 1<use_rans)
method |= (1<use_lzma)
method |= (1<= 6)
methodF = method;
/* Specific compression methods for certain block types */
if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
method, level))
return -1;
// The remaining per-series choices depend on the requested level.
if (fd->level == 0) {
/* Do nothing */
} else if (fd->level == 1) {
// Level 1: qualities use the methodF mask; aux blocks use the full mask.
// Both are compressed at level 1.
if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
methodF, 1))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block(fd, s->block[i], fd->m[i],
method, 1))
return -1;
}
} else if (fd->level < 3) {
// Level 2: QS, BA and (if present) BB at level 1; aux blocks at "level".
if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
method, 1))
return -1;
if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
method, 1))
return -1;
if (s->block[DS_BB])
if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
method, 1))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block(fd, s->block[i], fd->m[i],
method, level))
return -1;
}
} else {
// Level 3 and above: same blocks as level 2, all at the requested level.
if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
method, level))
return -1;
if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
method, level))
return -1;
if (s->block[DS_BB])
if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
method, level))
return -1;
for (i = DS_aux; i <= DS_aux_oz; i++) {
if (s->block[i])
if (cram_compress_block(fd, s->block[i], fd->m[i],
method, level))
return -1;
}
}
// NAME: best is generally xz, bzip2, zlib then rans1
// It benefits well from a little bit extra compression level.
// NOTE(review): the argument list below is mangled - the DS_RN call appears
// to have been fused with the guard that skips DS_NS when it aliases CORE.
if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
method & ~(1<block[DS_NS] != s->block[0])
if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
method, level))
return -1;
/*
 * Minimal compression of any block still uncompressed, bar CORE
 */
{
int i;
for (i = 1; i < DS_END; i++) {
// Skip unused slots and slots that alias the CORE block.
if (!s->block[i] || s->block[i] == s->block[0])
continue;
// fast methods only
// The return value of this call is ignored, so a failure here does not
// fail the function.
if (s->block[i]->method == RAW) {
cram_compress_block(fd, s->block[i], fd->m[i],
methodF, level);
}
}
}
return 0;
}
/*
 * Encodes a single slice from a container.
 *
 * Steps, in order:
 *  1. fill in the slice header fields that depend on the container
 *     (record counter, embedded reference block id);
 *  2. allocate the per-data-series block table and the CORE block;
 *  3. create an external block for every data series whose codec writes
 *     to one, and alias every other series in the DS_BF..DS_TN range to
 *     the CORE block;
 *  4. encode every record of the slice;
 *  5. attach the blocks that were filled while the records were gathered
 *     (bases, qualities, names, soft clips, aux data);
 *  6. compress the blocks, drop empty ones, compact the table and build
 *     the slice header block.
 *
 * On failure no cleanup is done here; whatever was allocated so far stays
 * attached to the slice.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int cram_encode_slice(cram_fd *fd, cram_container *c,
                             cram_block_compression_hdr *h, cram_slice *s) {
    int rec, last_pos;
    int embed_ref;
    enum cram_DS_ID id;

    embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;

    /*
     * Slice external blocks (legacy summary.
     * NOTE(review): verify against the DS_* enumeration):
     * ID 0 => base calls (insertions, soft-clip)
     * ID 1 => qualities
     * ID 2 => names
     * ID 3 => TS (insert size), NP (next frag)
     * ID 4 => tag values
     * ID 6 => tag IDs (TN), if CRAM_V1.0
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
     */

    /* Create cram slice header */
    s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
    s->hdr->record_counter = c->num_records + c->record_counter;
    c->num_records += s->hdr->num_records;

    /* One slot per data series; unused slots stay NULL thanks to calloc. */
    s->block = calloc(DS_END, sizeof(s->block[0]));
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
    if (!s->block || !s->hdr->block_content_ids)
        return -1;

    // Create first fixed blocks, always external.
    // CORE
    if (!(s->block[0] = cram_new_block(CORE, 0)))
        return -1;

    // TN block for CRAM v1
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL, DS_TN)))
                return -1;
            h->codecs[DS_TN]->external.content_id = DS_TN;
        } else {
            s->block[DS_TN] = s->block[0];
        }
    }

    // Embedded reference
    if (embed_ref) {
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
            return -1;
        s->ref_id = DS_ref; // needed?
        BLOCK_APPEND(s->block[DS_ref],
                     c->ref + c->first_base - c->ref_start,
                     c->last_base - c->first_base + 1);
    }

    /*
     * All the data-series blocks if appropriate.
     */
    for (id = DS_BF; id < DS_TN; id++) {
        if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL ||
                              h->codecs[id]->codec == E_BYTE_ARRAY_STOP ||
                              h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) {
            switch (h->codecs[id]->codec) {
            case E_EXTERNAL:
                if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
                    return -1;
                h->codecs[id]->external.content_id = id;
                break;

            case E_BYTE_ARRAY_STOP:
                if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
                    return -1;
                h->codecs[id]->byte_array_stop.content_id = id;
                break;

            case E_BYTE_ARRAY_LEN: {
                cram_codec *cc;

                /* Length sub-codec: always gets a fresh block. */
                cc = h->codecs[id]->e_byte_array_len.len_codec;
                if (cc->codec == E_EXTERNAL) {
                    int eid = cc->external.content_id;
                    if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
                        return -1;
                    cc->external.content_id = eid;
                    cc->out = s->block[eid];
                }

                /* Value sub-codec: reuses the block if one already exists. */
                cc = h->codecs[id]->e_byte_array_len.val_codec;
                if (cc->codec == E_EXTERNAL) {
                    int eid = cc->external.content_id;
                    if (!s->block[eid])
                        if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
                            return -1;
                    cc->external.content_id = eid;
                    cc->out = s->block[eid];
                }
                break;
            }

            default:
                break;
            }
        } else {
            /*
             * Everything else shares the CORE block, except DS_BB when it
             * has no codec at all, which keeps a NULL slot.
             */
            if (!(id == DS_BB && !h->codecs[DS_BB]))
                s->block[id] = s->block[0];
        }
        if (h->codecs[id])
            h->codecs[id]->out = s->block[id];
    }

    /* Encode reads */
    last_pos = s->hdr->ref_seq_start;
    for (rec = 0; rec < s->hdr->num_records; rec++) {
        cram_record *cr = &s->crecs[rec];
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
            return -1;
    }

    /* CORE size in bytes; one extra byte when "bit" is below 7. */
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
    s->block[0]->comp_size = s->block[0]->uncomp_size;

    // Make sure the fixed blocks point to the correct sources.
    // Each source pointer is cleared once the block table holds the block.
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
    s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL;
    s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL;
    s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL;
    s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL;
    s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL;
    s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL;
    s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL;
    s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL;
    s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL;

    // Ensure block sizes are up to date.
    for (id = 1; id < DS_END; id++) {
        if (!s->block[id] || s->block[id] == s->block[0])
            continue;
        if (s->block[id]->uncomp_size == 0)
            BLOCK_UPLEN(s->block[id]);
    }

    // Compress it all
    if (cram_compress_slice(fd, s) == -1)
        return -1;

    // Collapse empty blocks and create hdr_block
    {
        int i, j;

        /*
         * In-place compaction: slot 0 is CORE, j is the next free slot.
         * Slots that are NULL, alias CORE or hold an empty block are
         * dropped; survivors are moved down and their content id recorded.
         */
        for (i = j = 1; i < DS_END; i++) {
            if (!s->block[i] || s->block[i] == s->block[0])
                continue;
            if (s->block[i]->uncomp_size == 0) {
                cram_free_block(s->block[i]);
                s->block[i] = NULL;
                continue;
            }
            s->block[j] = s->block[i];
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
            j++;
        }
        s->hdr->num_content_ids = j-1;
        s->hdr->num_blocks = j;

        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
            return -1;
    }

    return 0;
}
/*
 * Encodes all slices in a container into blocks.
 *
 * Steps, in order:
 *  1. load the reference(s) the container needs;
 *  2. convert every queued BAM record into a cram_record, slice by slice,
 *     gathering the statistics used to choose codecs;
 *  3. hand the bams[] array back to the file's spare list;
 *  4. compute the per-slice MD5 (major versions other than 1);
 *  5. build a codec for every data series;
 *  6. encode each slice and the compression header;
 *  7. compute the slice landmarks and the container length;
 *  8. release the references taken in step 1.
 *
 * Early error returns do not undo work already done (for example
 * reference counts that were taken).
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_encode_container(cram_fd *fd, cram_container *c) {
/* Builds the codec of data series ds from that series' statistics. */
#define STATS_CODEC(ds, type) \
    (h->codecs[ds] = cram_encoder_init(cram_stats_encoding(fd, c->stats[ds]), \
                                       c->stats[ds], (type), NULL,            \
                                       fd->version))

    int i, j, slice_offset;
    cram_block_compression_hdr *h = c->comp_hdr;
    cram_block *c_hdr;
    int multi_ref = 0;
    int r1, r2, sn, nref;
    spare_bams *spares;

    /* Cache references up-front if we have unsorted access patterns */
    pthread_mutex_lock(&fd->ref_lock);
    nref = fd->refs->nref;
    pthread_mutex_unlock(&fd->ref_lock);

    if (!fd->no_ref && c->refs_used) {
        for (i = 0; i < nref; i++) {
            if (c->refs_used[i])
                cram_get_ref(fd, i, 1, 0);
        }
    }

    /* To create M5 strings */
    /* Fetch reference sequence */
    if (!fd->no_ref) {
        bam_seq_t *b = c->bams[0];
        char *ref;

        ref = cram_get_ref(fd, bam_ref(b), 1, 0);
        if (!ref && bam_ref(b) >= 0) {
            fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
            return -1;
        }
        if ((c->ref_id = bam_ref(b)) >= 0) {
            c->ref_seq_id = c->ref_id;
            c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
            c->ref_start = 1;
            c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
        } else {
            c->ref_seq_id = c->ref_id; // FIXME remove one var!
        }
    } else {
        c->ref_id = bam_ref(c->bams[0]);
        cram_ref_incr(fd->refs, c->ref_id);
        c->ref_seq_id = c->ref_id;
    }

    /* Turn bams into cram_records and gather basic stats */
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
        cram_slice *s = c->slices[sn];
        int first_base = INT_MAX, last_base = INT_MIN;

        assert(sn < c->curr_slice);

        /* FIXME: we could create our slice objects here too instead of
         * in cram_put_bam_seq. It's more natural here and also this
         * bit is threaded so it's less work in the main thread.
         */

        for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) {
            cram_record *cr = &s->crecs[r2];
            bam_seq_t *b = c->bams[r1];

            /* If multi-ref we need to cope with changing reference per seq */
            if (c->multi_seq && !fd->no_ref) {
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
                    if (c->ref_seq_id >= 0)
                        cram_ref_decr(fd->refs, c->ref_seq_id);

                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
                        fprintf(stderr, "Failed to load reference #%d\n",
                                bam_ref(b));
                        return -1;
                    }

                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
                    assert(fd->refs->ref_id[c->ref_seq_id]->seq);
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
                    c->ref_start = 1;
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
                }
            }

            process_one_read(fd, c, s, cr, b, r2);

            /* Track the reference range covered by this slice. */
            if (first_base > cr->apos)
                first_base = cr->apos;

            if (last_base < cr->aend)
                last_base = cr->aend;
        }

        if (c->multi_seq) {
            s->hdr->ref_seq_id    = -2;
            s->hdr->ref_seq_start = 0;
            s->hdr->ref_seq_span  = 0;
        } else {
            s->hdr->ref_seq_id    = c->ref_id;
            s->hdr->ref_seq_start = first_base;
            s->hdr->ref_seq_span  = last_base - first_base + 1;
        }
        s->hdr->num_records = r2;
    }

    if (c->multi_seq && !fd->no_ref) {
        if (c->ref_seq_id >= 0)
            cram_ref_decr(fd->refs, c->ref_seq_id);
    }

    /* Link our bams[] array onto the spare bam list for reuse */
    spares = malloc(sizeof(*spares));
    if (!spares)
        return -1; /* previously dereferenced without a check */
    pthread_mutex_lock(&fd->bam_list_lock);
    spares->bams = c->bams;
    spares->next = fd->bl;
    fd->bl = spares;
    pthread_mutex_unlock(&fd->bam_list_lock);
    c->bams = NULL;

    /* Detect if a multi-seq container */
    cram_stats_encoding(fd, c->stats[DS_RI]);
    multi_ref = c->stats[DS_RI]->nvals > 1;

    if (multi_ref) {
        if (fd->verbose)
            fprintf(stderr, "Multi-ref container\n");
        c->ref_seq_id = -2;
        c->ref_seq_start = 0;
        c->ref_seq_span = 0;
    }

    /* Compute MD5s */
    for (i = 0; i < c->curr_slice; i++) {
        cram_slice *s = c->slices[i];

        if (CRAM_MAJOR_VERS(fd->version) != 1) {
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
                MD5_CTX md5;
                MD5_Init(&md5);
                MD5_Update(&md5,
                           c->ref + s->hdr->ref_seq_start - c->ref_start,
                           s->hdr->ref_seq_span);
                MD5_Final(s->hdr->md5, &md5);
            } else {
                memset(s->hdr->md5, 0, 16);
            }
        }
    }

    /* Reset the running totals; cram_encode_slice accumulates num_records. */
    c->num_records = 0;
    c->num_blocks = 0;
    c->length = 0;

    /*
     * Build one codec per data series.  Most are chosen from the
     * statistics gathered above; a few are fixed.
     */
    STATS_CODEC(DS_BF, E_INT);
    STATS_CODEC(DS_CF, E_INT);

    /* RN is set up further down as a fixed BYTE_ARRAY_STOP codec. */

    if (c->pos_sorted) {
        STATS_CODEC(DS_AP, E_INT);
    } else {
        /* Unsorted: absolute positions, BETA codec given {0, max_apos}. */
        int p[2] = {0, c->max_apos};
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
                                             fd->version);
    }

    STATS_CODEC(DS_RG, E_INT);
    STATS_CODEC(DS_MQ, E_INT);
    STATS_CODEC(DS_NS, E_INT);
    STATS_CODEC(DS_MF, E_INT);
    STATS_CODEC(DS_TS, E_INT);
    STATS_CODEC(DS_NP, E_INT);
    STATS_CODEC(DS_NF, E_INT);
    STATS_CODEC(DS_RL, E_INT);
    STATS_CODEC(DS_FN, E_INT);
    STATS_CODEC(DS_FC, E_BYTE);
    STATS_CODEC(DS_FP, E_INT);
    STATS_CODEC(DS_DL, E_INT);
    STATS_CODEC(DS_BA, E_BYTE);

    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        /* BB: lengths and values both go to external blocks. */
        cram_byte_array_len_encoder e;

        e.len_encoding = E_EXTERNAL;
        e.len_dat = (void *)DS_BB_len;
        //e.len_dat = (void *)DS_BB;

        e.val_encoding = E_EXTERNAL;
        e.val_dat = (void *)DS_BB;

        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
                                             E_BYTE_ARRAY, (void *)&e,
                                             fd->version);
    } else {
        h->codecs[DS_BB] = NULL;
    }

    STATS_CODEC(DS_BS, E_BYTE);

    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        h->codecs[DS_TL] = NULL;
        h->codecs[DS_RI] = NULL;
        h->codecs[DS_RS] = NULL;
        h->codecs[DS_PD] = NULL;
        h->codecs[DS_HC] = NULL;
        h->codecs[DS_SC] = NULL;

        STATS_CODEC(DS_TC, E_BYTE);
        STATS_CODEC(DS_TN, E_INT);
    } else {
        h->codecs[DS_TC] = NULL;
        h->codecs[DS_TN] = NULL;

        STATS_CODEC(DS_TL, E_INT);
        STATS_CODEC(DS_RI, E_INT);
        STATS_CODEC(DS_RS, E_INT);
        STATS_CODEC(DS_PD, E_INT);
        STATS_CODEC(DS_HC, E_INT);

        /*
         * SC uses BYTE_ARRAY_STOP.  A BYTE_ARRAY_LEN codec with external
         * length (DS_SC_len) and value (DS_SC) blocks was tried and showed
         * no practical benefit, but it may work better if we start mixing
         * SC, IN and BB elements into the same external block.
         */
        {
            int i2[2] = {0, DS_SC};

            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
                                                 E_BYTE_ARRAY, (void *)i2,
                                                 fd->version);
        }
    }

    /* IN */
    {
        int i2[2] = {0, DS_IN};
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
                                             E_BYTE_ARRAY, (void *)i2,
                                             fd->version);
    }

    /* QS */
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
                                         (void *)DS_QS,
                                         fd->version);

    /* RN */
    {
        int i2[2] = {0, DS_RN};
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
                                             E_BYTE_ARRAY, (void *)i2,
                                             fd->version);
    }

    /* Encode slices */
    for (i = 0; i < c->curr_slice; i++) {
        if (fd->verbose)
            fprintf(stderr, "Encode slice %d\n", i);
        if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
            return -1;
    }

    /* Create compression header */
    {
        h->ref_seq_id    = c->ref_seq_id;
        h->ref_seq_start = c->ref_seq_start;
        h->ref_seq_span  = c->ref_seq_span;
        h->num_records   = c->num_records;

        h->mapped_qs_included = 0;   // fixme
        h->unmapped_qs_included = 0; // fixme
        // h->...  fixme
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);

        if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
            return -1;
    }

    /* Compute landmarks */
    /* Fill out slice landmarks */
    c->num_landmarks = c->curr_slice;
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
    if (!c->landmark) {
        /* c_hdr is not yet owned by the container, so release it here. */
        cram_free_block(c_hdr);
        return -1;
    }

    /*
     * Slice offset starts after the first block, so we need to simulate
     * writing it to work out the correct offset
     */
    {
        slice_offset = c_hdr->method == RAW
            ? c_hdr->uncomp_size
            : c_hdr->comp_size;
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
            itf8_size(c_hdr->content_id) +
            itf8_size(c_hdr->comp_size) +
            itf8_size(c_hdr->uncomp_size);
    }

    /* The container's reference range starts out as the first slice's. */
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
    c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
    c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
    for (i = 0; i < c->curr_slice; i++) {
        cram_slice *s = c->slices[i];

        c->num_blocks += s->hdr->num_blocks + 2;
        c->landmark[i] = slice_offset;

        /* Widen the container span if this slice ends further right. */
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
            c->ref_seq_start + c->ref_seq_span) {
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
                - c->ref_seq_start;
        }

        /* Slice header block: payload, then its own block header. */
        slice_offset += s->hdr_block->method == RAW
            ? s->hdr_block->uncomp_size
            : s->hdr_block->comp_size;

        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
            itf8_size(s->hdr_block->content_id) +
            itf8_size(s->hdr_block->comp_size) +
            itf8_size(s->hdr_block->uncomp_size);

        /* Every data block of the slice, header plus payload. */
        for (j = 0; j < s->hdr->num_blocks; j++) {
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
                itf8_size(s->block[j]->content_id) +
                itf8_size(s->block[j]->comp_size) +
                itf8_size(s->block[j]->uncomp_size);

            slice_offset += s->block[j]->method == RAW
                ? s->block[j]->uncomp_size
                : s->block[j]->comp_size;
        }
    }
    c->length += slice_offset; // just past the final slice

    c->comp_hdr_block = c_hdr;

    if (c->ref_seq_id >= 0) {
        cram_ref_decr(fd->refs, c->ref_seq_id);
    }

    /*
     * Release the references cached up-front.  The same cached nref bound
     * as the acquiring loop is used so that exactly the references taken
     * there are released, without re-reading fd->refs->nref unlocked.
     */
    if (!fd->no_ref && c->refs_used) {
        for (i = 0; i < nref; i++) {
            if (c->refs_used[i])
                cram_ref_decr(fd->refs, i);
        }
    }

    return 0;
}
#undef STATS_CODEC
/*
 * Adds a feature code to a read within a slice. For purposes of minimising
 * memory allocations and fragmentation we have one array of features for all
 * reads within the slice. The index of a read's first feature is recorded
 * in r->feature and its feature count in r->nfeature.
 *
 * The feature position is added to the DS_FP statistics as-is for the
 * first feature of a read and as a delta from the read's previous feature
 * otherwise; the feature code is added to the DS_FC statistics.
 *
 * If the shared array cannot be grown, the existing array and its recorded
 * capacity are left untouched.
 *
 * Returns 0 on success
 *        -1 on failure.
 */
static int cram_add_feature(cram_container *c, cram_slice *s,
                            cram_record *r, cram_feature *f) {
    if (s->nfeatures >= s->afeatures) {
        /*
         * Grow through a temporary: on failure realloc leaves the old
         * block allocated, so overwriting s->features directly would
         * leak it and leave afeatures claiming room that does not exist.
         */
        void *grown = realloc(s->features,
                              (s->afeatures ? s->afeatures*2 : 1024)
                              * sizeof(*s->features));
        if (!grown)
            return -1;
        s->features = grown;
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
    }

    if (!r->nfeature++) {
        /* First feature of this read: remember where its run starts. */
        r->feature = s->nfeatures;
        cram_stats_add(c->stats[DS_FP], f->X.pos);
    } else {
        /* nfeature was just incremented, so the previous one is at -2. */
        cram_stats_add(c->stats[DS_FP],
                       f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
    }
    cram_stats_add(c->stats[DS_FC], f->X.code);

    s->features[s->nfeatures++] = *f;

    return 0;
}
/*
 * Records a base that differs from the reference at position pos.
 *
 * When the fd->L2 lookups say the pair can be expressed through the
 * substitution matrix (seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN) an
 * 'X' feature holding the matrix code is stored. Otherwise a 'B' feature
 * with the literal base and quality is stored, and the quality is also
 * appended to the slice's quality block.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_substitution(cram_fd *fd, cram_container *c,
                                 cram_slice *s, cram_record *r,
                                 int pos, char base, char qual, char ref) {
    cram_feature feat;
    const int use_matrix =
        fd->L2[(uc)base] < 4 ||
        (fd->L2[(uc)base] < 5 && fd->L2[(uc)ref] < 4);

    if (!use_matrix) {
        /* Literal base plus quality. */
        feat.B.pos  = pos + 1;
        feat.B.code = 'B';
        feat.B.base = base;
        feat.B.qual = qual;
        cram_stats_add(c->stats[DS_BA], feat.B.base);
        cram_stats_add(c->stats[DS_QS], feat.B.qual);
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
    } else {
        /* Compact substitution code. */
        feat.X.pos  = pos + 1;
        feat.X.code = 'X';
        feat.X.base = fd->cram_sub_matrix[ref & 0x1f][base & 0x1f];
        cram_stats_add(c->stats[DS_BS], feat.X.base);
    }

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Records a 'b' feature: a run of len bases at position pos. The bases are
 * not copied; base is stored as an offset from the start of the slice's
 * seqs_blk data.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_bases(cram_fd *fd, cram_container *c,
                          cram_slice *s, cram_record *r,
                          int pos, int len, char *base) {
    cram_feature feat;

    feat.b.code    = 'b';
    feat.b.pos     = pos + 1;
    feat.b.len     = len;
    feat.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Records a 'B' feature: a literal base and its quality at position pos.
 * Both values are added to their statistics (DS_BA, DS_QS) and the quality
 * is appended to the slice's quality block.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_base(cram_fd *fd, cram_container *c,
                         cram_slice *s, cram_record *r,
                         int pos, char base, char qual) {
    cram_feature feat;

    feat.B.code = 'B';
    feat.B.pos  = pos + 1;
    feat.B.base = base;
    feat.B.qual = qual;

    cram_stats_add(c->stats[DS_BA], base);
    cram_stats_add(c->stats[DS_QS], qual);
    BLOCK_APPEND_CHAR(s->qual_blk, qual);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Records a 'Q' feature: a single quality value at position pos. The value
 * is added to the DS_QS statistics and appended to the slice's quality
 * block.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_quality(cram_fd *fd, cram_container *c,
                            cram_slice *s, cram_record *r,
                            int pos, char qual) {
    cram_feature feat;

    feat.Q.code = 'Q';
    feat.Q.pos  = pos + 1;
    feat.Q.qual = qual;

    cram_stats_add(c->stats[DS_QS], qual);
    BLOCK_APPEND_CHAR(s->qual_blk, qual);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Records a 'D' feature: a deletion of len bases at position pos. The
 * length is added to the DS_DL statistics; base is not used.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
                             int pos, int len, char *base) {
    cram_feature feat;

    feat.D.code = 'D';
    feat.D.pos  = pos + 1;
    feat.D.len  = len;

    cram_stats_add(c->stats[DS_DL], len);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Records an 'S' feature: a soft clip of len bases at position pos.
 *
 * The clipped bases, followed by a NUL byte, are appended to the slice's
 * base_blk for major version 1 and to soft_blk for every other version;
 * the feature stores the offset at which they start. When base is NULL,
 * len 'N' characters are stored instead. The version 1 path previously
 * passed a NULL base straight to BLOCK_APPEND; it now gets the same
 * treatment as the other versions.
 *
 * Returns the result of cram_add_feature().
 */
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
                             int pos, int len, char *base, int version) {
    cram_feature f;
    /* Version 1 keeps soft clips with the other bases; later ones do not. */
    cram_block *dst = (CRAM_MAJOR_VERS(version) == 1)
        ? s->base_blk
        : s->soft_blk;

    f.S.pos = pos+1;
    f.S.code = 'S';
    f.S.len = len;
    f.S.seq_idx = BLOCK_SIZE(dst);

    if (base) {
        BLOCK_APPEND(dst, base, len);
    } else {
        int i;
        for (i = 0; i < len; i++)
            BLOCK_APPEND_CHAR(dst, 'N');
    }
    BLOCK_APPEND_CHAR(dst, '\0');

    return cram_add_feature(c, s, r, &f);
}
/*
 * Adds an 'H' (hard-clip) feature of length 'len' at sequence offset 'pos'
 * (stored 1-based as pos+1) and adds the length to the DS_HC statistics.
 * The 'base' argument is not used.
 *
 * Returns whatever cram_add_feature() returns.
 */
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
                             int pos, int len, char *base) {
    cram_feature feat;

    feat.S.code = 'H';
    feat.S.len  = len;
    feat.S.pos  = pos+1;

    cram_stats_add(c->stats[DS_HC], len);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Adds an 'N' (reference skip) feature of length 'len' at sequence offset
 * 'pos' (stored 1-based as pos+1) and adds the length to the DS_RS
 * statistics.  The 'base' argument is not used.
 *
 * Returns whatever cram_add_feature() returns.
 */
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
                         int pos, int len, char *base) {
    cram_feature feat;

    feat.S.code = 'N';
    feat.S.len  = len;
    feat.S.pos  = pos+1;

    cram_stats_add(c->stats[DS_RS], len);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Adds a 'P' (padding) feature of length 'len' at sequence offset 'pos'
 * (stored 1-based as pos+1) and adds the length to the DS_PD statistics.
 * The 'base' argument is not used.
 *
 * Returns whatever cram_add_feature() returns.
 */
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
                        int pos, int len, char *base) {
    cram_feature feat;

    feat.S.code = 'P';
    feat.S.len  = len;
    feat.S.pos  = pos+1;

    cram_stats_add(c->stats[DS_PD], len);

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Adds an insertion feature at sequence offset 'pos' (stored 1-based as
 * pos+1).
 *
 * A single-base insertion becomes an 'i' feature carrying the base, which
 * is also added to the DS_BA statistics.  Any other length becomes an 'I'
 * feature whose bases are appended, NUL terminated, to s->base_blk.
 * A NULL 'base' is encoded as 'N' characters.
 *
 * Returns whatever cram_add_feature() returns.
 */
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
                              int pos, int len, char *base) {
    cram_feature feat;

    feat.I.pos = pos+1;

    if (len != 1) {
        /* Multi-base insertion: bases live in base_blk */
        feat.I.code = 'I';
        feat.I.len  = len;
        feat.S.seq_idx = BLOCK_SIZE(s->base_blk);
        if (!base) {
            int n;
            for (n = 0; n < len; n++) {
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
            }
        } else {
            BLOCK_APPEND(s->base_blk, base, len);
        }
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
    } else {
        /* Single base insertion: stored inline in the feature */
        char ins = base ? *base : 'N';
        feat.i.code = 'i';
        feat.i.base = ins;
        cram_stats_add(c->stats[DS_BA], ins);
    }

    return cram_add_feature(c, s, r, &feat);
}
/*
 * Encodes auxiliary data (used by the caller for CRAM major version 1).
 *
 * Walks the BAM aux fields of 'b'.  RG:Z, MD:Z and NM are skipped (RG:Z is
 * remembered and returned).  For every other tag the 3-byte tag+type is
 * packed into an integer, recorded in c->tags_used, s->TN and the DS_TN
 * statistics, and the value is copied into s->aux_blk.  String values
 * (Z/H) get a '\t' stop byte after their NUL; B arrays are prefixed with
 * an ITF8 encoded byte length.
 *
 * On return cr->TN_idx, cr->ntags, cr->aux and cr->aux_size describe the
 * data written for this record.
 *
 * Returns the read-group parsed out of the BAM aux fields on success
 *         NULL on failure or no rg present (FIXME)
 */
static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
                                 cram_slice *s, cram_record *cr) {
    char *aux, *tmp, *rg = NULL;
    int aux_size = bam_blk_size(b) -
        ((char *)bam_aux(b) - (char *)&bam_ref(b));

    /* Worst case is 1 nul char on every ??:Z: string, so +33% */
    BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
    tmp = (char *)BLOCK_END(s->aux_blk);

    aux = (char *)bam_aux(b);
    cr->TN_idx = s->nTN;

    while (aux[0] != 0) {
        int32_t i32;
        int r;

        /* RG:Z - remember it, but do not store it as a tag */
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
            rg = &aux[3];
            while (*aux++);
            continue;
        }

        /* MD:Z - dropped */
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
            while (*aux++);
            continue;
        }

        /* NM - dropped; step over tag, type and the integer value */
        if (aux[0] == 'N' && aux[1] == 'M') {
            switch(aux[2]) {
            case 'A': case 'C': case 'c': aux+=4; break;
            /* 16-bit values, as also handled by cram_encode_aux() */
            case 'S': case 's':           aux+=5; break;
            case 'I': case 'i': case 'f': aux+=7; break;
            default:
                fprintf(stderr, "Unhandled type code for NM tag\n");
                return NULL;
            }
            continue;
        }

        cr->ntags++;

        i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
        kh_put(s_i2i, c->tags_used, i32, &r);
        if (-1 == r)
            return NULL;

        if (s->nTN >= s->aTN) {
            /*
             * Grow via a temporary so a failed realloc neither leaks the
             * old array nor leaves s->aTN out of step with s->TN.
             */
            size_t new_aTN = s->aTN ? (size_t)s->aTN*2 : 1024;
            void *new_TN = realloc(s->TN, new_aTN * sizeof(*s->TN));
            if (!new_TN)
                return NULL;
            s->TN  = new_TN;
            s->aTN = new_aTN;
        }
        s->TN[s->nTN++] = i32;
        cram_stats_add(c->stats[DS_TN], i32);

        /* In every case below the 3 byte tag+type is skipped, not copied */
        switch(aux[2]) {
        case 'A': case 'C': case 'c':
            aux+=3;
            *tmp++=*aux++;
            break;

        case 'S': case 's':
            aux+=3;
            *tmp++=*aux++; *tmp++=*aux++;
            break;

        case 'I': case 'i': case 'f':
            aux+=3;
            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
            break;

        case 'd':
            aux+=3;
            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
            break;

        case 'Z': case 'H':
            aux+=3;
            while ((*tmp++=*aux++));
            *tmp++ = '\t'; // stop byte
            break;

        case 'B': {
            int type = aux[3], blen;
            /* Little-endian 32-bit element count following the sub-type */
            uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
                                        (((unsigned char *)aux)[5]<< 8) +
                                        (((unsigned char *)aux)[6]<<16) +
                                        (((unsigned char *)aux)[7]<<24));
            // skip TN field
            aux+=3;

            // We use BYTE_ARRAY_LEN with external length, so store that first
            switch (type) {
            case 'c': case 'C':
                blen = count;
                break;
            case 's': case 'S':
                blen = 2*count;
                break;
            case 'i': case 'I': case 'f':
                blen = 4*count;
                break;
            default:
                fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
                        type);
                return NULL;
            }

            tmp += itf8_put(tmp, blen+5);

            *tmp++=*aux++; // sub-type & length
            *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;

            // The tag data itself
            memcpy(tmp, aux, blen); tmp += blen; aux += blen;
            break;
        }
        default:
            fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
            return NULL;
        }
    }
    cram_stats_add(c->stats[DS_TC], cr->ntags);

    /* Record where this read's aux data sits, then commit the new size */
    cr->aux = BLOCK_SIZE(s->aux_blk);
    cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
    BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
    assert(s->aux_blk->byte <= s->aux_blk->alloc);

    return rg;
}
/*
* Encodes auxiliary data. Largely duplicated from above, but done so to
* keep it simple and avoid a myriad of version ifs.
*
* Returns the read-group parsed out of the BAM aux fields on success
* NULL on failure or no rg present (FIXME)
*/
static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
cram_slice *s, cram_record *cr) {
char *aux, *orig, *tmp, *rg = NULL;
int aux_size = bam_get_l_aux(b);
cram_block *td_b = c->comp_hdr->TD_blk;
int TD_blk_size = BLOCK_SIZE(td_b), new;
char *key;
khint_t k;
/* Worst case is 1 nul char on every ??:Z: string, so +33% */
BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_blk);
orig = aux = (char *)bam_aux(b);
// Copy aux keys to td_b and aux values to s->aux_blk
while (aux - orig < aux_size && aux[0] != 0) {
uint32_t i32;
int r;
if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
rg = &aux[3];
while (*aux++);
continue;
}
if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
while (*aux++);
continue;
}
if (aux[0] == 'N' && aux[1] == 'M') {
switch(aux[2]) {
case 'A': case 'C': case 'c': aux+=4; break;
case 'S': case 's': aux+=5; break;
case 'I': case 'i': case 'f': aux+=7; break;
default:
fprintf(stderr, "Unhandled type code for NM tag\n");
return NULL;
}
continue;
}
BLOCK_APPEND(td_b, aux, 3);
i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
kh_put(s_i2i, c->tags_used, i32, &r);
if (-1 == r)
return NULL;
// BQ:Z
if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') {
char *tmp;
if (!s->aux_BQ_blk)
if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ)))
return NULL;
BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_BQ_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk);
continue;
}
// BD:Z
if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') {
char *tmp;
if (!s->aux_BD_blk)
if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD)))
return NULL;
BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_BD_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk);
continue;
}
// BI:Z
if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') {
char *tmp;
if (!s->aux_BI_blk)
if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI)))
return NULL;
BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_BI_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk);
continue;
}
// OQ:Z:
if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') {
char *tmp;
if (!s->aux_OQ_blk)
if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ)))
return NULL;
BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_OQ_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk);
continue;
}
// FZ:B or ZM:B
if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') ||
(aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) {
int type = aux[3], blen;
uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
(((unsigned char *)aux)[5]<< 8) +
(((unsigned char *)aux)[6]<<16) +
(((unsigned char *)aux)[7]<<24));
char *tmp;
if (!s->aux_FZ_blk)
if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ)))
return NULL;
BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_FZ_blk);
// skip TN field
aux+=3;
// We use BYTE_ARRAY_LEN with external length, so store that first
switch (type) {
case 'c': case 'C':
blen = count;
break;
case 's': case 'S':
blen = 2*count;
break;
case 'i': case 'I': case 'f':
blen = 4*count;
break;
default:
fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
type);
return NULL;
}
blen += 5; // sub-type & length
tmp += itf8_put(tmp, blen);
// The tag data itself
memcpy(tmp, aux, blen); tmp += blen; aux += blen;
BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk);
continue;
}
// Other quality data - {Q2,E2,U2,CQ}:Z and similar
if (((aux[0] == 'Q' && aux[1] == '2') ||
(aux[0] == 'U' && aux[1] == '2') ||
(aux[0] == 'Q' && aux[1] == 'T') ||
(aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') {
char *tmp;
if (!s->aux_oq_blk)
if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq)))
return NULL;
BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_oq_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk);
continue;
}
// Other sequence data - {R2,E2,CS,BC,RT}:Z and similar
if (((aux[0] == 'R' && aux[1] == '2') ||
(aux[0] == 'E' && aux[1] == '2') ||
(aux[0] == 'C' && aux[1] == 'S') ||
(aux[0] == 'B' && aux[1] == 'C') ||
(aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') {
char *tmp;
if (!s->aux_os_blk)
if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os)))
return NULL;
BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_os_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk);
continue;
}
switch(aux[2]) {
case 'A': case 'C': case 'c':
aux+=3;
*tmp++=*aux++;
break;
case 'S': case 's':
aux+=3;
*tmp++=*aux++; *tmp++=*aux++;
break;
case 'I': case 'i': case 'f':
aux+=3;
*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
break;
case 'd':
aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
break;
case 'Z': case 'H':
{
char *tmp;
if (!s->aux_oz_blk)
if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz)))
return NULL;
BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1);
tmp = (char *)BLOCK_END(s->aux_oz_blk);
aux += 3;
while ((*tmp++=*aux++));
*tmp++ = '\t';
BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp -
BLOCK_DATA(s->aux_oz_blk);
}
break;
case 'B': {
int type = aux[3], blen;
uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
(((unsigned char *)aux)[5]<< 8) +
(((unsigned char *)aux)[6]<<16) +
(((unsigned char *)aux)[7]<<24));
// skip TN field
aux+=3;
// We use BYTE_ARRAY_LEN with external length, so store that first
switch (type) {
case 'c': case 'C':
blen = count;
break;
case 's': case 'S':
blen = 2*count;
break;
case 'i': case 'I': case 'f':
blen = 4*count;
break;
default:
fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
type);
return NULL;
}
blen += 5; // sub-type & length
tmp += itf8_put(tmp, blen);
// The tag data itself
memcpy(tmp, aux, blen); tmp += blen; aux += blen;
//cram_stats_add(c->aux_B_stats, blen);
break;
}
default:
fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
return NULL;
}
}
// FIXME: sort BLOCK_DATA(td_b) by char[3] triples
// And and increment TD hash entry
BLOCK_APPEND_CHAR(td_b, 0);
// Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
key = string_ndup(c->comp_hdr->TD_keys,
(char *)BLOCK_DATA(td_b) + TD_blk_size,
BLOCK_SIZE(td_b) - TD_blk_size);
k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
if (new < 0) {
return NULL;
} else if (new == 0) {
BLOCK_SIZE(td_b) = TD_blk_size;
} else {
kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
c->comp_hdr->nTL++;
}
cr->TL = kh_val(c->comp_hdr->TD_hash, k);
cram_stats_add(c->stats[DS_TL], cr->TL);
cr->aux = BLOCK_SIZE(s->aux_blk);
cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
assert(s->aux_blk->byte <= s->aux_blk->alloc);
return rg;
}
/*
 * Moves the writer on to its next slice, flushing the current container
 * first when that is required.
 *
 * Despite the name this is really "next slice": a new container is only
 * created when the current one has used all its slices, or when the
 * reference changes and the container is not in multi-seq mode.
 *
 * Steps:
 *  1. Finalise the header of the current slice, if there is one.
 *  2. If needed, flush the container (threaded or not) and start a new one.
 *  3. Allocate the new slice and seed its header from 'b'.
 *
 * Returns cram_container pointer on success
 *         NULL on failure.
 */
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
    cram_container *c = fd->ctr;
    cram_slice *s;
    int i;
    int need_flush;

    /* First occurrence */
    if (c->curr_ref == -2)
        c->curr_ref = bam_ref(b);

    /* 1. Close off the slice we were filling */
    if (c->slice) {
        s = c->slice;

        if (!c->multi_seq) {
            s->hdr->ref_seq_id    = c->curr_ref;
            s->hdr->ref_seq_start = c->first_base;
            s->hdr->ref_seq_span  = c->last_base - c->first_base + 1;
        } else {
            s->hdr->ref_seq_id    = -2;
            s->hdr->ref_seq_start = 0;
            s->hdr->ref_seq_span  = 0;
        }
        s->hdr->num_records = c->curr_rec;

        /* The first slice defines the container's reference and start */
        if (c->curr_slice == 0) {
            c->ref_seq_id    = s->hdr->ref_seq_id;
            c->ref_seq_start = c->first_base;
        }

        c->curr_slice++;
    }

    /* 2. Flush container */
    need_flush = (c->curr_slice == c->max_slice);
    if (!need_flush)
        need_flush = (bam_ref(b) != c->curr_ref && !c->multi_seq);

    if (need_flush) {
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
        if (fd->verbose)
            fprintf(stderr, "Flush container %d/%d..%d\n",
                    c->ref_seq_id, c->ref_seq_start,
                    c->ref_seq_start + c->ref_seq_span -1);

        /* Encode slices */
        if (!fd->pool) {
            if (-1 == cram_flush_container(fd, c))
                return NULL;

            // Move to sep func, as we need cram_flush_container for
            // the closing phase to flush the partial container.
            i = 0;
            while (i < c->max_slice) {
                cram_free_slice(c->slices[i]);
                c->slices[i] = NULL;
                i++;
            }

            c->slice = NULL;
            c->curr_slice = 0;

            /* Easy approach for purposes of freeing stats */
            cram_free_container(c);
        } else {
            if (-1 == cram_flush_container_mt(fd, c))
                return NULL;
        }

        fd->ctr = cram_new_container(fd->seqs_per_slice,
                                     fd->slices_per_container);
        c = fd->ctr;
        if (!c)
            return NULL;

        c->record_counter = fd->record_counter;
        c->curr_ref = bam_ref(b);
    }

    c->first_base = c->last_base = bam_pos(b)+1;
    c->last_pos = c->first_base;

    /* 3. New slice */
    c->slices[c->curr_slice] = cram_new_slice(MAPPED_SLICE, c->max_rec);
    c->slice = c->slices[c->curr_slice];
    if (!c->slice)
        return NULL;

    if (!c->multi_seq) {
        c->slice->hdr->ref_seq_id = bam_ref(b);
        // wrong for unsorted data, will fix during encoding.
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
        c->slice->last_apos = bam_pos(b)+1;
    } else {
        c->slice->hdr->ref_seq_id = -2;
        c->slice->hdr->ref_seq_start = 0;
        c->slice->last_apos = 1;
    }

    c->curr_rec = 0;

    return c;
}
/*
 * Converts a single bam record into a cram record.
 * Possibly used within a thread.
 *
 * fd    CRAM file descriptor (version, header, no_ref, flag swap table)
 * c     container being built; statistics and first/last base are updated
 * s     slice receiving names, bases, qualities, cigar and features
 * cr    cram record to fill in
 * b     source BAM record
 * rnum  index of 'cr' within s->crecs, used for mate-pair linkage
 *
 * The steps are: encode aux tags and read group; copy flags, position and
 * name; expand the sequence; walk the CIGAR generating features against
 * the reference; append qualities; and finally try to link this read to an
 * earlier mate in the same slice, or mark it detached.
 *
 * Returns 0 on success;
 *        -1 on failure
 */
static int process_one_read(cram_fd *fd, cram_container *c,
                            cram_slice *s, cram_record *cr,
                            bam_seq_t *b, int rnum) {
    int i, fake_qual = -1;
    char *cp, *rg;
    char *ref, *seq, *qual;

    // FIXME: multi-ref containers

    ref = c->ref;
    cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len);

    // Fields to resolve later
    //cr->mate_line;    // index to another cram_record
    //cr->mate_flags;   // MF
    //cr->ntags;        // TC
    cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
    if (CRAM_MAJOR_VERS(fd->version) == 1)
        rg = cram_encode_aux_1_0(fd, b, c, s, cr);
    else
        rg = cram_encode_aux(fd, b, c, s, cr);

    /* Read group, identified earlier */
    if (rg) {
        SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
        cr->rg = brg ? brg->id : -1;
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
        /* No RG tag: use the "UNKNOWN" read group from the header */
        SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
        if (!brg)
            return -1;
        cr->rg = brg->id; // previously left unset before being used below
    } else {
        cr->rg = -1;
    }
    cram_stats_add(c->stats[DS_RG], cr->rg);

    cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
    cr->flags = bam_flag(b);
    if (bam_cigar_len(b) == 0)
        cr->flags |= BAM_FUNMAP;
    cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);

    // Non reference based encoding means storing the bases verbatim as features, which in
    // turn means every base also has a quality already stored.
    if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
        cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES;
    else
        cr->cram_flags = 0;
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags);

    c->num_bases += cr->len;
    cr->apos = bam_pos(b)+1;
    if (c->pos_sorted) {
        if (cr->apos < s->last_apos) {
            /* Positions went backwards: stop delta encoding AP */
            c->pos_sorted = 0;
        } else {
            cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
            s->last_apos = cr->apos;
        }
    } else {
        //cram_stats_add(c->stats[DS_AP], cr->apos);
    }
    /* Branch-free max(): c->max_apos = MAX(c->max_apos, cr->apos) */
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);

    cr->name = BLOCK_SIZE(s->name_blk);
    cr->name_len = bam_name_len(b);
    cram_stats_add(c->stats[DS_RN], cr->name_len);

    BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));

    /*
     * This seqs_ds is largely pointless and it could reuse the same memory
     * over and over.
     * s->base_blk is what we need for encoding.
     */
    cr->seq = BLOCK_SIZE(s->seqs_blk);
    cr->qual = BLOCK_SIZE(s->qual_blk);
    BLOCK_GROW(s->seqs_blk, cr->len+1);
    BLOCK_GROW(s->qual_blk, cr->len);
    seq = cp = (char *)BLOCK_END(s->seqs_blk);

    *seq = 0;
#ifdef ALLOW_UAC
    {
        // Convert seq 2 bases at a time for speed.
        static const uint16_t code2base[256] = {
            15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
            21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
            15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
            21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
            15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
            21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
            15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
            21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
            15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
            21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
            15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
            21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
            15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
            21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
            15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
            21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
            15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
            21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
            15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
            21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
            15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
            21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
            15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
            21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
            15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
            21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
            15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
            21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
            15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
            21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
            15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
            21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
        };

        int l2 = cr->len / 2;
        unsigned char *from = (unsigned char *)bam_seq(b);
        uint16_t *cpi = (uint16_t *)cp;
        cp[0] = 0;
        for (i = 0; i < l2; i++)
            cpi[i] = le_int2(code2base[from[i]]);
        /* Odd length: convert the final base on its own */
        if ((i *= 2) < cr->len)
            cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
    }
#else
    for (i = 0; i < cr->len; i++)
        cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
#endif
    BLOCK_SIZE(s->seqs_blk) += cr->len;

    qual = cp = (char *)bam_qual(b);

    /* Copy and parse */
    if (!(cr->flags & BAM_FUNMAP)) {
        int32_t *cig_to, *cig_from;
        int apos = cr->apos-1, spos = 0;

        cr->cigar       = s->ncigar;
        cr->ncigar      = bam_cigar_len(b);
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
            /*
             * Grow via a temporary so a failed realloc neither leaks the
             * old array nor leaves cigar_alloc out of step with cigar.
             */
            size_t new_alloc = s->cigar_alloc
                ? (size_t)s->cigar_alloc*2 : 1024;
            void *new_cigar = realloc(s->cigar,
                                      new_alloc * sizeof(*s->cigar));
            if (!new_cigar)
                return -1;
            s->cigar = new_cigar;
            s->cigar_alloc = new_alloc;
        }

        cig_to = (int32_t *)s->cigar;
        cig_from = (int32_t *)bam_cigar(b);

        cr->feature = 0;
        cr->nfeature = 0;
        for (i = 0; i < cr->ncigar; i++) {
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
            int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
            cig_to[i] = cig_from[i];

            /* Can also generate events from here for CRAM diffs */

            switch (cig_op) {
                int l;

                // Don't trust = and X ops to be correct.
            case BAM_CMATCH:
            case BAM_CBASE_MATCH:
            case BAM_CBASE_MISMATCH:
                l = 0;
                if (!fd->no_ref && cr->len) {
                    /* Clamp the compared span to the loaded reference */
                    int end = cig_len+apos < c->ref_end
                        ? cig_len : c->ref_end - apos;
                    char *sp = &seq[spos];
                    char *rp = &ref[apos];
                    char *qp = &qual[spos];
                    for (l = 0; l < end; l++) {
                        if (rp[l] != sp[l]) {
                            if (!sp[l])
                                break;
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
                                // Disabled for the time being as it doesn't
                                // seem to gain us much.
                                int ol=l;
                                while (l<end && rp[l] != sp[l])
                                    l++;
                                if (l-ol > 1) {
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
                                                       l-ol, &seq[spos+ol]))
                                        return -1;
                                    l--;
                                } else {
                                    l = ol;
                                    if (cram_add_substitution(fd, c, s, cr,
                                                              spos+l, sp[l],
                                                              qp[l], rp[l]))
                                        return -1;
                                }
                            } else {
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
                                                          sp[l], qp[l], rp[l]))
                                    return -1;
                            }
                        }
                    }
                    spos += l;
                    apos += l;
                }

                if (l < cig_len && cr->len) {
                    if (fd->no_ref) {
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
                            if (cram_add_bases(fd, c, s, cr, spos,
                                               cig_len-l, &seq[spos]))
                                return -1;
                            spos += cig_len-l;
                        } else {
                            for (; l < cig_len && seq[spos]; l++, spos++) {
                                if (cram_add_base(fd, c, s, cr, spos,
                                                  seq[spos], qual[spos]))
                                    return -1;
                            }
                        }
                    } else {
                        /* off end of sequence or non-ref based output */
                        for (; l < cig_len && seq[spos]; l++, spos++) {
                            if (cram_add_base(fd, c, s, cr, spos,
                                              seq[spos], qual[spos]))
                                return -1;
                        }
                    }
                    apos += cig_len;
                } else if (!cr->len) {
                    /* Seq "*" */
                    apos += cig_len;
                    spos += cig_len;
                }
                break;

            case BAM_CDEL:
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
                    return -1;
                apos += cig_len;
                break;

            case BAM_CREF_SKIP:
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
                    return -1;
                apos += cig_len;
                break;

            case BAM_CINS:
                if (cram_add_insertion(c, s, cr, spos, cig_len,
                                       cr->len ? &seq[spos] : NULL))
                    return -1;
                if (fd->no_ref && cr->len) {
                    for (l = 0; l < cig_len; l++, spos++) {
                        if (cram_add_quality(fd, c, s, cr, spos, qual[spos]))
                            return -1;
                    }
                } else {
                    spos += cig_len;
                }
                break;

            case BAM_CSOFT_CLIP:
                if (cram_add_softclip(c, s, cr, spos, cig_len,
                                      cr->len ? &seq[spos] : NULL,
                                      fd->version))
                    return -1;
                if (fd->no_ref &&
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
                    if (cr->len) {
                        for (l = 0; l < cig_len; l++, spos++) {
                            if (cram_add_quality(fd, c, s, cr, spos,
                                                 qual[spos]))
                                return -1;
                        }
                    } else {
                        for (l = 0; l < cig_len; l++, spos++) {
                            if (cram_add_quality(fd, c, s, cr, spos, -1))
                                return -1;
                        }
                    }
                } else {
                    spos += cig_len;
                }
                break;

            case BAM_CHARD_CLIP:
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
                    return -1;
                break;

            case BAM_CPAD:
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
                    return -1;
                break;
            }
        }
        /* Sequence length implied by the CIGAR; used when seq is "*" */
        fake_qual = spos;
        cr->aend = MIN(apos, c->ref_end);
        cram_stats_add(c->stats[DS_FN], cr->nfeature);
    } else {
        // Unmapped
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
        cr->cigar  = 0;
        cr->ncigar = 0;
        cr->nfeature = 0;
        cr->aend = cr->apos;
        for (i = 0; i < cr->len; i++)
            cram_stats_add(c->stats[DS_BA], seq[i]);
        /*
         * No CIGAR to derive a length from.  Without this an unmapped
         * read with seq "*" would reach the block below with
         * fake_qual == -1 and memset() a (size_t)-1 byte region.
         */
        fake_qual = 0;
    }

    /*
     * Append to the qual block now. We do this here as
     * cram_add_substitution() can generate BA/QS events which need to
     * be in the qual block before we append the rest of the data.
     */
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
        /* Special case of seq "*" */
        if (cr->len == 0) {
            cram_stats_add(c->stats[DS_RL], cr->len = fake_qual);
            BLOCK_GROW(s->qual_blk, cr->len);
            cp = (char *)BLOCK_END(s->qual_blk);
            memset(cp, 255, cr->len);
        } else {
            BLOCK_GROW(s->qual_blk, cr->len);
            cp = (char *)BLOCK_END(s->qual_blk);
            char *from = (char *)&bam_qual(b)[0];
            char *to = &cp[0];
            memcpy(to, from, cr->len);
        }
        BLOCK_SIZE(s->qual_blk) += cr->len;
    } else {
        if (cr->len == 0) {
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
            cram_stats_add(c->stats[DS_RL], cr->len);
        }
    }

    /* Now we know apos and aend both, update mate-pair information */
    {
        int new;
        khint_t k;
        /* Primary and secondary alignments use separate name hashes */
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;

        if (cr->flags & BAM_FPAIRED) {
            char *key = string_ndup(s->pair_keys,
                                    (char *)BLOCK_DATA(s->name_blk)+cr->name,
                                    cr->name_len);
            if (!key)
                return -1;

            k = kh_put(m_s2i, s->pair[sec], key, &new);
            if (-1 == new)
                return -1;
            else if (new > 0)
                kh_val(s->pair[sec], k) = rnum;
        } else {
            new = 1;
        }

        if (new == 0) {
            /* Name seen before in this slice: p is the earlier record */
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
            int aleft, aright, sign;

            aleft = MIN(cr->apos, p->apos);
            aright = MAX(cr->aend, p->aend);
            if (cr->apos < p->apos) {
                sign = 1;
            } else if (cr->apos > p->apos) {
                sign = -1;
            } else if (cr->flags & BAM_FREAD1) {
                sign = 1;
            } else {
                sign = -1;
            }

            /*
             * Only link the pair if every mate field can be recomputed
             * exactly from the two records; otherwise store verbatim.
             */

            // This vs p: tlen, matepos, flags
            if (bam_ins_size(b) != sign*(aright-aleft+1))
                goto detached;

            if (MAX(bam_mate_pos(b)+1, 0) != p->apos)
                goto detached;

            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
                ((p->flags & BAM_FUNMAP) != 0))
                goto detached;

            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
                ((p->flags & BAM_FREVERSE) != 0))
                goto detached;

            // p vs this: tlen, matepos, flags
            if (p->tlen != -sign*(aright-aleft+1))
                goto detached;

            if (p->mate_pos != cr->apos)
                goto detached;

            if (((p->flags & BAM_FMUNMAP) != 0) !=
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
                goto detached;

            if (((p->flags & BAM_FMREVERSE) != 0) !=
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
                goto detached;

            // Supplementary reads are just too ill defined
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
                (p->flags & BAM_FSUPPLEMENTARY))
                goto detached;

            /*
             * The fields below are unused when encoding this read as it is
             * no longer detached.  In theory they may get referred to when
             * processing a 3rd or 4th read in this template?, so we set them
             * here just to be sure.
             *
             * They do not need cram_stats_add() calls those as they are
             * not emitted.
             */
            cr->mate_pos = p->apos;
            cr->tlen = sign*(aright-aleft+1);
            cr->mate_flags =
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;

            // Decrement statistics aggregated earlier
            cram_stats_del(c->stats[DS_NP], p->mate_pos);
            cram_stats_del(c->stats[DS_MF], p->mate_flags);
            cram_stats_del(c->stats[DS_TS], p->tlen);
            cram_stats_del(c->stats[DS_NS], p->mate_ref_id);

            /* Similarly we could correct the p-> values too, but these will no
             * longer have any code that refers back to them as the new 'p'
             * for this template is our current 'cr'.
             */

            // Clear detached from cr flags
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
            cram_stats_add(c->stats[DS_CF], cr->cram_flags);

            // Clear detached from p flags and set downstream
            cram_stats_del(c->stats[DS_CF], p->cram_flags);
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM;
            cram_stats_add(c->stats[DS_CF], p->cram_flags);

            /* Distance, in records, from p to this read */
            p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
            cram_stats_add(c->stats[DS_NF], p->mate_line);

            kh_val(s->pair[sec], k) = rnum;
        } else {
        detached:
            /* Derive mate flags from this flag */
            cr->mate_flags = 0;
            if (bam_flag(b) & BAM_FMUNMAP)
                cr->mate_flags |= CRAM_M_UNMAP;
            if (bam_flag(b) & BAM_FMREVERSE)
                cr->mate_flags |= CRAM_M_REVERSE;

            cram_stats_add(c->stats[DS_MF], cr->mate_flags);

            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);

            cr->tlen        = bam_ins_size(b);
            cram_stats_add(c->stats[DS_TS], cr->tlen);

            cr->cram_flags |= CRAM_FLAG_DETACHED;
            cram_stats_add(c->stats[DS_CF], cr->cram_flags);
            cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
        }
    }

    cr->mqual       = bam_map_qual(b);
    cram_stats_add(c->stats[DS_MQ], cr->mqual);

    cr->mate_ref_id = bam_mate_ref(b);

    /* Extend the container's covered range for mapped reads */
    if (!(bam_flag(b) & BAM_FUNMAP)) {
        if (c->first_base > cr->apos)
            c->first_base = cr->apos;

        if (c->last_base < cr->aend)
            c->last_base = cr->aend;
    }

    return 0;
}
/*
 * Write iterator: put BAM format sequences into a CRAM file.
 * We buffer up a container's worth of data at a time.
 *
 * A new slice (and possibly a new container) is started when there is no
 * current slice, the slice is full, or the reference changes.  The record
 * itself is only copied into c->bams here (via bam_copy1/bam_dup), so the
 * caller keeps ownership of 'b'.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
    cram_container *c;

    if (!fd->ctr) {
        fd->ctr = cram_new_container(fd->seqs_per_slice,
                                     fd->slices_per_container);
        if (!fd->ctr)
            return -1;
        fd->ctr->record_counter = fd->record_counter;
    }
    c = fd->ctr;

    if (!c->slice || c->curr_rec == c->max_rec ||
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) {
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);

        /*
         * Start packing slices when we routinely have under 1/4 full.
         *
         * This option isn't available if we choose to embed references
         * since we can only have one per slice.
         */
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
            !fd->embed_ref) {
            if (fd->verbose && !c->multi_seq)
                fprintf(stderr, "Multi-ref enabled for this container\n");
            multi_seq = 1;
        }

        /* Saved now; cram_next_container() may replace the container */
        slice_rec = c->slice_rec;
        curr_rec  = c->curr_rec;

        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) {
            if (NULL == (c = cram_next_container(fd, b))) {
                if (fd->ctr) {
                    // prevent cram_close attempting to flush
                    cram_free_container(fd->ctr);
                    fd->ctr = NULL;
                }
                return -1;
            }
        }

        /*
         * Due to our processing order, some things we've already done we
         * cannot easily undo. So when we first notice we should be packing
         * multiple sequences per container we emit the small partial
         * container as-is and then start a fresh one in a different mode.
         */
        if (multi_seq) {
            fd->multi_seq = 1;
            c->multi_seq = 1;
            c->pos_sorted = 0; // required atm for multi_seq slices

            if (!c->refs_used) {
                pthread_mutex_lock(&fd->ref_lock);
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
                pthread_mutex_unlock(&fd->ref_lock);
                if (!c->refs_used)
                    return -1;
            }
        }

        fd->last_slice = curr_rec - slice_rec;
        c->slice_rec = c->curr_rec;

        // Have we seen this reference before?
        if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
            !fd->unsorted && multi_seq) {

            if (!c->refs_used) {
                pthread_mutex_lock(&fd->ref_lock);
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
                pthread_mutex_unlock(&fd->ref_lock);
                if (!c->refs_used)
                    return -1;
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
                fprintf(stderr, "Unsorted mode enabled\n");
                pthread_mutex_lock(&fd->ref_lock);
                fd->unsorted = 1;
                pthread_mutex_unlock(&fd->ref_lock);
                fd->multi_seq = 1;
            }
        }

        c->curr_ref = bam_ref(b);
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
    }

    if (!c->bams) {
        /* First time through, allocate a set of bam pointers */
        pthread_mutex_lock(&fd->bam_list_lock);
        if (fd->bl) {
            /* Reuse a spare array from the free list */
            spare_bams *spare = fd->bl;
            c->bams = spare->bams;
            fd->bl = spare->next;
            free(spare);
        } else {
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
            if (!c->bams) {
                /* Must not return with bam_list_lock still held */
                pthread_mutex_unlock(&fd->bam_list_lock);
                return -1;
            }
        }
        pthread_mutex_unlock(&fd->bam_list_lock);
    }

    /* Copy or alloc+copy the bam record, for later encoding */
    if (c->bams[c->curr_c_rec]) {
        bam_copy1(c->bams[c->curr_c_rec], b);
    } else {
        c->bams[c->curr_c_rec] = bam_dup(b);
        if (!c->bams[c->curr_c_rec])
            return -1; // do not count a record we failed to store
    }

    c->curr_rec++;
    c->curr_c_rec++;
    fd->record_counter++;

    return 0;
}
htslib-1.2.1/cram/cram_encode.h 0000664 0000000 0000000 00000006363 12464172677 0016362 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* Include cram.h instead.
*
* This is an internal part of the CRAM system and is automatically included
* when you #include cram.h.
*
* Implements the encoding portion of CRAM I/O. Also see
* cram_codecs.[ch] for the actual encoding functions themselves.
*/
#ifndef _CRAM_WRITE_H_
#define _CRAM_WRITE_H_
#ifdef __cplusplus
extern "C" {
#endif
/* ----------------------------------------------------------------------
* CRAM sequence iterators.
*/
/*! Write iterator: put BAM format sequences into a CRAM file.
*
* We buffer up a container's worth of data at a time.
*
* FIXME: break this into smaller pieces.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b);
/* ----------------------------------------------------------------------
* Internal functions
*/
/*! INTERNAL:
* Encodes a compression header block into a generic cram_block structure.
*
* @return
* Returns cram_block ptr on success;
* NULL on failure
*/
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
cram_block_compression_hdr *h);
/*! INTERNAL:
* Encodes a slice compression header.
*
* @return
* Returns cram_block on success;
* NULL on failure
*/
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s);
/*! INTERNAL:
* Encodes all slices in a container into blocks.
*
* @return
* Returns 0 on success;
* -1 on failure
*
* FIXME: separate into encode_container and write_container. Ideally
* we should be able to do read_container / write_container or
* decode_container / encode_container.
*/
int cram_encode_container(cram_fd *fd, cram_container *c);
#ifdef __cplusplus
}
#endif
#endif
htslib-1.2.1/cram/cram_index.c 0000664 0000000 0000000 00000033414 12464172677 0016224 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013-2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The index is a gzipped tab-delimited text file with one line per slice.
* The columns are:
* 1: reference number (0 to N-1, as per BAM ref_id)
* 2: reference position of 1st read in slice (1..?)
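* 3: span of the slice on the reference (the loader adds this to column 2
* to obtain the end position of the slice)
* 4: offset of container start (relative to end of SAM header, so 1st
* container is offset 0).
* 5: landmark of the slice within its container (the c->landmark[] value
* written by cram_index_build).
* 6: size of the slice in bytes.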
*
* In memory, we hold this in a nested containment list. Each list element is
* a cram_index struct. Each element in turn can contain its own list of
* cram_index structs.
*
* Any start..end range which is entirely contained within another (and
* earlier as it is sorted) range will be held within it. This ensures that
* the outer list will never have containments and we can safely do a
* binary search to find the first range which overlaps any given coordinate.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "htslib/hfile.h"
#include "cram/cram.h"
#include "cram/os.h"
#include "cram/zfio.h"
#if 0
/*
 * Debugging aid: prints one index entry followed, recursively, by every
 * entry nested inside it.  Each nesting level is indented by four columns.
 */
static void dump_index_(cram_index *e, int level) {
    int child;
    int width = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);

    /* Pad so that the offsets line up at column 50 where possible */
    printf("%*soffset %"PRId64"\n", MAX(0,50-width), "", e->offset);

    for (child = 0; child < e->nslice; child++)
        dump_index_(&e->e[child], level+1);
}
/*
 * Debugging aid: dumps every top-level entry of fd->index (and everything
 * nested beneath each one) to stdout.
 */
static void dump_index(cram_fd *fd) {
    int ref = 0;

    while (ref < fd->index_sz) {
        dump_index_(&fd->index[ref], 0);
        ref++;
    }
}
#endif
/*
 * Parses a decimal integer from kstring 'k' starting at offset *pos.
 * Leading spaces and tabs are skipped and one leading '-' is accepted.
 *
 * On success *pos is advanced to the first character after the number and
 * the value is stored in *val_p.  On failure neither is modified.
 *
 * Returns 0 on success
 *        -1 on failure (no digits present, or value out of int32_t range)
 */
static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
    int negative = 0;
    /* Accumulate in 64 bits so an over-long digit string is detected
     * rather than overflowing a signed 32-bit value. */
    int64_t val = 0, limit;
    size_t p = *pos;

    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
        p++;

    if (p < k->l && k->s[p] == '-') {
        negative = 1;
        p++;
    }

    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
        return -1;

    /* The magnitude of INT32_MIN is one greater than INT32_MAX */
    limit = negative ? (int64_t)INT32_MAX + 1 : (int64_t)INT32_MAX;

    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
        val = val*10 + (k->s[p++] - '0');
        if (val > limit)
            return -1;
    }

    *pos = p;
    *val_p = (int32_t)(negative ? -val : val);

    return 0;
}
/*
 * 64-bit counterpart of kget_int32: parses a decimal integer from kstring
 * 'k' starting at offset *pos, skipping leading spaces and tabs and
 * accepting one leading '-'.
 *
 * On success *pos is advanced to the first character after the number and
 * the value is stored in *val_p.  On failure neither is modified.
 *
 * Returns 0 on success
 *        -1 on failure (no digits present, or value out of int64_t range)
 */
static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
    int negative = 0;
    /* Unsigned accumulator: the range check below is done before each
     * multiply so the arithmetic can never wrap. */
    uint64_t val = 0, limit;
    size_t p = *pos;

    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
        p++;

    if (p < k->l && k->s[p] == '-') {
        negative = 1;
        p++;
    }

    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
        return -1;

    /* The magnitude of INT64_MIN is one greater than INT64_MAX */
    limit = negative ? (uint64_t)INT64_MAX + 1 : (uint64_t)INT64_MAX;

    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
        uint64_t digit = (uint64_t)(k->s[p++] - '0');

        /* Equivalent to val*10 + digit > limit, without overflowing */
        if (val > (limit - digit) / 10)
            return -1;
        val = val*10 + digit;
    }

    *pos = p;
    if (negative)
        *val_p = (val == limit) ? INT64_MIN : -(int64_t)val;
    else
        *val_p = (int64_t)val;

    return 0;
}
/*
 * Loads a CRAM .crai index into memory.
 *
 * The index file name is formed by appending ".crai" to 'fn'.  The file is
 * read whole, decompressed if it starts with the gzip magic bytes, and then
 * parsed a line at a time into the nested containment list rooted at
 * fd->index.  The top-level entry for reference N lives at fd->index[N+1],
 * so that refid -1 occupies slot 0.
 *
 * Returns 0 for success (also when an index has already been loaded)
 *        -1 for failure
 *
 * NOTE(review): on failure fd->index is left allocated (possibly partially
 * filled), exactly as before, so a later call returns 0 via the "already
 * loaded" check.  Confirm whether callers rely on this before changing it.
 */
int cram_index_load(cram_fd *fd, const char *fn) {
    char fn2[PATH_MAX];
    char buf[65536];
    ssize_t len;
    kstring_t kstr = {0};
    hFILE *fp;
    cram_index *idx;
    cram_index **idx_stack = NULL, *ep, e;
    int idx_stack_alloc = 0, idx_stack_ptr = 0;
    size_t pos = 0;
    int fn_len;

    /* Check if already loaded */
    if (fd->index)
        return 0;

    /* Form the file name first and refuse names that do not fit, rather
     * than overflowing fn2. */
    fn_len = snprintf(fn2, sizeof(fn2), "%s.crai", fn);
    if (fn_len < 0 || (size_t)fn_len >= sizeof(fn2))
        return -1;

    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
    if (!fd->index)
        return -1;

    /* Slot 0 is the catch-all root used for refid -1 */
    idx = &fd->index[0];
    idx->refid = -1;
    idx->start = INT_MIN;
    idx->end   = INT_MAX;

    /* idx_stack tracks the chain of currently "open" containing entries */
    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
    if (!idx_stack)
        goto fail;
    idx_stack[idx_stack_ptr] = idx;

    if (!(fp = hopen(fn2, "r"))) {
        perror(fn2);
        goto fail;
    }

    // Load the file into memory
    while ((len = hread(fp, buf, sizeof(buf))) > 0) {
        if (kputsn(buf, len, &kstr) < 0) {
            len = -1; /* out of memory; handled as a read failure */
            break;
        }
    }

    if (len < 0 || kstr.l < 2) {
        hclose(fp); /* already failing, so the close status is irrelevant */
        goto fail;
    }

    if (hclose(fp))
        goto fail;

    // Uncompress if required
    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
        size_t l;
        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);

        free(kstr.s);
        kstr.s = NULL; /* avoid a double free on the failure path */
        if (!s)
            goto fail;

        kstr.s = s;
        kstr.l = l;
        kstr.m = l; // conservative estimate of the size allocated
        if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
            goto fail;
    }

    // Parse it line at a time
    do {
        /* 1.1 layout */
        if (kget_int32(&kstr, &pos, &e.refid) == -1 ||
            kget_int32(&kstr, &pos, &e.start) == -1 ||
            kget_int32(&kstr, &pos, &e.end) == -1 ||
            kget_int64(&kstr, &pos, &e.offset) == -1 ||
            kget_int32(&kstr, &pos, &e.slice) == -1 ||
            kget_int32(&kstr, &pos, &e.len) == -1)
            goto fail;

        /* Column 3 holds a span; convert it to an inclusive end position */
        e.end += e.start-1;
        //printf("%d/%d..%d\n", e.refid, e.start, e.end);

        /* refid+2 below must not overflow either */
        if (e.refid < -1 || e.refid > INT_MAX - 2) {
            fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
            goto fail;
        }

        if (e.refid != idx->refid) {
            if (fd->index_sz < e.refid+2) {
                size_t index_end = fd->index_sz * sizeof(*fd->index);
                size_t new_sz = (size_t)e.refid + 2;
                cram_index *new_index;

                new_index = realloc(fd->index, new_sz * sizeof(*fd->index));
                if (!new_index)
                    goto fail;

                fd->index = new_index;
                fd->index_sz = e.refid+2;
                /* Zero the newly added top-level slots */
                memset(((char *)fd->index) + index_end, 0,
                       fd->index_sz * sizeof(*fd->index) - index_end);
            }
            idx = &fd->index[e.refid+1];
            idx->refid = e.refid;
            idx->start = INT_MIN;
            idx->end   = INT_MAX;
            idx->nslice = idx->nalloc = 0;
            idx->e = NULL;
            idx_stack[(idx_stack_ptr = 0)] = idx;
        }

        /* Pop until we reach an entry that fully contains this one; the
         * root spans INT_MIN..INT_MAX so the search always terminates. */
        while (!(e.start >= idx->start && e.end <= idx->end)) {
            idx = idx_stack[--idx_stack_ptr];
        }

        // Now contains, so append
        if (idx->nslice+1 >= idx->nalloc) {
            int new_alloc = idx->nalloc ? idx->nalloc*2 : 16;
            cram_index *new_e = realloc(idx->e, new_alloc * sizeof(*idx->e));

            if (!new_e)
                goto fail;
            idx->nalloc = new_alloc;
            idx->e = new_e;
        }

        e.nalloc = e.nslice = 0; e.e = NULL;
        *(ep = &idx->e[idx->nslice++]) = e;
        idx = ep;

        /* Push the new entry: later lines may nest inside it */
        if (++idx_stack_ptr >= idx_stack_alloc) {
            cram_index **new_stack;

            new_stack = realloc(idx_stack,
                                2 * idx_stack_alloc * sizeof(*idx_stack));
            if (!new_stack)
                goto fail;
            idx_stack_alloc *= 2;
            idx_stack = new_stack;
        }
        idx_stack[idx_stack_ptr] = idx;

        /* Skip the remainder of the line */
        while (pos < kstr.l && kstr.s[pos] != '\n')
            pos++;
        pos++;
    } while (pos < kstr.l);

    free(idx_stack);
    free(kstr.s);

    // dump_index(fd);

    return 0;

 fail:
    free(kstr.s);
    free(idx_stack);
    return -1;
}
/*
 * Frees the array of entries nested inside 'e', descending into each one
 * first.  The entry 'e' itself is not freed.
 */
static void cram_index_free_recurse(cram_index *e) {
    int child;

    if (!e->e)
        return;

    for (child = 0; child < e->nslice; child++)
        cram_index_free_recurse(e->e + child);

    free(e->e);
}
/*
 * Releases the in-memory index attached to 'fd', if there is one, and
 * resets fd->index to NULL.
 */
void cram_index_free(cram_fd *fd) {
    int ref;

    if (fd->index) {
        for (ref = 0; ref < fd->index_sz; ref++)
            cram_index_free_recurse(fd->index + ref);

        free(fd->index);
        fd->index = NULL;
    }
}
/*
 * Searches the index for the first slice overlapping a reference ID
 * and position, or one immediately preceding it if none is found in
 * the index to overlap this position. (Our index may have missing
 * entries, but we require at least one per reference.)
 *
 * If the index finds multiple slices overlapping this position we
 * return the first one only. Subsequent calls should specify
 * "from" as the last slice we checked to find the next one. Otherwise
 * set "from" to be NULL to find the first one.
 *
 * Returns the cram_index pointer on success
 *         NULL on failure (refid outside the loaded index, or nothing
 *         is indexed beneath the entry being searched)
 */
cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
                             cram_index *from) {
    int i, j, k;
    cram_index *e;

    if (refid+1 < 0 || refid+1 >= fd->index_sz)
        return NULL;

    if (!from)
        from = &fd->index[refid+1];

    /* Nothing indexed here: without this guard the code below would
     * read from->e[-1] (or dereference a NULL array). */
    if (!from->e || from->nslice <= 0)
        return NULL;

    /* Bound the search by the list actually being searched */
    i = 0, j = from->nslice-1;

    /* Binary search on (refid, start) */
    for (k = j/2; k != i; k = (j-i)/2 + i) {
        if (from->e[k].refid > refid) {
            j = k;
            continue;
        }

        if (from->e[k].refid < refid) {
            i = k;
            continue;
        }

        if (from->e[k].start >= pos) {
            j = k;
            continue;
        }

        if (from->e[k].start < pos) {
            i = k;
            continue;
        }
    }

    // i==j or i==j-1. Check if j is better.
    if (from->e[j].start < pos && from->e[j].refid == refid)
        i = j;

    /* The above found *a* bin overlapping, but not necessarily the first */
    while (i > 0 && from->e[i-1].end >= pos)
        i--;

    /* Special case for matching a start pos */
    if (i+1 < from->nslice &&
        from->e[i+1].start == pos &&
        from->e[i+1].refid == refid)
        i++;

    e = &from->e[i];

    return e;
}
/*
 * Skips to a container overlapping the start coordinate listed in
 * cram_range.
 *
 * In theory cram_index_query would be called once per slice overlapping
 * the range, but slices may be absent from the index.  So only the
 * left-most slice is looked up here; reading then proceeds from that
 * point, with slices and/or whole containers that do not overlap the
 * cram_range being skipped rather than decoded.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
    // Ideally use an index, so see if we have one.
    cram_index *hit = cram_index_query(fd, r->refid, r->start, NULL);

    if (!hit) {
        fprintf(stderr, "Unknown reference ID. Missing from index?\n");
        return -1;
    }

    /* Try an absolute seek first; if that fails, retry relative to the
     * current position using the offset less fd->first_container. */
    if (cram_seek(fd, hit->offset, SEEK_SET) != 0 &&
        cram_seek(fd, hit->offset - fd->first_container, SEEK_CUR) != 0)
        return -1;

    /* Drop any container held from before the seek */
    if (fd->ctr) {
        cram_free_container(fd->ctr);
        fd->ctr = NULL;
        fd->ooc = 0;
    }

    return 0;
}
/*
 * A specialised form of cram_index_build (below) that deals with slices
 * having multiple references in this (ref_id -2). In this scenario we
 * decode the slice to look at the RI data series instead.
 *
 * One index line is written to 'fp' for each run of consecutive records
 * sharing the same reference id; every line carries the same container
 * offset (cpos), landmark and slice size (sz).
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int cram_index_build_multiref(cram_fd *fd,
                                     cram_container *c,
                                     cram_slice *s,
                                     zfp *fp,
                                     off_t cpos,
                                     int32_t landmark,
                                     int sz) {
    int i, ref = -2, ref_start = 0, ref_end;
    char buf[1024];

    if (0 != cram_decode_slice(fd, c, s, fd->header))
        return -1;

    ref_end = INT_MIN;
    for (i = 0; i < s->hdr->num_records; i++) {
        /* Still within the current run: just extend its end */
        if (s->crecs[i].ref_id == ref) {
            if (ref_end < s->crecs[i].aend)
                ref_end = s->crecs[i].aend;
            continue;
        }

        /* Reference changed: flush the run that has just finished */
        if (ref != -2) {
            sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                    ref, ref_start, ref_end - ref_start + 1,
                    (int64_t)cpos, landmark, sz);
            zfputs(buf, fp);
        }

        /* Start a new run.  The end must be seeded from this record too,
         * otherwise the first (or only) record of a run is left out of
         * the span and a one-record run yields a span based on INT_MIN. */
        ref = s->crecs[i].ref_id;
        ref_start = s->crecs[i].apos;
        ref_end = s->crecs[i].aend;
    }

    /* Flush the final run */
    if (ref != -2) {
        sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                ref, ref_start, ref_end - ref_start + 1,
                (int64_t)cpos, landmark, sz);
        zfputs(buf, fp);
    }

    return 0;
}
/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file. Internally we
 * add ".crai" to this to get the index filename.
 *
 * Every container is read in turn and one line is written per slice
 * (or, for multi-reference slices, one line per reference within it).
 *
 * Returns 0 on success
 *        -1 on failure; the index file is closed and the container being
 *           processed is freed on every failure path.
 */
int cram_index_build(cram_fd *fd, const char *fn_base) {
    cram_container *c;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    /* Room is needed for ".crai" plus the terminating NUL */
    if (strlen(fn_base) > PATH_MAX-6)
        return -1;

    sprintf(fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            goto fail;
        }

        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            goto fail;
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            goto fail;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd)))
                goto fail;

            /* Number of bytes the slice occupies in the file */
            sz = (int)(htell(fd->fp) - spos);

            if (s->hdr->ref_seq_id == -2) {
                /* Multi-reference slice: needs decoding to be indexed */
                if (0 != cram_index_build_multiref(fd, c, s, fp,
                                                   cpos, c->landmark[j],
                                                   sz)) {
                    cram_free_slice(s);
                    goto fail;
                }
            } else {
                sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                        s->hdr->ref_seq_id, s->hdr->ref_seq_start,
                        s->hdr->ref_seq_span, (int64_t)cpos,
                        c->landmark[j], sz);
                zfputs(buf, fp);
            }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
    }

    if (fd->err) {
        zfclose(fp);
        return -1;
    }

    return zfclose(fp);

 fail:
    /* Only reached from inside the loop, so 'c' is a live container */
    cram_free_container(c);
    zfclose(fp);
    return -1;
}
htslib-1.2.1/cram/cram_index.h 0000664 0000000 0000000 00000006213 12464172677 0016226 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CRAM_INDEX_H_
#define _CRAM_INDEX_H_
#ifdef __cplusplus
extern "C" {
#endif
/*
* Loads a CRAM .crai index into memory.
* Returns 0 for success
* -1 for failure
*/
int cram_index_load(cram_fd *fd, const char *fn);
void cram_index_free(cram_fd *fd);
/*
* Searches the index for the first slice overlapping a reference ID
* and position.
*
* Returns the cram_index pointer on success
* NULL on failure
*/
cram_index *cram_index_query(cram_fd *fd, int refid, int pos, cram_index *frm);
/*
* Skips to a container overlapping the start coordinate listed in
* cram_range.
*
* Returns 0 on success
* -1 on failure
*/
int cram_seek_to_refpos(cram_fd *fd, cram_range *r);
void cram_index_free(cram_fd *fd);
/*
* Skips to a container overlapping the start coordinate listed in
* cram_range.
*
* In theory we call cram_index_query multiple times, once per slice
* overlapping the range. However slices may be absent from the index
* which makes this problematic. Instead we find the left-most slice
* and then read from then on, skipping decoding of slices and/or
* whole containers when they don't overlap the specified cram_range.
*
* Returns 0 on success
* -1 on failure
*/
int cram_seek_to_refpos(cram_fd *fd, cram_range *r);
/*
* Builds an index file.
*
* fd is a newly opened cram file that we wish to index.
* fn_base is the filename of the associated CRAM file. Internally we
* add ".crai" to this to get the index filename.
*
* Returns 0 on success
* -1 on failure
*/
int cram_index_build(cram_fd *fd, const char *fn_base);
#ifdef __cplusplus
}
#endif
#endif
htslib-1.2.1/cram/cram_io.c 0000664 0000000 0000000 00000310052 12464172677 0015520 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CRAM I/O primitives.
*
* - ITF8 encoding and decoding.
* - Block based I/O
* - Zlib inflating and deflating (memory)
* - CRAM basic data structure reading and writing
* - File opening / closing
* - Reference sequence handling
*/
/*
* TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
* a way to return errors for when malloc fails.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#ifdef HAVE_LIBBZ2
#include
#endif
#ifdef HAVE_LIBLZMA
#include
#endif
#include
#include
#include
#include
#include "cram/cram.h"
#include "cram/os.h"
#include "cram/md5.h"
#include "cram/open_trace_file.h"
#include "cram/rANS_static.h"
//#define REF_DEBUG
#ifdef REF_DEBUG
#include
#define gettid() (int)syscall(SYS_gettid)
#define RP(...) fprintf (stderr, __VA_ARGS__)
#else
#define RP(...)
#endif
#include "htslib/hfile.h"
#include "htslib/bgzf.h"
#include "htslib/faidx.h"
#define TRIAL_SPAN 50
#define NTRIALS 3
/* ----------------------------------------------------------------------
* ITF8 encoding and decoding.
*
* Also see the itf8_get and itf8_put macros in cram_io.h
*/
/*
 * Reads an integer in ITF-8 encoding from fd->fp and stores it in
 * *val_p.
 *
 * The top four bits of the first byte select the number of continuation
 * bytes (0 to 4).  In the 5-byte form only the low four bits of the final
 * byte are used.
 *
 * Returns the number of bytes read on success
 *        -1 on failure (end of file or read error on any byte of the
 *           value, in which case *val_p is not modified)
 */
int itf8_decode(cram_fd *fd, int32_t *val_p) {
    /* Continuation byte count, indexed by the first byte's top nibble */
    static const int nbytes[16] = {
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
        2,2,                                            // 1100xxxx - 1101xxxx
        3,                                              // 1110xxxx
        4,                                              // 1111xxxx
    };

    /* Mask selecting the payload bits of the first byte */
    static const int nbits[16] = {
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
        0x0f,                                           // 1110xxxx
        0x0f,                                           // 1111xxxx
    };

    /* Unsigned accumulator so no shift can reach the sign bit */
    uint32_t val;
    int c, i, extra;

    c = hgetc(fd->fp);
    if (c < 0)
        return -1;

    extra = nbytes[c>>4];
    val = (uint32_t)(c & nbits[c>>4]);

    for (i = 0; i < extra; i++) {
        c = hgetc(fd->fp);
        if (c < 0)
            return -1; /* truncated value */

        if (extra == 4 && i == 3)
            val = (val<<4) | (uint32_t)(c & 0x0f); // really 3.5 more bytes
        else
            val = (val<<8) | (uint32_t)(c & 0xff);
    }

    *val_p = (int32_t)val;

    return extra + 1;
}
/*
 * Converts 'val' to ITF-8 and writes the resulting bytes to fd->fp.
 *
 * Returns 0 on success
 *        -1 on failure (not all bytes were written)
 */
int itf8_encode(cram_fd *fd, int32_t val) {
    char encoded[5];
    int nbytes = itf8_put(encoded, val);

    if (hwrite(fd->fp, encoded, nbytes) != nbytes)
        return -1;

    return 0;
}
#ifndef ITF8_MACROS
/*
 * Decodes an ITF-8 encoded integer from memory at 'cp' and stores it in
 * *val_p.  Between one and five bytes are read, as selected by the top
 * bits of cp[0]; in the 5-byte form only the low four bits of the last
 * byte are used.
 *
 * Returns the number of bytes consumed (1 to 5).
 */
int itf8_get(char *cp, int32_t *val_p) {
    const unsigned char *up = (const unsigned char *)cp;
    /* Work in uint32_t: shifting a promoted int by 24 or 28 bits would
     * shift into or past the sign bit, which is undefined behaviour. */
    uint32_t b0 = up[0];
    uint32_t val;
    int nbytes;

    if (b0 < 0x80) {
        val = b0;
        nbytes = 1;
    } else if (b0 < 0xc0) {
        val = ((b0 << 8) | up[1]) & 0x3fff;
        nbytes = 2;
    } else if (b0 < 0xe0) {
        val = ((b0 << 16) | ((uint32_t)up[1] << 8) | up[2]) & 0x1fffff;
        nbytes = 3;
    } else if (b0 < 0xf0) {
        val = ((b0 << 24) | ((uint32_t)up[1] << 16) |
               ((uint32_t)up[2] << 8) | up[3]) & 0x0fffffff;
        nbytes = 4;
    } else {
        val = ((b0 & 0x0f) << 28) | ((uint32_t)up[1] << 20) |
              ((uint32_t)up[2] << 12) | ((uint32_t)up[3] << 4) |
              (up[4] & 0x0f);
        nbytes = 5;
    }

    *val_p = (int32_t)val;

    return nbytes;
}
/*
 * Writes 'val' to memory at 'cp' using the ITF-8 encoding.
 *
 * The shortest form able to hold the value is chosen; values with bits
 * set above bit 27 (which includes all negative values) take five bytes.
 *
 * Returns the number of bytes stored, which is at most 5.
 */
int itf8_put(char *cp, int32_t val) {
    if ((val & ~0x0000007f) == 0) {        // 1 byte
        cp[0] = val;
        return 1;
    }

    if ((val & ~0x00003fff) == 0) {        // 2 byte
        cp[0] = (val >> 8 ) | 0x80;
        cp[1] = val & 0xff;
        return 2;
    }

    if ((val & ~0x001fffff) == 0) {        // 3 byte
        cp[0] = (val >> 16) | 0xc0;
        cp[1] = (val >> 8 ) & 0xff;
        cp[2] = val & 0xff;
        return 3;
    }

    if ((val & ~0x0fffffff) == 0) {        // 4 byte
        cp[0] = (val >> 24) | 0xe0;
        cp[1] = (val >> 16) & 0xff;
        cp[2] = (val >> 8 ) & 0xff;
        cp[3] = val & 0xff;
        return 4;
    }

    // 5 byte: the final byte carries only the low four bits
    cp[0] = 0xf0 | ((val >> 28) & 0xff);
    cp[1] = (val >> 20) & 0xff;
    cp[2] = (val >> 12) & 0xff;
    cp[3] = (val >> 4 ) & 0xff;
    cp[4] = val & 0x0f;
    return 5;
}
#endif
/*
 * 64-bit itf8 variant.
 *
 * Stores 'val' at 'cp' in the shortest form able to hold it.  A form of
 * n bytes (n <= 8) carries 7*n value bits, with the first byte starting
 * with n-1 one bits followed by a zero bit.  Anything larger, including
 * all negative values, is written as a 0xff byte followed by the eight
 * bytes of the value, most significant first.
 *
 * Returns the number of bytes stored (1 to 9).
 */
int ltf8_put(char *cp, int64_t val) {
    int nbytes, i;

    /* Find the shortest form whose payload can hold val */
    for (nbytes = 1; nbytes <= 8; nbytes++) {
        if (!(val & ~((1LL << (7*nbytes)) - 1)))
            break;
    }

    if (nbytes == 9) {
        cp[0] = 0xff;
    } else {
        /* Length prefix plus the most significant bits of the value */
        int prefix = (0xff << (9 - nbytes)) & 0xff;
        cp[0] = (val >> (8*(nbytes-1))) | prefix;
    }

    /* Remaining bytes, most significant first */
    for (i = 1; i < nbytes; i++)
        cp[i] = (val >> (8*(nbytes-1-i))) & 0xff;

    return nbytes;
}
/*
 * Decodes a 64-bit variable length integer, as stored by ltf8_put, from
 * memory at 'cp' into *val_p.
 *
 * The number of leading one bits in cp[0] selects the length.  In the 8
 * and 9 byte forms the first byte carries no value bits.
 *
 * Returns the number of bytes consumed (1 to 9).
 */
int ltf8_get(char *cp, int64_t *val_p) {
    /* cp[0] below lead_limit[n-1] means an n byte encoding */
    static const unsigned char lead_limit[8] = {
        0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
    };
    const unsigned char *up = (const unsigned char *)cp;
    uint64_t acc = 0;
    int nbytes, i;

    for (nbytes = 1; nbytes <= 8; nbytes++) {
        if (up[0] < lead_limit[nbytes-1])
            break;
    }
    /* nbytes is 9 here when up[0] == 0xff */

    /* Gather the bytes big-endian, leaving out a pure-prefix first byte */
    for (i = (nbytes >= 8) ? 1 : 0; i < nbytes; i++)
        acc = (acc << 8) | up[i];

    /* Strip the length prefix bits from the shorter forms */
    if (nbytes < 9)
        acc &= (1ULL << (7*nbytes)) - 1;

    *val_p = acc;

    return nbytes;
}
/*
 * Reads a 64-bit variable length integer, in the format produced by
 * ltf8_put, from fd->fp and stores it in *val_p.
 *
 * Returns the number of bytes read on success (1 to 9)
 *        -1 on failure (end of file or read error on any byte of the
 *           value, in which case *val_p is not modified)
 */
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
    /* First byte below lead_limit[n-1] means an n byte encoding */
    static const unsigned char lead_limit[8] = {
        0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
    };
    /* Unsigned accumulator: in the 8 and 9 byte forms the prefix byte is
     * shifted up to or beyond bit 63, which overflows a signed type. */
    uint64_t val;
    int c, nbytes, i;

    c = hgetc(fd->fp);
    if (c < 0)
        return -1;

    for (nbytes = 1; nbytes <= 8; nbytes++) {
        if (c < lead_limit[nbytes-1])
            break;
    }
    /* nbytes is 9 here when the first byte is 0xff */

    val = (unsigned char)c;
    for (i = 1; i < nbytes; i++) {
        c = hgetc(fd->fp);
        if (c < 0)
            return -1; /* truncated value */
        val = (val << 8) | (unsigned char)c;
    }

    /* Strip the length prefix bits; the 9 byte form has none left */
    if (nbytes < 9)
        val &= (1ULL << (7*nbytes)) - 1;

    *val_p = (int64_t)val;

    return nbytes;
}
/*
 * Appends 'val', encoded as ITF8, to the end of block 'blk'.
 * Not the fastest route, so avoid it for high-volume data.
 *
 * Returns the number of bytes written
 */
int itf8_put_blk(cram_block *blk, int val) {
    char encoded[5];
    int nbytes = itf8_put(encoded, val);

    BLOCK_APPEND(blk, encoded, nbytes);

    return nbytes;
}
/*
 * Reads four bytes from fd->fp and stores them in *val after passing
 * them through le_int4() (32-bit little endian on disk).
 *
 * Returns the number of bytes read on success
 *        -1 on failure
 */
int int32_decode(cram_fd *fd, int32_t *val) {
    int32_t raw;

    if (hread(fd->fp, &raw, 4) != 4)
        return -1;

    *val = le_int4(raw);

    return 4;
}
/*
 * Passes 'val' through le_int4() and writes the resulting four bytes to
 * fd->fp (32-bit little endian on disk).
 *
 * Returns the number of bytes written on success
 *        -1 on failure
 */
int int32_encode(cram_fd *fd, int32_t val) {
    int32_t le = le_int4(val);

    return hwrite(fd->fp, &le, 4) == 4 ? 4 : -1;
}
/*
 * As int32_decode, but reading from a block instead of a cram_fd.
 *
 * Takes four bytes, least significant first, from the block's current
 * byte position and advances that position by four.
 *
 * Returns 4 on success
 *        -1 if fewer than four bytes remain before uncomp_size
 */
int int32_get(cram_block *b, int32_t *val) {
    int32_t v;

    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
        return -1;

    v  =  b->data[b->byte  ];
    v |= (b->data[b->byte+1] <<  8);
    v |= (b->data[b->byte+2] << 16);
    v |= (b->data[b->byte+3] << 24);
    *val = v;

    BLOCK_SIZE(b) += 4;

    return 4;
}
/*
 * As int32_encode, but writing to a block instead of a cram_fd.
 *
 * Appends the four bytes of 'val' to the block, least significant first.
 *
 * Returns 0 if the block has a data buffer after the append
 *        -1 otherwise
 */
int int32_put(cram_block *b, int32_t val) {
    unsigned char le[4];
    int i;

    for (i = 0; i < 4; i++)
        le[i] = (val >> (8*i)) & 0xff;

    BLOCK_APPEND(b, le, 4);

    return b->data ? 0 : -1;
}
/* ----------------------------------------------------------------------
* zlib compression code - from Gap5's tg_iface_g.c
* zlib_mem_deflate is static here as it is only needed by the block
* compression code (cram_compress_block and cram_uncompress_block are the
* external interface).  zlib_mem_inflate is not static because
* cram_index_load also uses it to decompress .crai index files.
*/
/*
 * Inflates 'csize' bytes of zlib or gzip compressed data held at 'cdata'
 * (the format is auto-detected via the "15 + 32" window bits value).
 *
 * Returns a malloc()ed buffer holding the uncompressed data, with its
 *         length stored in *size; the caller must free() it.
 *         NULL on failure (out of memory, or zlib reported an error), in
 *         which case *size is not modified.
 */
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
    z_stream s;
    unsigned char *data = NULL; /* Uncompressed output */
    int data_alloc = 0;
    int err;

    /* Starting point at uncompressed size, and scale after that */
    data = malloc(data_alloc = csize*1.2+100);
    if (!data)
        return NULL;

    /* Initialise zlib stream */
    s.zalloc = Z_NULL; /* use default allocation functions */
    s.zfree  = Z_NULL;
    s.opaque = Z_NULL;
    s.next_in  = (unsigned char *)cdata;
    s.avail_in = csize;
    s.total_in = 0;
    s.next_out  = data;
    s.avail_out = data_alloc;
    s.total_out = 0;

    //err = inflateInit(&s);
    err = inflateInit2(&s, 15 + 32);
    if (err != Z_OK) {
        fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
        free(data);
        return NULL;
    }

    /* Decode to 'data' array */
    for (;s.avail_in;) {
        unsigned char *data_tmp;
        int alloc_inc;

        s.next_out = &data[s.total_out];
        err = inflate(&s, Z_NO_FLUSH);
        if (err == Z_STREAM_END)
            break;

        if (err != Z_OK) {
            /* Corrupt input: fail rather than hand back partial data
             * as though it were a complete result. */
            fprintf(stderr, "zlib inflate error: %s\n", s.msg);
            inflateEnd(&s);
            free(data);
            return NULL;
        }

        /* More to come, so realloc based on growth so far */
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
        data_tmp = realloc(data, data_alloc += alloc_inc);
        if (!data_tmp) {
            inflateEnd(&s); /* release zlib's internal state too */
            free(data);
            return NULL;
        }
        data = data_tmp;
        s.avail_out += alloc_inc;
    }
    inflateEnd(&s);

    *size = s.total_out;
    return (char *)data;
}
/*
 * Compresses 'size' bytes at 'data' into a gzip stream (window bits
 * "15|16") using the given zlib compression level and strategy.
 *
 * The output buffer is sized once at size*1.05+100 bytes and is never
 * grown; running out of room is treated as a failure.
 *
 * Returns a malloc()ed buffer holding the compressed data, with its
 *         length stored in *cdata_size; the caller must free() it.
 *         NULL on failure (out of memory, deflateInit2 failure, or the
 *         output buffer filling up).
 *
 * NOTE(review): an error from deflate() inside the loop, or from the final
 * Z_FINISH call, is only reported on stderr and the buffer is still
 * returned, as before.  Confirm whether callers depend on that.
 */
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
                              int level, int strat) {
    z_stream s;
    unsigned char *cdata = NULL; /* Compressed output */
    int cdata_alloc = 0;
    int cdata_pos = 0;
    int err;

    cdata = malloc(cdata_alloc = size*1.05+100);
    if (!cdata)
        return NULL;
    cdata_pos = 0;

    /* Initialise zlib stream */
    s.zalloc = Z_NULL; /* use default allocation functions */
    s.zfree  = Z_NULL;
    s.opaque = Z_NULL;
    s.next_in  = (unsigned char *)data;
    s.avail_in = size;
    s.total_in = 0;
    s.next_out  = cdata;
    s.avail_out = cdata_alloc;
    s.total_out = 0;
    s.data_type = Z_BINARY;

    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
    if (err != Z_OK) {
        fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
        free(cdata);
        return NULL;
    }

    /* Encode to 'cdata' array */
    for (;s.avail_in;) {
        s.next_out = &cdata[cdata_pos];
        s.avail_out = cdata_alloc - cdata_pos;
        if (cdata_alloc - cdata_pos <= 0) {
            fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
            deflateEnd(&s); /* release zlib's internal state */
            free(cdata);
            return NULL;
        }
        err = deflate(&s, Z_NO_FLUSH);
        cdata_pos = cdata_alloc - s.avail_out;
        if (err != Z_OK) {
            fprintf(stderr, "zlib deflate error: %s\n", s.msg);
            break;
        }
    }
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
        fprintf(stderr, "zlib deflate error: %s\n", s.msg);
    }
    *cdata_size = s.total_out;

    if (deflateEnd(&s) != Z_OK) {
        fprintf(stderr, "zlib deflate error: %s\n", s.msg);
    }
    return (char *)cdata;
}
#ifdef HAVE_LIBLZMA
/* ------------------------------------------------------------------------ */
/*
* Data compression routines using liblzma (xz)
*
* On a test set this shrunk the main db from 136157104 bytes to 114796168, but
* caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
* went from 18.3s to 36.3s. So decompression suffers too, but not as bad
* as compression times.
*
* For now we disable this functionality. If it's to be reenabled make sure you
* improve the mem_inflate implementation as it's just a test hack at the
* moment.
*/
/*
 * Compresses a memory buffer with liblzma in a single call.
 *
 * cdata_size is set to the compressed length (0 on failure).
 *
 * Returns a malloced buffer holding the compressed data on success
 *         (the caller frees it);
 *         NULL on failure.
 */
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
                              int level) {
    char *out;
    size_t out_size = lzma_stream_buffer_bound(size);

    *cdata_size = 0;

    out = malloc(out_size);
    if (!out)
        return NULL;

    /* Single call compression */
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
                                           (uint8_t *)data, size,
                                           (uint8_t *)out, cdata_size,
                                           out_size)) {
        free(out);
        return NULL;
    }

    return out;
}
/*
 * Decompresses an lzma (xz) stream held entirely in memory.
 *
 * size is set to the number of bytes produced (on success only).
 *
 * Returns a malloced buffer holding the uncompressed data on success
 *         (the caller frees it);
 *         NULL on failure.
 */
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
    lzma_stream strm = LZMA_STREAM_INIT;
    size_t out_size = 0, out_pos = 0;
    char *out = NULL, *new_out;
    int r;

    /* Initiate the decoder; 50000000 is its memory usage limit */
    if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0))
        return NULL;

    /* Decode loop */
    strm.avail_in = csize;
    strm.next_in = (uint8_t *)cdata;

    for (;strm.avail_in;) {
        if (strm.avail_in > out_size - out_pos) {
            out_size += strm.avail_in * 4 + 32768;
            new_out = realloc(out, out_size);
            if (!new_out)
                goto fail;
            out = new_out;
        }
        strm.avail_out = out_size - out_pos;
        strm.next_out = (uint8_t *)&out[out_pos];

        r = lzma_code(&strm, LZMA_RUN);
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
            fprintf(stderr, "r=%d\n", r);
            fprintf(stderr, "mem=%"PRId64"d\n", (int64_t)lzma_memusage(&strm));
            goto fail;
        }

        out_pos = strm.total_out;

        if (r == LZMA_STREAM_END)
            break;
    }

    /* finish up any unflushed data; necessary? */
    r = lzma_code(&strm, LZMA_FINISH);
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
        fprintf(stderr, "r=%d\n", r);
        goto fail;
    }

    /* Trim to the final size; if the shrink fails the larger buffer
     * is still valid, so keep it. */
    if (out && strm.total_out > 0) {
        new_out = realloc(out, strm.total_out);
        if (new_out)
            out = new_out;
    }
    *size = strm.total_out;

    lzma_end(&strm);

    return out;

 fail:
    lzma_end(&strm);
    free(out);
    return NULL;
}
#endif
/* ----------------------------------------------------------------------
* CRAM blocks - the dynamically growable data block. We have code to
* create, update, (un)compress and read/write.
*
* These are derived from the deflate_interlaced.c blocks, but with the
* CRAM extension of content types and IDs.
*/
/*
* Allocates a new cram_block structure with a specified content_type and
* id.
*
* Returns block pointer on success
* NULL on failure
*/
cram_block *cram_new_block(enum cram_content_type content_type,
int content_id) {
cram_block *b = malloc(sizeof(*b));
if (!b)
return NULL;
b->method = b->orig_method = RAW;
b->content_type = content_type;
b->content_id = content_id;
b->comp_size = 0;
b->uncomp_size = 0;
b->data = NULL;
b->alloc = 0;
b->byte = 0;
b->bit = 7; // MSB
return b;
}
/*
 * Reads a block from a cram file: the header fields, the payload
 * (uncomp_size bytes for RAW blocks, comp_size bytes otherwise) and,
 * for CRAM major version 3 or later, a trailing CRC32 which is checked
 * against the header and payload just read.
 *
 * The payload is left in its on-disk form; see cram_uncompress_block().
 *
 * Returns cram_block pointer on success.
 *         NULL on failure
 */
cram_block *cram_read_block(cram_fd *fd) {
    cram_block *b = malloc(sizeof(*b));
    if (!b)
        return NULL;
    b->data = NULL; /* so the error path can free it unconditionally */

    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));

    if (-1 == (b->method      = hgetc(fd->fp)))     goto err;
    if (-1 == (b->content_type= hgetc(fd->fp)))     goto err;
    if (-1 == itf8_decode(fd, &b->content_id))      goto err;
    if (-1 == itf8_decode(fd, &b->comp_size))       goto err;
    if (-1 == itf8_decode(fd, &b->uncomp_size))     goto err;

    /* The sizes come from the file; a negative value would become a
     * huge length in the malloc/hread calls below. */
    if (b->comp_size < 0 || b->uncomp_size < 0)
        goto err;

    //    fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
    //	    b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);

    if (b->method == RAW) {
        b->alloc = b->uncomp_size;
        if (!(b->data = malloc(b->uncomp_size)))
            goto err;
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size))
            goto err;
    } else {
        b->alloc = b->comp_size;
        if (!(b->data = malloc(b->comp_size)))
            goto err;
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size))
            goto err;
    }

    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        unsigned char dat[100], *cp = dat;
        uint32_t crc;

        if (-1 == int32_decode(fd, (int32_t *)&b->crc32))
            goto err;

        /* Re-serialise the header so it can be included in the CRC */
        *cp++ = b->method;
        *cp++ = b->content_type;
        cp += itf8_put(cp, b->content_id);
        cp += itf8_put(cp, b->comp_size);
        cp += itf8_put(cp, b->uncomp_size);
        crc = crc32(0L, dat, cp-dat);

        crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
        if (crc != b->crc32) {
            fprintf(stderr, "Block CRC32 failure\n");
            goto err;
        }
    }

    b->orig_method = b->method;
    b->idx = 0;
    b->byte = 0;
    b->bit = 7; // MSB

    return b;

 err:
    free(b->data);
    free(b);
    return NULL;
}
/*
 * Writes a CRAM block: header fields, then the payload (uncomp_size
 * bytes for RAW blocks, comp_size bytes otherwise) and, for CRAM major
 * version 3 or later, a CRC32 over both. The CRC is also stored in
 * b->crc32.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_write_block(cram_fd *fd, cram_block *b) {
    int32_t payload_len;

    assert(b->method != RAW || (b->comp_size == b->uncomp_size));

    /* Block header */
    if (hputc(b->method,       fd->fp) == EOF ||
        hputc(b->content_type, fd->fp) == EOF ||
        itf8_encode(fd, b->content_id)  == -1 ||
        itf8_encode(fd, b->comp_size)   == -1 ||
        itf8_encode(fd, b->uncomp_size) == -1)
        return -1;

    /* Payload */
    payload_len = (b->method == RAW) ? b->uncomp_size : b->comp_size;
    if (payload_len != hwrite(fd->fp, b->data, payload_len))
        return -1;

    /* Trailing checksum */
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        unsigned char hdr[100], *end = hdr;
        uint32_t crc;

        *end++ = b->method;
        *end++ = b->content_type;
        end += itf8_put(end, b->content_id);
        end += itf8_put(end, b->comp_size);
        end += itf8_put(end, b->uncomp_size);
        crc = crc32(0L, hdr, end-hdr);

        b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", payload_len);

        if (-1 == int32_encode(fd, b->crc32))
            return -1;
    }

    return 0;
}
/*
 * Releases a CRAM block together with its data buffer.
 * Passing NULL is allowed and does nothing.
 */
void cram_free_block(cram_block *b) {
    if (b) {
        free(b->data);
        free(b);
    }
}
/*
 * Uncompresses a CRAM block in place, if compressed.
 *
 * On success the block's data buffer holds the uncompressed bytes and
 * its method is RAW. A block with uncomp_size 0 is simply marked RAW.
 *
 * Returns 0 on success
 *        -1 on failure (decompression error, size mismatch, or a method
 *           that is unknown or not compiled in)
 */
int cram_uncompress_block(cram_block *b) {
    char *uncomp;
    size_t uncomp_size = 0;

    if (b->uncomp_size == 0) {
        // blank block
        b->method = RAW;
        return 0;
    }

    switch (b->method) {
    case RAW:
        return 0;

    case GZIP:
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
        if (!uncomp)
            return -1;
        if ((int)uncomp_size != b->uncomp_size) {
            free(uncomp);
            return -1;
        }
        free(b->data);
        b->data = (unsigned char *)uncomp;
        b->alloc = uncomp_size;
        b->method = RAW;
        break;

#ifdef HAVE_LIBBZ2
    case BZIP2: {
        unsigned int usize = b->uncomp_size;
        if (!(uncomp = malloc(usize)))
            return -1;
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
                                                (char *)b->data, b->comp_size,
                                                0, 0)) {
            free(uncomp);
            return -1;
        }
        free(b->data);
        b->data = (unsigned char *)uncomp;
        b->alloc = usize;
        b->method = RAW;
        b->uncomp_size = usize; // Just incase it differs
        break;
    }
#else
    case BZIP2:
        fprintf(stderr, "Bzip2 compression is not compiled into this "
                "version.\nPlease rebuild and try again.\n");
        return -1;
#endif

#ifdef HAVE_LIBLZMA
    case LZMA:
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
        if (!uncomp)
            return -1;
        if ((int)uncomp_size != b->uncomp_size) {
            free(uncomp);
            return -1;
        }
        free(b->data);
        b->data = (unsigned char *)uncomp;
        b->alloc = uncomp_size;
        b->method = RAW;
        break;
#else
    case LZMA:
        fprintf(stderr, "Lzma compression is not compiled into this "
                "version.\nPlease rebuild and try again.\n");
        return -1;
#endif

    case RANS: {
        unsigned int usize = b->uncomp_size, usize2 = 0;
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
        if (!uncomp)
            return -1;
        /* The sizes come from the file, so a mismatch is bad input
         * rather than a programming error. */
        if (usize != usize2) {
            free(uncomp);
            return -1;
        }
        free(b->data);
        b->data = (unsigned char *)uncomp;
        b->alloc = usize2;
        b->method = RAW;
        b->uncomp_size = usize2; // Just incase it differs
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
        break;
    }

    default:
        return -1;
    }

    return 0;
}
/*
 * Compresses a buffer with one specific method.
 *
 * in, in_size  the input
 * out_size     set to the compressed length
 * method       GZIP, BZIP2, LZMA, RANS0 or RANS1
 * level        compression level (GZIP, BZIP2, LZMA)
 * strat        zlib strategy (GZIP only)
 *
 * Returns a malloced buffer holding the compressed data on success;
 *         NULL on failure, for RAW, for any other method, and for
 *         BZIP2/LZMA when support is not compiled in.
 */
static char *cram_compress_by_method(char *in, size_t in_size,
                                     size_t *out_size,
                                     enum cram_block_method method,
                                     int level, int strat) {
    if (method == GZIP)
        return zlib_mem_deflate(in, in_size, out_size, level, strat);

    if (method == BZIP2) {
#ifdef HAVE_LIBBZ2
        unsigned int bz_len = in_size*1.01 + 600;
        char *bz_buf = malloc(bz_len);

        if (!bz_buf)
            return NULL;

        if (BZ_OK != BZ2_bzBuffToBuffCompress(bz_buf, &bz_len,
                                              in, in_size,
                                              level, 0, 30)) {
            free(bz_buf);
            return NULL;
        }
        *out_size = bz_len;
        return bz_buf;
#else
        return NULL;
#endif
    }

    if (method == LZMA) {
#ifdef HAVE_LIBLZMA
        return lzma_mem_deflate(in, in_size, out_size, level);
#else
        return NULL;
#endif
    }

    if (method == RANS0 || method == RANS1) {
        /* The last rans_compress argument is 0 for RANS0, 1 for RANS1 */
        unsigned int packed_len;
        unsigned char *packed = rans_compress((unsigned char *)in, in_size,
                                              &packed_len,
                                              method == RANS1 ? 1 : 0);
        *out_size = packed_len;
        return (char *)packed;
    }

    /* RAW, or a method this function does not handle */
    return NULL;
}
/*
 * Trial-compresses a block's data with one method and keeps the result
 * if it is smaller than the best seen so far.
 *
 * tag is the value recorded in *method_best; GZIP_RLE is kept distinct
 * from GZIP here although both compress with zlib. If compression fails
 * *sz is set to a pessimistic size so the method scores badly.
 */
static void cram_trial_method(cram_block *b, enum cram_block_method method,
                              int level, int strat, int tag, size_t *sz,
                              size_t *sz_best, int *method_best,
                              char **c_best) {
    char *c = cram_compress_by_method((char *)b->data, b->uncomp_size,
                                      sz, method, level, strat);
    if (c && *sz_best > *sz) {
        *sz_best = *sz;
        *method_best = tag;
        free(*c_best);
        *c_best = c;
    } else if (c) {
        free(c);
    } else {
        *sz = b->uncomp_size*2+1000;
    }
}

/*
 * Compresses a block in place.
 *
 * Without metrics the block is compressed with GZIP (Z_FILTERED).
 *
 * With metrics, 'method' is a bit-mask of (1<<cram_block_method) values.
 * During a trial period every method in the mask is tried, the smallest
 * result is kept for this block and the sizes are accumulated in
 * 'metrics'. When the trial period ends the overall best method is
 * cached in 'metrics' and used for subsequent blocks until the next
 * trial. Methods that repeatedly lose by a wide margin are dropped from
 * the mask (metrics->revised_method).
 *
 * A RAW method, level 0 or an empty block leaves the data uncompressed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
                        int method, int level) {
    char *comp = NULL;
    size_t comp_size = 0;
    int strat;

    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);

    if (method == RAW || level == 0 || b->uncomp_size == 0) {
        b->method = RAW;
        b->comp_size = b->uncomp_size;
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
        return 0;
    }

    if (metrics) {
        pthread_mutex_lock(&fd->metrics_lock);
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
            size_t sz_best = INT_MAX;
            size_t sz_gz_rle = 0;
            size_t sz_gz_def = 0;
            size_t sz_rans0 = 0;
            size_t sz_rans1 = 0;
            size_t sz_bzip2 = 0;
            size_t sz_lzma = 0;
            int method_best = 0;
            char *c_best = NULL;

            if (metrics->revised_method)
                method = metrics->revised_method;
            else
                metrics->revised_method = method;

            /* Start of a new trial period: age the old statistics */
            if (metrics->next_trial == 0) {
                metrics->next_trial = TRIAL_SPAN;
                metrics->trial = NTRIALS;
                metrics->sz_gz_rle /= 2;
                metrics->sz_gz_def /= 2;
                metrics->sz_rans0  /= 2;
                metrics->sz_rans1  /= 2;
                metrics->sz_bzip2  /= 2;
                metrics->sz_lzma   /= 2;
            }

            /* The compression itself runs without the lock held */
            pthread_mutex_unlock(&fd->metrics_lock);

            if (method & (1<<GZIP_RLE))
                cram_trial_method(b, GZIP, 1, Z_RLE, GZIP_RLE,
                                  &sz_gz_rle, &sz_best, &method_best, &c_best);

            if (method & (1<<GZIP))
                cram_trial_method(b, GZIP, level, Z_FILTERED, GZIP,
                                  &sz_gz_def, &sz_best, &method_best, &c_best);

            if (method & (1<<RANS0))
                cram_trial_method(b, RANS0, 0, 0, RANS0,
                                  &sz_rans0, &sz_best, &method_best, &c_best);

            if (method & (1<<RANS1))
                cram_trial_method(b, RANS1, 0, 0, RANS1,
                                  &sz_rans1, &sz_best, &method_best, &c_best);

            if (method & (1<<BZIP2))
                cram_trial_method(b, BZIP2, level, 0, BZIP2,
                                  &sz_bzip2, &sz_best, &method_best, &c_best);

            if (method & (1<<LZMA))
                cram_trial_method(b, LZMA, level, 0, LZMA,
                                  &sz_lzma, &sz_best, &method_best, &c_best);

            //fprintf(stderr, "sz_best = %d\n", sz_best);

            if (c_best) {
                free(b->data);
                b->data = (unsigned char *)c_best;
                //printf("method_best = %s\n", cram_block_method2str(method_best));
                b->method = method_best == GZIP_RLE ? GZIP : method_best;
                b->comp_size = sz_best;
            } else {
                /* Every trial failed: keep the data uncompressed rather
                 * than discarding it. */
                b->method = RAW;
                b->comp_size = b->uncomp_size;
            }

            pthread_mutex_lock(&fd->metrics_lock);
            metrics->sz_gz_rle += sz_gz_rle;
            metrics->sz_gz_def += sz_gz_def;
            metrics->sz_rans0  += sz_rans0;
            metrics->sz_rans1  += sz_rans1;
            metrics->sz_bzip2  += sz_bzip2;
            metrics->sz_lzma   += sz_lzma;
            if (--metrics->trial == 0) {
                int best_method = RAW;
                int best_sz = INT_MAX;

                // Scale methods by cost
                if (fd->level <= 3) {
                    metrics->sz_rans1  *= 1.02;
                    metrics->sz_gz_def *= 1.04;
                    metrics->sz_bzip2  *= 1.08;
                    metrics->sz_lzma   *= 1.10;
                } else if (fd->level <= 6) {
                    metrics->sz_rans1  *= 1.01;
                    metrics->sz_gz_def *= 1.02;
                    metrics->sz_bzip2  *= 1.03;
                    metrics->sz_lzma   *= 1.05;
                }

                if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
                    best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;

                if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
                    best_sz = metrics->sz_gz_def, best_method = GZIP;

                if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
                    best_sz = metrics->sz_rans0, best_method = RANS0;

                if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
                    best_sz = metrics->sz_rans1, best_method = RANS1;

                if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
                    best_sz = metrics->sz_bzip2, best_method = BZIP2;

                if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
                    best_sz = metrics->sz_lzma, best_method = LZMA;

                if (best_method == GZIP_RLE) {
                    metrics->method = GZIP;
                    metrics->strat  = Z_RLE;
                } else {
                    metrics->method = best_method;
                    metrics->strat  = Z_FILTERED;
                }

                // If we see at least MAXFAIL trials in a row for a specific
                // compression method with more than MAXDELTA aggregate
                // size then we drop this from the list of methods used
                // for this block type.
#define MAXDELTA 0.20
#define MAXFAILS 4
                if (best_method == GZIP_RLE) {
                    metrics->gz_rle_cnt = 0;
                    metrics->gz_rle_extra = 0;
                } else if (best_sz < metrics->sz_gz_rle) {
                    double r = (double)metrics->sz_gz_rle / best_sz - 1;
                    if (++metrics->gz_rle_cnt >= MAXFAILS &&
                        (metrics->gz_rle_extra += r) >= MAXDELTA)
                        method &= ~(1<<GZIP_RLE);
                }

                if (best_method == GZIP) {
                    metrics->gz_def_cnt = 0;
                    metrics->gz_def_extra = 0;
                } else if (best_sz < metrics->sz_gz_def) {
                    double r = (double)metrics->sz_gz_def / best_sz - 1;
                    if (++metrics->gz_def_cnt >= MAXFAILS &&
                        (metrics->gz_def_extra += r) >= MAXDELTA)
                        method &= ~(1<<GZIP);
                }

                if (best_method == RANS0) {
                    metrics->rans0_cnt = 0;
                    metrics->rans0_extra = 0;
                } else if (best_sz < metrics->sz_rans0) {
                    double r = (double)metrics->sz_rans0 / best_sz - 1;
                    if (++metrics->rans0_cnt >= MAXFAILS &&
                        (metrics->rans0_extra += r) >= MAXDELTA)
                        method &= ~(1<<RANS0);
                }

                if (best_method == RANS1) {
                    metrics->rans1_cnt = 0;
                    metrics->rans1_extra = 0;
                } else if (best_sz < metrics->sz_rans1) {
                    double r = (double)metrics->sz_rans1 / best_sz - 1;
                    if (++metrics->rans1_cnt >= MAXFAILS &&
                        (metrics->rans1_extra += r) >= MAXDELTA)
                        method &= ~(1<<RANS1);
                }

                if (best_method == BZIP2) {
                    metrics->bzip2_cnt = 0;
                    metrics->bzip2_extra = 0;
                } else if (best_sz < metrics->sz_bzip2) {
                    double r = (double)metrics->sz_bzip2 / best_sz - 1;
                    if (++metrics->bzip2_cnt >= MAXFAILS &&
                        (metrics->bzip2_extra += r) >= MAXDELTA)
                        method &= ~(1<<BZIP2);
                }

                if (best_method == LZMA) {
                    metrics->lzma_cnt = 0;
                    metrics->lzma_extra = 0;
                } else if (best_sz < metrics->sz_lzma) {
                    double r = (double)metrics->sz_lzma / best_sz - 1;
                    if (++metrics->lzma_cnt >= MAXFAILS &&
                        (metrics->lzma_extra += r) >= MAXDELTA)
                        method &= ~(1<<LZMA);
                }

                //if (method != metrics->revised_method)
                //    fprintf(stderr, "%d: method from %x to %x\n",
                //	    b->content_id, metrics->revised_method, method);
                metrics->revised_method = method;
            }
            pthread_mutex_unlock(&fd->metrics_lock);
        } else {
            /* Outside a trial: use the cached best method */
            strat = metrics->strat;
            method = metrics->method;

            pthread_mutex_unlock(&fd->metrics_lock);
            comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
                                           &comp_size, method,
                                           level, strat);
            if (!comp)
                return -1;
            free(b->data);
            b->data = (unsigned char *)comp;
            b->comp_size = comp_size;
            b->method = method;
        }

    } else {
        // no cached metrics, so just do zlib?
        comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
                                       &comp_size, GZIP, level, Z_FILTERED);
        if (!comp) {
            fprintf(stderr, "Compression failed!\n");
            return -1;
        }
        free(b->data);
        b->data = (unsigned char *)comp;
        b->comp_size = comp_size;
        b->method = GZIP;
    }

    if (fd->verbose)
        fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
                b->content_id, b->uncomp_size, b->comp_size,
                cram_block_method2str(b->method));

    if (b->method == RANS1)
        b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing

    return 0;
}
/*
 * Allocates a zero-filled cram_metrics structure set up so that the
 * first trial period has NTRIALS-1 trials remaining.
 *
 * Returns the new structure on success;
 *         NULL on failure.
 */
cram_metrics *cram_new_metrics(void) {
    cram_metrics *metrics = calloc(1, sizeof(*metrics));

    if (metrics != NULL) {
        metrics->trial          = NTRIALS-1;
        metrics->next_trial     = TRIAL_SPAN;
        metrics->method         = RAW;
        metrics->strat          = 0;
        metrics->revised_method = 0;
    }

    return metrics;
}
/*
 * Returns a printable name for a block compression method, or "?" for
 * ERROR and any value not listed.
 */
char *cram_block_method2str(enum cram_block_method m) {
    char *name = "?";

    switch (m) {
    case RAW:      name = "RAW";      break;
    case GZIP:     name = "GZIP";     break;
    case BZIP2:    name = "BZIP2";    break;
    case LZMA:     name = "LZMA";     break;
    case RANS0:    name = "RANS0";    break;
    case RANS1:    name = "RANS1";    break;
    case GZIP_RLE: name = "GZIP_RLE"; break;
    case ERROR:    break;
    }

    return name;
}
/*
 * Returns a printable name for a block content type, or "?" for
 * CT_ERROR and any value not listed.
 */
char *cram_content_type2str(enum cram_content_type t) {
    char *name = "?";

    switch (t) {
    case FILE_HEADER:        name = "FILE_HEADER";        break;
    case COMPRESSION_HEADER: name = "COMPRESSION_HEADER"; break;
    case MAPPED_SLICE:       name = "MAPPED_SLICE";       break;
    case UNMAPPED_SLICE:     name = "UNMAPPED_SLICE";     break;
    case EXTERNAL:           name = "EXTERNAL";           break;
    case CORE:               name = "CORE";               break;
    case CT_ERROR:           break;
    }

    return name;
}
/*
 * Closes a stream, first flushing it and syncing it to disk so that
 * write errors are reported. A flush failing with EBADF and an fsync
 * failing with EINVAL (eg a pipe) are tolerated.
 *
 * The stream is closed on every path.
 *
 * Returns 0 on success
 *        -1 on failure.
 */
int paranoid_fclose(FILE *fp) {
    int flush_failed = (-1 == fflush(fp)) && (errno != EBADF);

    if (flush_failed) {
        fclose(fp);
        return -1;
    }

    errno = 0;
    if (-1 == fsync(fileno(fp)) && errno != EINVAL) { // EINVAL: eg pipe
        fclose(fp);
        return -1;
    }

    return fclose(fp);
}
/* ----------------------------------------------------------------------
* Reference sequence handling
*
* These revolve around the refs_t structure, which may potentially be
* shared between multiple cram_fd.
*
* We start with refs_create() to allocate an empty refs_t and then
* populate it with @SQ line data using refs_from_header(). This is done on
* cram_open(). Also at start up we can call cram_load_reference() which
* is used with "scramble -r foo.fa". This replaces the fd->refs with the
* new one specified. In either case refs2id() is then called which
* maps ref_entry names to @SQ ids (refs_t->ref_id[]).
*
* Later, possibly within a thread, we will want to know the actual ref
* seq itself, obtained by calling cram_get_ref(). This may use the
* UR: or M5: fields or the filename specified in the original
* cram_load_reference() call.
*
* Given the potential for multi-threaded reference usage, we have
* reference counting (sorry for the confusing double use of "ref") to
* track the number of callers interested in any specific reference.
*/
/*
 * Drops one reference to a refs_t. When the count reaches zero the
 * string pool, every ref_entry in the hash (and its sequence), the id
 * array, the open reference file and the lock are all released.
 *
 * Passing NULL is allowed and does nothing.
 */
void refs_free(refs_t *r) {
    RP("refs_free()\n");

    /* Must be tested before r->count is touched */
    if (!r)
        return;

    if (--r->count > 0)
        return;

    if (r->pool)
        string_pool_destroy(r->pool);

    if (r->h_meta) {
        khint_t k;

        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
            ref_entry *e;

            if (!kh_exist(r->h_meta, k))
                continue;
            if (!(e = kh_val(r->h_meta, k)))
                continue;
            free(e->seq);
            free(e);
        }

        kh_destroy(refs, r->h_meta);
    }

    /* Only the array is freed here; the entries belong to the hash */
    free(r->ref_id);

    if (r->fp)
        bgzf_close(r->fp);

    pthread_mutex_destroy(&r->lock);

    free(r);
}
/*
 * Allocates an empty refs_t holding one reference (count == 1), with a
 * string pool, an empty name hash and an initialised lock.
 *
 * Returns the new refs_t on success;
 *         NULL on failure.
 */
static refs_t *refs_create(void) {
    refs_t *r = calloc(1, sizeof(*r));

    RP("refs_create()\n");

    if (!r)
        return NULL;

    /* refs_free() destroys the lock, so it has to be initialised before
     * any failure path below hands r to refs_free(). */
    if (pthread_mutex_init(&r->lock, NULL) != 0) {
        free(r);
        return NULL;
    }
    r->count = 1;

    if (!(r->pool = string_pool_create(8192)))
        goto err;

    r->ref_id = NULL; // see refs2id() to populate.
    r->last = NULL;
    r->last_id = -1;

    if (!(r->h_meta = kh_init(refs)))
        goto err;

    return r;

 err:
    refs_free(r);
    return NULL;
}
/*
 * Opens a reference fasta file as a BGZF stream, allowing for
 * compressed files. If "<fn>.fai" is not readable fai_build() is called
 * to create it first, and for a compressed file the ".gzi" index is
 * loaded as well.
 *
 * Returns a BGZF handle on success;
 *         NULL on failure.
 */
static BGZF *bgzf_open_ref(char *fn, char *mode) {
    char fai_file[PATH_MAX];
    BGZF *fp;

    /* Make sure there is a .fai index alongside the file */
    snprintf(fai_file, PATH_MAX, "%s.fai", fn);
    if (access(fai_file, R_OK) != 0 && fai_build(fn) != 0)
        return NULL;

    fp = bgzf_open(fn, mode);
    if (fp == NULL) {
        perror(fn);
        return NULL;
    }

    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
        fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
        bgzf_close(fp);
        return NULL;
    }

    return fp;
}
/*
 * Loads a FAI file for a reference.fasta.
 *
 * fn may name either the fasta file or its ".fai" index. The fasta is
 * opened (replacing any file already open in the refs_t) and one
 * ref_entry per index line is added to the name hash and to ref_id[],
 * in file order. An entry already in the hash is kept if it has a
 * non-zero count or length, otherwise it is replaced.
 *
 * "is_err" indicates whether failure to load is worthy of emitting an
 * error message. In some cases (eg with embedded references) we
 * speculatively load, just in case, and silently ignore errors.
 *
 * Returns the refs_t struct on success (maybe newly allocated);
 *         NULL on failure. A refs_t created here is freed again on
 *         failure; one passed in as r_orig is left for the caller.
 */
static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
    struct stat sb;
    FILE *fp = NULL;
    char fai_fn[PATH_MAX];
    char line[8192];
    refs_t *r = r_orig;
    size_t fn_l = strlen(fn);
    int id = 0, id_alloc = 0;

    RP("refs_load_fai %s\n", fn);

    if (!r)
        if (!(r = refs_create()))
            goto err;

    /* Open reference, for later use */
    if (stat(fn, &sb) != 0) {
        if (is_err)
            perror(fn);
        goto err;
    }

    if (r->fp)
        if (bgzf_close(r->fp) != 0)
            goto err;
    r->fp = NULL;

    if (!(r->fn = string_dup(r->pool, fn)))
        goto err;

    /* Given the index name, strip the suffix to get the fasta name */
    if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
        r->fn[fn_l-4] = 0;

    if (!(r->fp = bgzf_open_ref(r->fn, "r")))
        goto err;

    /* Parse .fai file and load meta-data */
    sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);

    if (stat(fai_fn, &sb) != 0) {
        if (is_err)
            perror(fai_fn);
        goto err;
    }
    if (!(fp = fopen(fai_fn, "r"))) {
        if (is_err)
            perror(fai_fn);
        goto err;
    }

    while (fgets(line, 8192, fp) != NULL) {
        ref_entry *e = malloc(sizeof(*e));
        char *cp;
        int n;
        khint_t k;

        if (!e)
            goto err;

        // id
        for (cp = line; *cp && !isspace((unsigned char)*cp); cp++)
            ;
        if (*cp)
            *cp++ = 0; // never step past the end of a name-only line
        if (!(e->name = string_dup(r->pool, line))) {
            free(e);
            goto err;
        }

        // length
        while (*cp && isspace((unsigned char)*cp))
            cp++;
        e->length = strtoll(cp, &cp, 10);

        // offset
        while (*cp && isspace((unsigned char)*cp))
            cp++;
        e->offset = strtoll(cp, &cp, 10);

        // bases per line
        while (*cp && isspace((unsigned char)*cp))
            cp++;
        e->bases_per_line = strtol(cp, &cp, 10);

        // line length
        while (*cp && isspace((unsigned char)*cp))
            cp++;
        e->line_length = strtol(cp, &cp, 10);

        // filename
        e->fn = r->fn;

        e->count = 0;
        e->seq = NULL;

        k = kh_put(refs, r->h_meta, e->name, &n);
        if (-1 == n) {
            free(e);
            goto err;
        }

        if (n) {
            kh_val(r->h_meta, k) = e;
        } else {
            ref_entry *re = kh_val(r->h_meta, k);
            if (re && (re->count != 0 || re->length != 0)) {
                /* Keep old; ref_id[] must point at the surviving entry */
                free(e);
                e = re;
            } else {
                /* Replace old */
                if (re)
                    free(re);
                kh_val(r->h_meta, k) = e;
            }
        }

        if (id >= id_alloc) {
            ref_entry **new_id;
            int x;

            id_alloc = id_alloc ? id_alloc*2 : 16;
            new_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
            if (!new_id)
                goto err;
            r->ref_id = new_id;

            for (x = id; x < id_alloc; x++)
                r->ref_id[x] = NULL;
        }
        r->ref_id[id] = e;
        r->nref = ++id;
    }

    fclose(fp);
    return r;

 err:
    if (fp)
        fclose(fp);

    if (!r_orig && r)
        refs_free(r);

    return NULL;
}
/*
 * Rebuilds r->ref_id[] so that it is indexed in the order the
 * references appear in the header h, which may not be the order of the
 * fasta reference file. Names missing from the hash are reported on
 * stderr and their slot is left NULL.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int refs2id(refs_t *r, SAM_hdr *h) {
    int idx;

    /* Discard the previous mapping */
    free(r->ref_id);
    r->last = NULL;

    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
    if (r->ref_id == NULL)
        return -1;

    r->nref = h->nref;
    for (idx = 0; idx < h->nref; idx++) {
        khint_t k = kh_get(refs, r->h_meta, h->ref[idx].name);

        if (k == kh_end(r->h_meta)) {
            fprintf(stderr, "Unable to find ref name '%s'\n",
                    h->ref[idx].name);
            continue;
        }
        r->ref_id[idx] = kh_val(r->h_meta, k);
    }

    return 0;
}
/*
 * Generates refs_t entries based on @SQ lines in the header.
 *
 * Each header reference not already in the name hash gets a new
 * ref_entry with length 0 (the marker for "not yet loaded"), appended
 * to r->ref_id[] and added to the hash. If the @SQ line has an M5 tag
 * its value is stored as the entry's likely filename.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
    ref_entry **new_id;
    int i, j;

    if (!h || h->nref == 0)
        return 0;

    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);

    /* Existing refs are fine, as long as they're compatible with the hdr. */
    new_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id));
    if (!new_id)
        return -1;
    r->ref_id = new_id;

    /* Copy info from h->ref[i] over to r */
    for (i = 0, j = r->nref; i < h->nref; i++) {
        SAM_hdr_type *ty;
        SAM_hdr_tag *tag;
        khint_t k;
        int n;

        /* i indexes the header, j the refs_t; the name is the header's */
        if (!h->ref[i].name)
            return -1;

        k = kh_get(refs, r->h_meta, h->ref[i].name);
        if (k != kh_end(r->h_meta))
            // Ref already known about
            continue;

        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
            return -1;

        r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
        if (!r->ref_id[j]->name) {
            free(r->ref_id[j]);
            r->ref_id[j] = NULL;
            return -1;
        }
        r->ref_id[j]->length = 0; // marker for not yet loaded

        /* Initialise likely filename if known */
        if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
            if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
                //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
            }
        }

        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
        if (n <= 0) { // already exists or error
            free(r->ref_id[j]);
            r->ref_id[j] = NULL;
            return -1;
        }
        kh_val(r->h_meta, k) = r->ref_id[j];

        j++;
    }

    r->nref = j;

    return 0;
}
/*
 * Attaches a header to a cram_fd, freeing any header it already has,
 * and creates reference entries from the new header's @SQ lines.
 *
 * This should be used when creating a new cram_fd for writing where
 * we have an SAM_hdr already constructed (eg from a file we've read
 * in).
 *
 * Returns the result of refs_from_header(): 0 on success
 *                                          -1 on failure
 */
int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
    SAM_hdr *previous = fd->header;

    if (previous)
        sam_hdr_free(previous);

    fd->header = hdr;

    return refs_from_header(fd->refs, fd, hdr);
}
/*
 * Converts a directory and a filename into an expanded path, replacing %s
 * in directory with the filename and %[0-9]+s with portions of the filename.
 * Any remaining parts of filename are added to the end with /%s.
 *
 * Each substitution consumes the characters it used from the filename.
 * A '%' followed by anything else is copied through unchanged.
 *
 * path receives the NUL-terminated result; the function has no way to
 * know its size, so the caller must supply a large enough buffer.
 */
void expand_cache_path(char *path, char *dir, char *fn) {
    char *start = path;
    char *cp;
    size_t len;

    while ((cp = strchr(dir, '%'))) {
        /* Literal text up to the '%' */
        len = cp - dir;
        memcpy(path, dir, len);
        path += len;

        cp++;
        if (*cp == 's') {
            /* %s: all that is left of the filename */
            len = strlen(fn);
            memcpy(path, fn, len);
            path += len;
            fn += len;
            cp++;
        } else if (*cp >= '0' && *cp <= '9') {
            char *endp;
            long l = strtol(cp, &endp, 10);

            if (*endp == 's') {
                /* %Ns: at most N characters of the filename */
                len = strlen(fn);
                if ((unsigned long)l < len)
                    len = l;
                memcpy(path, fn, len);
                path += len;
                fn += len;
                cp = endp+1;
            } else {
                *path++ = '%';
                *path++ = *cp++;
            }
        } else {
            *path++ = '%';
            /* A trailing '%' has nothing after it to copy or skip */
            if (*cp)
                *path++ = *cp++;
        }
        dir = cp;
    }

    /* Remainder of the directory template */
    len = strlen(dir);
    memcpy(path, dir, len);
    path += len;

    /* Separate any unused part of the filename with a single '/';
     * path[-1] is only valid once something has been written. */
    if (*fn && (path == start || path[-1] != '/'))
        *path++ = '/';
    strcpy(path, fn);
}
/*
 * Make the directory containing path and any prefix directories.
 *
 * path is cut at its last '/' while the directory is handled and is
 * restored before returning. Failures of mkdir and chmod are ignored.
 */
void mkdir_prefix(char *path, int mode) {
    char *sep = strrchr(path, '/');

    if (sep == NULL)
        return;

    /* Temporarily truncate path to its directory part */
    *sep = 0;

    if (!is_directory(path)) {
        if (mkdir(path, mode) == 0) {
            chmod(path, mode);
        } else {
            /* Could not create it directly: make the prefix
             * directories first, then try again. */
            mkdir_prefix(path, mode);
            mkdir(path, mode);
            chmod(path, mode);
        }
    }

    *sep = '/';
}
/*
 * Return the cache directory to use, based on the first of these
 * environment variables to be set to a non-empty value:
 * XDG_CACHE_HOME, HOME, TMPDIR, TEMP. If none is set "/tmp" is used.
 *
 * *extra is set to a suffix to append to the returned directory:
 * "/.cache" when HOME was chosen, "" otherwise.
 */
static const char *get_cache_basedir(const char **extra) {
    static const char *const tmp_vars[] = { "TMPDIR", "TEMP" };
    const char *dir;
    size_t i;

    *extra = "";

    dir = getenv("XDG_CACHE_HOME");
    if (dir != NULL && dir[0] != '\0')
        return dir;

    dir = getenv("HOME");
    if (dir != NULL && dir[0] != '\0') {
        *extra = "/.cache";
        return dir;
    }

    for (i = 0; i < sizeof(tmp_vars) / sizeof(tmp_vars[0]); i++) {
        dir = getenv(tmp_vars[i]);
        if (dir != NULL && dir[0] != '\0')
            return dir;
    }

    return "/tmp";
}
/*
 * Queries the M5 string from the header and attempts to populate the
 * reference from this using the REF_PATH environment.
 *
 * The steps tried, in order:
 *   1. a file for this M5 value in the local cache (REF_CACHE, or a
 *      default under the cache base directory when REF_PATH is unset);
 *   2. a search of REF_PATH (defaulting to the EBI server), whose result
 *      is then written to the local cache if there is one;
 *   3. the file named by the @SQ UR tag, loaded via its .fai index.
 *
 * For 1 and 3 only the file details are set up and the sequence itself
 * is read later by cram_get_ref(); for 2 the sequence is stored in
 * r->seq.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
    char *ref_path = getenv("REF_PATH");
    SAM_hdr_type *ty;
    SAM_hdr_tag *tag;
    char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX];
    char *local_cache = getenv("REF_CACHE");
    mFILE *mf;

    if (fd->verbose)
        fprintf(stderr, "cram_populate_ref on fd %p, id %d\n",
                (void *)fd, id);

    if (!ref_path || *ref_path == '\0') {
        /*
         * If we have no ref path, we use the EBI server.
         * However to avoid spamming it we require a local ref cache too.
         */
        ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
        if (!local_cache || *local_cache == '\0') {
            const char *extra;
            const char *base = get_cache_basedir(&extra);
            snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
            local_cache = cache;
            if (fd->verbose)
                fprintf(stderr, "Populating local cache: %s\n", local_cache);
        }
    }

    if (!r->name)
        return -1;

    if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
        return -1;

    if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
        goto no_M5;

    /* tag->str+3 skips the tag's first three characters; presumably
     * the "M5:" prefix. */
    if (fd->verbose)
        fprintf(stderr, "Querying ref %s\n", tag->str+3);

    /* Use cache if available */
    if (local_cache && *local_cache) {
        struct stat sb;
        BGZF *fp;

        expand_cache_path(path, local_cache, tag->str+3);

        if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
            r->length = sb.st_size;
            r->offset = r->line_length = r->bases_per_line = 0;

            r->fn = string_dup(fd->refs->pool, path);

            if (fd->refs->fp)
                if (bgzf_close(fd->refs->fp) != 0) {
                    bgzf_close(fp); /* don't leak the new handle */
                    return -1;
                }
            fd->refs->fp = fp;
            fd->refs->fn = r->fn;

            // Fall back to cram_get_ref() where it'll do the actual
            // reading of the file.
            return 0;
        }
    }

    /* Otherwise search */
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
        size_t sz;
        r->seq = mfsteal(mf, &sz);
        r->length = sz;
    } else {
        refs_t *refs;
        char *fn;

    no_M5:
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
        if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
            return -1;

        fn = (strncmp(tag->str+3, "file:", 5) == 0)
            ? tag->str+8
            : tag->str+3;

        if (fd->refs->fp) {
            if (bgzf_close(fd->refs->fp) != 0)
                return -1;
            fd->refs->fp = NULL;
        }
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
            return -1;
        fd->refs = refs;
        if (fd->refs->fp) {
            if (bgzf_close(fd->refs->fp) != 0)
                return -1;
            fd->refs->fp = NULL;
        }

        if (!fd->refs->fn)
            return -1;

        if (-1 == refs2id(fd->refs, fd->header))
            return -1;
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
            return -1;

        // Local copy already, so fall back to cram_get_ref().
        return 0;
    }

    /* Populate the local disk cache if required */
    if (local_cache && *local_cache) {
        FILE *fp;
        int i;

        expand_cache_path(path, local_cache, tag->str+3);
        if (fd->verbose)
            fprintf(stderr, "Path='%s'\n", path);
        mkdir_prefix(path, 01777);

        /* Write to a uniquely named temporary file, then rename it into
         * place so that readers never see a partial cache entry. */
        i = 0;
        do {
            int n = snprintf(path_tmp, sizeof(path_tmp), "%s.tmp_%d",
                             path, i);
            if (n < 0 || (size_t)n >= sizeof(path_tmp)) {
                // Name too long to cache. Not fatal - we have the data.
                return 0;
            }
            i++;
            fp = fopen(path_tmp, "wx");
        } while (fp == NULL && errno == EEXIST);
        if (!fp) {
            perror(path_tmp);

            // Not fatal - we have the data already so keep going.
            return 0;
        }

        if (r->length != fwrite(r->seq, 1, r->length, fp)) {
            /* A short write must never be installed in the cache */
            perror(path);
            fclose(fp);
            unlink(path_tmp);
            return 0;
        }
        if (-1 == paranoid_fclose(fp)) {
            unlink(path_tmp);
        } else {
            if (0 == chmod(path_tmp, 0444))
                rename(path_tmp, path);
            else
                unlink(path_tmp);
        }
    }

    return 0;
}
/*
 * Adds one to the usage count of reference 'id'; the name suggests the
 * caller is expected to hold r->lock.
 *
 * Nothing happens for a negative id or a reference with no sequence
 * loaded. If the reference was recorded in r->last_id that record is
 * cleared.
 */
static void cram_ref_incr_locked(refs_t *r, int id) {
    RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);

    if (id >= 0 && r->ref_id[id]->seq) {
        if (r->last_id == id)
            r->last_id = -1;

        ++r->ref_id[id]->count;
    }
}
/*
 * Adds one to the usage count of reference 'id', taking r->lock around
 * the update.
 */
void cram_ref_incr(refs_t *r, int id) {
    pthread_mutex_lock(&r->lock);

    cram_ref_incr_locked(r, id);

    pthread_mutex_unlock(&r->lock);
}
/*
 * Subtracts one from the usage count of reference 'id'; the name
 * suggests the caller is expected to hold r->lock.
 *
 * Nothing happens for a negative id or a reference with no sequence
 * loaded. When the count reaches zero the sequence is not freed
 * straight away: the id is remembered in r->last_id, and the sequence
 * of the reference previously remembered there is freed if that one is
 * still unused.
 */
static void cram_ref_decr_locked(refs_t *r, int id) {
    RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);

    /* Tested on its own: ref_id[] must not be indexed with id < 0 */
    if (id < 0)
        return;

    if (!r->ref_id[id]->seq) {
        assert(r->ref_id[id]->count >= 0);
        return;
    }

    if (--r->ref_id[id]->count <= 0) {
        assert(r->ref_id[id]->count == 0);
        if (r->last_id >= 0) {
            if (r->ref_id[r->last_id]->count <= 0 &&
                r->ref_id[r->last_id]->seq) {
                RP("%d FREE REF %d (%p)\n", gettid(),
                   r->last_id, r->ref_id[r->last_id]->seq);
                free(r->ref_id[r->last_id]->seq);
                r->ref_id[r->last_id]->seq = NULL;
                r->ref_id[r->last_id]->length = 0;
            }
        }
        r->last_id = id;
    }
}
/* Locking wrapper: holds r->lock for the duration of cram_ref_decr_locked(). */
void cram_ref_decr(refs_t *r, int id) {
    pthread_mutex_t *mtx = &r->lock;

    pthread_mutex_lock(mtx);
    cram_ref_decr_locked(r, id);
    pthread_mutex_unlock(mtx);
}
/*
 * Reads bases start..end (1-based, inclusive) of reference entry 'e' from
 * the already opened file 'fp' into one contiguous, newly malloced buffer.
 * Used by cram_ref_load and cram_get_ref.
 *
 * e->line_length != 0 marks a line-wrapped (fasta style) layout where file
 * offsets must be derived from e->offset, e->bases_per_line and
 * e->line_length; otherwise base N is taken to live at file offset N-1.
 * Bytes outside '!'..'~' are dropped and the result has bit 0x20 cleared
 * (upper-casing ASCII letters).
 *
 * Returns the sequence on success (caller frees);
 *         NULL on failure.
 */
static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
    off_t offset, len;
    char *seq;
    int nbases;

    if (end < start)
        end = start;
    nbases = end-start+1;

    /* Translate base coordinates into a byte range within the file. */
    if (e->line_length) {
        offset = e->offset + (start-1)/e->bases_per_line * e->line_length +
            (start-1) % e->bases_per_line;
        len = (e->offset + (end-1)/e->bases_per_line * e->line_length +
               (end-1) % e->bases_per_line) - offset + 1;
    } else {
        offset = start-1;
        len = (end-1) - offset + 1;
    }

    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
        perror("bgzf_useek() on reference file");
        return NULL;
    }

    if (len == 0)
        return NULL;
    seq = malloc(len);
    if (!seq)
        return NULL;

    if (len != bgzf_read(fp, seq, len)) {
        perror("bgzf_read() on reference file");
        free(seq);
        return NULL;
    }

    if (len == nbases) {
        /* Byte count equals base count: nothing to strip, just uppercase. */
        int i;
        for (i = 0; i < len; i++)
            seq[i] &= ~0x20;
    } else {
        /* Line breaks were read too: compact printable bytes in place. */
        char *src = seq;
        char *dst = seq;
        char *stop = seq + len;

        for (; src < stop; src++) {
            if (*src >= '!' && *src <= '~')
                *dst++ = *src & ~0x20;
        }

        if (dst - seq != nbases) {
            fprintf(stderr, "Malformed reference file?\n");
            free(seq);
            return NULL;
        }
    }

    return seq;
}
/*
 * Load the entire reference 'id' into r->ref_id[id]->seq.
 *
 * If the sequence is already loaded the entry is returned unchanged.
 * Otherwise, on a fresh load, the entry's count is raised twice: once for
 * the caller and once for r->last, which remembers the most recently loaded
 * entry so that incr/decr loops on one sequence do not cause load/free
 * loops.  The reference previously held through r->last is given back
 * first.
 *
 * Returns ref_entry on success;
 *         NULL on failure
 */
ref_entry *cram_ref_load(refs_t *r, int id) {
    ref_entry *e = r->ref_id[id];
    int start = 1, end = e->length;
    char *seq;

    if (e->seq) {
        return e;
    }

    assert(e->count == 0);

    if (r->last) {
#ifdef REF_DEBUG
        int idx = 0;
        for (idx = 0; idx < r->nref; idx++)
            if (r->last == r->ref_id[idx])
                break;
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
#endif
        assert(r->last->count > 0);
        if (--r->last->count <= 0) {
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
            if (r->last->seq) {
                free(r->last->seq);
                r->last->seq = NULL;
            }
        }
        /* The reference held through r->last is gone now.  Forget the
         * pointer so that a failure below cannot lead to the same entry
         * being decremented a second time on the next call. */
        r->last = NULL;
    }

    /* Open file if it's not already the current open reference */
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
        if (r->fp) {
            int close_rc = bgzf_close(r->fp);
            /* Never keep a handle that has been through bgzf_close(),
             * whether or not the close succeeded. */
            r->fp = NULL;
            if (close_rc != 0)
                return NULL;
        }
        r->fn = e->fn;
        if (!(r->fp = bgzf_open_ref(r->fn, "r")))
            return NULL;
    }

    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);

    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
        return NULL;
    }

    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);

    /* First count: the caller's reference. */
    RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
    e->seq = seq;
    e->count++;

    /*
     * Second count: keep track of last used ref so incr/decr loops on the
     * same sequence don't cause load/free loops.
     */
    RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
    r->last = e;
    e->count++;

    return e;
}
/*
 * Returns a portion of a reference sequence from start to end inclusive.
 * The returned pointer is owned by either the cram_fd or by the internal
 * refs_t structure and should not be freed by the caller.
 *
 * Which of the two applies depends on fd->shared_ref (forced on for
 * unsorted data), on whether the whole sequence is already loaded, and on
 * how much of it is requested:
 *
 *  - shared / already loaded / at least half the sequence requested:
 *    the complete sequence is held in fd->refs with reference counting and
 *    the returned pointer addresses base 'start' within it.  Release it
 *    with cram_ref_decr().
 *
 *  - otherwise only the requested segment is loaded into fd->ref; it is
 *    freed by the next call that loads another segment.
 *
 * To return the entire reference sequence, specify start as 1 and end
 * as 0.
 *
 * Both fd->ref_lock and fd->refs->lock are taken internally and are
 * released again on every return path.
 *
 * Returns reference on success,
 *         NULL on failure
 */
char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
    ref_entry *r;
    char *seq;
    int ostart = start;

    if (id == -1)
        return NULL;

    /* FIXME: axiomatic query of r->seq being true?
     * Or shortcut for unsorted data where we load once and never free?
     */

    //fd->shared_ref = 1; // hard code for now to simplify things

    pthread_mutex_lock(&fd->ref_lock);

    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);

    /*
     * Unsorted data implies we want to fetch an entire reference at a time.
     * We just deal with this at the moment by claiming we're sharing
     * references instead, which has the same requirement.
     */
    if (fd->unsorted)
        fd->shared_ref = 1;

    /* Sanity checking: does this ID exist?  fd->refs must be tested
     * before it is dereferenced for nref / ref_id. */
    if (!fd->refs ||
        id >= fd->refs->nref ||
        !(r = fd->refs->ref_id[id])) {
        fprintf(stderr, "No reference found for id %d\n", id);
        pthread_mutex_unlock(&fd->ref_lock);
        return NULL;
    }

    /*
     * It has an entry, but may not have been populated yet.
     * Any manually loaded .fai files have their lengths known.
     * A ref entry computed from @SQ lines (M5 or UR field) will have
     * r->length == 0 unless it's been loaded once and verified that we have
     * an on-disk filename for it.
     *
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
     * rewrite my code to have one curl handle per thread.
     */
    pthread_mutex_lock(&fd->refs->lock);
    if (r->length == 0) {
        if (cram_populate_ref(fd, id, r) == -1) {
            fprintf(stderr, "Failed to populate reference for id %d\n", id);
            goto fail_unlock;
        }
        r = fd->refs->ref_id[id];
        if (fd->unsorted)
            cram_ref_incr_locked(fd->refs, id);
    }

    /*
     * We now know the filename containing the reference, so check for
     * limits.  If the request covers at least half the reference we'll
     * load all of it in memory as this will speed up subsequent calls.
     */
    if (end < 1)
        end = r->length;
    if (end >= r->length)
        end = r->length;
    assert(start >= 1);

    if (end - start >= 0.5*r->length || fd->shared_ref) {
        start = 1;
        end = r->length;
    }

    /*
     * Maybe we have it cached already? If so use it.
     *
     * Alternatively if we don't have the sequence but we're sharing
     * references and/or are asking for the entire length of it, then
     * load the full reference into the refs structure and return
     * a pointer to that one instead.
     */
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
        char *cp;

        if (id >= 0) {
            if (r->seq) {
                cram_ref_incr_locked(fd->refs, id);
            } else {
                if (!cram_ref_load(fd->refs, id))
                    goto fail_unlock;

                /* unsorted data implies cache ref indefinitely, to avoid
                 * continually loading and unloading.
                 */
                if (fd->unsorted)
                    cram_ref_incr_locked(fd->refs, id);
            }

            fd->ref = NULL; /* We never access it directly */
            fd->ref_start = 1;
            fd->ref_end = r->length;
            fd->ref_id = id;

            cp = fd->refs->ref_id[id]->seq + ostart-1;
        } else {
            fd->ref = NULL;
            cp = NULL;
        }

        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);

        pthread_mutex_unlock(&fd->refs->lock);
        pthread_mutex_unlock(&fd->ref_lock);
        return cp;
    }

    /*
     * Otherwise we're not sharing, we don't have a copy of it already and
     * we're only asking for a small portion of it.
     *
     * In this case load up just that segment ourselves, freeing any old
     * small segments in the process.
     */

    /* Unmapped ref ID */
    if (id < 0) {
        if (fd->ref_free) {
            free(fd->ref_free);
            fd->ref_free = NULL;
        }
        fd->ref = NULL;
        fd->ref_id = id;
        goto fail_unlock;
    }

    /* Open file if it's not already the current open reference */
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
        if (fd->refs->fp) {
            int close_rc = bgzf_close(fd->refs->fp);
            /* Never keep a handle that has been through bgzf_close(). */
            fd->refs->fp = NULL;
            if (close_rc != 0)
                goto fail_unlock; /* previously returned with locks held */
        }
        fd->refs->fn = r->fn;
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r")))
            goto fail_unlock;
    }

    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end)))
        goto fail_unlock;

    if (fd->ref_free)
        free(fd->ref_free);

    fd->ref_id    = id;
    fd->ref_start = start;
    fd->ref_end   = end;
    fd->ref_free  = fd->ref;
    seq = fd->ref;

    pthread_mutex_unlock(&fd->refs->lock);
    pthread_mutex_unlock(&fd->ref_lock);

    /* The buffer begins at 'start'; hand back the originally requested base. */
    return seq + ostart - start;

 fail_unlock:
    /* Common exit for every NULL return made while both locks are held. */
    pthread_mutex_unlock(&fd->refs->lock);
    pthread_mutex_unlock(&fd->ref_lock);
    return NULL;
}
/*
 * Attaches reference data to 'fd'.
 *
 * With a non-NULL 'fn' the references are loaded through refs_load_fai()
 * and fd->ref_fn is set to the filename recorded by it (or NULL when the
 * load failed).  If fd has a header and no usable references resulted, a
 * fresh refs_t is built from the header's @SQ lines instead, so 'fn' may
 * be NULL in that situation.
 *
 * Returns 0 when a reference filename is in effect;
 *        -1 otherwise, or on failure.
 */
int cram_load_reference(cram_fd *fd, char *fn) {
    if (fn != NULL) {
        int flag = !(fd->embed_ref && fd->mode == 'r');

        fd->refs = refs_load_fai(fd->refs, fn, flag);
        if (fd->refs)
            fn = fd->refs->fn;
        else
            fn = NULL;
    }
    fd->ref_fn = fn;

    if (fd->header) {
        int need_header_refs = !fd->refs || (fd->refs->nref == 0 && !fn);

        if (need_header_refs) {
            if (fd->refs)
                refs_free(fd->refs);

            fd->refs = refs_create();
            if (!fd->refs)
                return -1;

            if (refs_from_header(fd->refs, fd, fd->header) == -1)
                return -1;
        }

        if (refs2id(fd->refs, fd->header) == -1)
            return -1;
    }

    return fn ? 0 : -1;
}
/* ----------------------------------------------------------------------
* Containers
*/
/*
 * Creates a new container, specifying the maximum number of slices
 * and records permitted.
 *
 * nrec   - value stored in max_rec; max_c_rec is nrec * nslice.
 * nslice - number of slots allocated in the slices[] array.  It may be
 *          zero, in which case slices[] may be NULL.
 *
 * On failure everything allocated so far is released again.
 *
 * Returns cram_container ptr on success
 *         NULL on failure
 */
cram_container *cram_new_container(int nrec, int nslice) {
    cram_container *c = calloc(1, sizeof(*c));
    enum cram_DS_ID id;

    if (!c)
        return NULL;

    c->curr_ref = -2;

    c->max_c_rec = nrec * nslice;
    c->curr_c_rec = 0;

    c->max_rec = nrec;
    c->record_counter = 0;
    c->num_bases = 0;

    c->max_slice = nslice;
    c->curr_slice = 0;

    c->pos_sorted = 1;
    c->max_apos   = 0;
    c->multi_seq  = 0;

    c->bams = NULL;

    /* calloc(0, ...) is allowed to return NULL, so a NULL result only
     * counts as an allocation failure for a non-empty request. */
    c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *));
    if (!c->slices && nslice != 0)
        goto err;
    c->slice = NULL;

    if (!(c->comp_hdr = cram_new_compression_header()))
        goto err;
    c->comp_hdr_block = NULL;

    for (id = DS_RN; id < DS_TN; id++)
        if (!(c->stats[id] = cram_stats_create())) goto err;

    //c->aux_B_stats = cram_stats_create();

    if (!(c->tags_used = kh_init(s_i2i)))
        goto err;
    c->refs_used = 0;

    return c;

 err:
    /* The container was zero-initialised, so cram_free_container() only
     * releases the members that were actually created above (compression
     * header, stats, tag hash, slices array). */
    cram_free_container(c);
    return NULL;
}
/*
 * Releases a container and the members it owns: refs_used, landmark, the
 * compression header and its block, every slice in slices[0..max_slice-1],
 * the per data-series stats and the tags_used hash.
 *
 * Passing NULL is a no-op.
 */
void cram_free_container(cram_container *c) {
    enum cram_DS_ID ds;
    int slot;

    if (c == NULL)
        return;

    /* free(NULL) is harmless, so plain buffers need no guard. */
    free(c->refs_used);
    free(c->landmark);

    if (c->comp_hdr)
        cram_free_compression_header(c->comp_hdr);

    if (c->comp_hdr_block)
        cram_free_block(c->comp_hdr_block);

    if (c->slices) {
        for (slot = 0; slot < c->max_slice; slot++) {
            if (c->slices[slot])
                cram_free_slice(c->slices[slot]);
        }
        free(c->slices);
    }

    for (ds = DS_RN; ds < DS_TN; ds++) {
        if (c->stats[ds])
            cram_stats_free(c->stats[ds]);
    }

    if (c->tags_used)
        kh_destroy(s_i2i, c->tags_used);

    free(c);
}
/*
 * Reads a container header.
 *
 * The fixed fields are decoded into a stack copy first; the heap container
 * and its landmark array are only allocated once those have been read.
 * For CRAM v3 and later the header is re-encoded into a scratch buffer and
 * its CRC32 compared against the stored value.
 *
 * On return c->offset holds the number of header bytes consumed.
 * fd->eof is set when the initial length field cannot be read, and
 * fd->empty_container records whether this header looks like an EOF
 * marker container.
 *
 * Returns cram_container on success
 *         NULL on failure or no container left (fd->err == 0).
 */
cram_container *cram_read_container(cram_fd *fd) {
    cram_container c2, *c;
    int i, s;
    size_t rd = 0;

    fd->err = 0;
    fd->eof = 0;

    memset(&c2, 0, sizeof(c2));
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        if ((s = itf8_decode(fd, &c2.length)) == -1) {
            fd->eof = fd->empty_container ? 1 : 2;
            return NULL;
        } else {
            rd+=s;
        }
    } else {
        if ((s = int32_decode(fd, &c2.length)) == -1) {
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
                CRAM_MINOR_VERS(fd->version) == 0)
                fd->eof = 1; // EOF blocks arrived in v2.1
            else
                fd->eof = fd->empty_container ? 1 : 2;
            return NULL;
        } else {
            rd+=s;
        }
    }
    if ((s = itf8_decode(fd, &c2.ref_seq_id))   == -1) return NULL; else rd+=s;
    if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s;
    if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s;
    if ((s = itf8_decode(fd, &c2.num_records))  == -1) return NULL; else rd+=s;

    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        c2.record_counter = 0;
        c2.num_bases = 0;
    } else {
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
            if ((s = ltf8_decode(fd, &c2.record_counter)) == -1)
                return NULL;
            else
                rd += s;
        } else {
            int32_t i32;
            if ((s = itf8_decode(fd, &i32)) == -1)
                return NULL;
            else
                rd += s;
            c2.record_counter = i32;
        }

        if ((s = ltf8_decode(fd, &c2.num_bases))== -1)
            return NULL;
        else
            rd += s;
    }
    if ((s = itf8_decode(fd, &c2.num_blocks))   == -1) return NULL; else rd+=s;
    if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s;

    if (!(c = calloc(1, sizeof(*c))))
        return NULL;

    *c = c2;

    /* malloc(0) may return NULL; that is only an error if landmarks exist. */
    if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
        c->num_landmarks) {
        fd->err = errno;
        cram_free_container(c);
        return NULL;
    }
    for (i = 0; i < c->num_landmarks; i++) {
        if ((s = itf8_decode(fd, &c->landmark[i])) == -1) {
            cram_free_container(c);
            return NULL;
        } else {
            rd += s;
        }
    }

    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        uint32_t crc;
        /* Worst case for the fixed fields is 4 + 4*5 + 2*9 + 2*5 = 52
         * bytes; 55 matches the allowance in cram_write_container(). */
        unsigned char *dat = malloc(55 + 5*(c->num_landmarks)), *cp = dat;
        if (!dat) {
            cram_free_container(c);
            return NULL;
        }
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
            free(dat);
            cram_free_container(c);
            return NULL;
        }
        rd+=4;

        /* Reencode first as we can't easily access the original byte stream.
         *
         * FIXME: Technically this means this may not be fool proof.  We could
         * create a CRAM file using a 2 byte ITF8 value that can fit in a
         * 1 byte field, meaning the encoding is different to the original
         * form and so has a different CRC.
         *
         * The correct implementation would be to have an alternative form
         * of itf8_decode which also squirrels away the raw byte stream
         * during decoding so we can then CRC that.
         */
        *(unsigned int *)cp = le_int4(c->length); cp += 4;
        cp += itf8_put(cp, c->ref_seq_id);
        cp += itf8_put(cp, c->ref_seq_start);
        cp += itf8_put(cp, c->ref_seq_span);
        cp += itf8_put(cp, c->num_records);
        cp += ltf8_put((char *)cp, c->record_counter);
        /* num_bases was decoded with ltf8_decode() above and is written
         * with ltf8_put() by cram_write_container(), so it has to be
         * re-encoded as LTF8 here for the CRC to match. */
        cp += ltf8_put((char *)cp, c->num_bases);
        cp += itf8_put(cp, c->num_blocks);
        cp += itf8_put(cp, c->num_landmarks);
        for (i = 0; i < c->num_landmarks; i++) {
            cp += itf8_put(cp, c->landmark[i]);
        }

        crc = crc32(0L, dat, cp-dat);
        free(dat); /* scratch buffer is only needed for the CRC */
        if (crc != c->crc32) {
            fprintf(stderr, "Container header CRC32 failure\n");
            cram_free_container(c);
            return NULL;
        }
    }

    c->offset = rd;
    c->slices = NULL;
    c->curr_slice = 0;
    c->max_slice = c->num_landmarks;
    c->slice_rec = 0;
    c->curr_rec = 0;
    c->max_rec = 0;

    if (c->ref_seq_id == -2) {
        c->multi_seq = 1;
        fd->multi_seq = 1;
    }

    fd->empty_container =
        (c->num_records == 0 &&
         c->ref_seq_id == -1 &&
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;

    return c;
}
/*
 * Writes a container structure (header only) to fd->fp.
 *
 * The header is serialised into a stack buffer, or into a heap buffer when
 * the landmark list would not fit.  For CRAM v3 and later a CRC32 of the
 * serialised bytes is stored in c->crc32 and appended little-endian.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_write_container(cram_fd *fd, cram_container *c) {
    char buf_a[1024], *buf = buf_a, *cp;
    int i, ret = 0;

    if (55 + c->num_landmarks * 5 >= 1024) {
        buf = malloc(55 + c->num_landmarks * 5);
        if (!buf)
            return -1; /* was previously dereferenced unchecked */
    }
    cp = buf;

    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        cp += itf8_put(cp, c->length);
    } else {
        *(int32_t *)cp = le_int4(c->length);
        cp += 4;
    }
    if (c->multi_seq) {
        cp += itf8_put(cp, -2);
        cp += itf8_put(cp, 0);
        cp += itf8_put(cp, 0);
    } else {
        cp += itf8_put(cp, c->ref_seq_id);
        cp += itf8_put(cp, c->ref_seq_start);
        cp += itf8_put(cp, c->ref_seq_span);
    }
    cp += itf8_put(cp, c->num_records);
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
        cp += itf8_put(cp, c->record_counter);
        cp += ltf8_put(cp, c->num_bases);
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        cp += ltf8_put(cp, c->record_counter);
        cp += ltf8_put(cp, c->num_bases);
    }

    cp += itf8_put(cp, c->num_blocks);
    cp += itf8_put(cp, c->num_landmarks);
    for (i = 0; i < c->num_landmarks; i++)
        cp += itf8_put(cp, c->landmark[i]);

    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
        cp[0] =  c->crc32        & 0xff;
        cp[1] = (c->crc32 >>  8) & 0xff;
        cp[2] = (c->crc32 >> 16) & 0xff;
        cp[3] = (c->crc32 >> 24) & 0xff;
        cp += 4;
    }

    if (cp-buf != hwrite(fd->fp, buf, cp-buf))
        ret = -1;

    if (buf != buf_a)
        free(buf);

    return ret;
}
/*
 * Common component shared by cram_flush_container{,_mt}.
 *
 * Emits an already encoded container: the container header, its
 * compression header block, then for each of the first curr_slice slices
 * the slice header block followed by that slice's data blocks.  The output
 * handle is flushed afterwards.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
    int si, bi;

    /* Container struct first ... */
    if (cram_write_container(fd, c) != 0)
        return -1;

    /* ... then the compression header ... */
    if (cram_write_block(fd, c->comp_hdr_block) != 0)
        return -1;

    /* ... and finally every slice with its blocks. */
    for (si = 0; si < c->curr_slice; si++) {
        cram_slice *s = c->slices[si];

        if (cram_write_block(fd, s->hdr_block) != 0)
            return -1;

        for (bi = 0; bi < s->hdr->num_blocks; bi++) {
            if (cram_write_block(fd, s->block[bi]) != 0)
                return -1;
        }
    }

    if (hflush(fd->fp) == 0)
        return 0;
    return -1;
}
/*
 * Flushes a completely or partially full container to disk: the container
 * is first run through cram_encode_container() and, if that succeeds,
 * written out with cram_flush_container2().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush_container(cram_fd *fd, cram_container *c) {
    int encode_rc = cram_encode_container(fd, c);

    if (encode_rc != 0)
        return -1;

    return cram_flush_container2(fd, c);
}
/*
 * Work item for the threaded flush path.  cram_flush_container_mt() fills
 * one in and dispatches it to cram_flush_thread(); cram_flush_result()
 * reads it back from the result queue.
 */
typedef struct {
cram_fd *fd;        /* file the container belongs to */
cram_container *c;  /* container to encode and then write */
} cram_job;
/*
 * Thread pool entry point: encodes the container described by the cram_job
 * passed in 'arg' (blocks plus compression header).
 *
 * Returns 'arg' on success;
 *         NULL if encoding failed.
 */
void *cram_flush_thread(void *arg) {
    cram_job *job = arg;

    if (cram_encode_container(job->fd, job->c) == 0)
        return arg;

    fprintf(stderr, "cram_encode_container failed\n");
    return NULL;
}
/*
 * Drains the result queue of 'fd': each finished cram_job has its encoded
 * container written with cram_flush_container2() and then freed, together
 * with its slices.
 *
 * A result with no job data means the worker failed (cram_flush_thread()
 * returns NULL in that case) and is reported as an error.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int cram_flush_result(cram_fd *fd) {
    int i, ret = 0;
    t_pool_result *r;

    while ((r = t_pool_next_result(fd->rqueue))) {
        cram_job *j = (cram_job *)r->data;
        cram_container *c;

        if (!j) {
            t_pool_delete_result(r, 0);
            return -1;
        }

        fd = j->fd;
        c = j->c;

        if (0 != cram_flush_container2(fd, c)) {
            /* Release the result and its job instead of leaking them.
             * The container is deliberately left alone: this function
             * cannot tell whether the caller still refers to it. */
            t_pool_delete_result(r, 1);
            return -1;
        }

        /* Free the container */
        for (i = 0; i < c->max_slice; i++) {
            cram_free_slice(c->slices[i]);
            c->slices[i] = NULL;
        }

        c->slice = NULL;
        c->curr_slice = 0;

        cram_free_container(c);

        ret |= hflush(fd->fp) == 0 ? 0 : -1;

        /* Second argument 1: presumably also frees r->data (the job),
         * as opposed to the 0 used above when there is none. */
        t_pool_delete_result(r, 1);
    }

    return ret;
}
/*
 * Flushes a container using the thread pool when fd->pool is set: the
 * encode step is dispatched as a cram_job and any results already waiting
 * on fd->rqueue are written out.  Without a pool this simply calls
 * cram_flush_container().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
    cram_job *job;

    if (fd->pool == NULL)
        return cram_flush_container(fd, c);

    job = malloc(sizeof(*job));
    if (job == NULL)
        return -1;

    job->fd = fd;
    job->c  = c;
    t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, job);

    return cram_flush_result(fd);
}
/* ----------------------------------------------------------------------
* Compression headers; the first part of the container
*/
/*
 * Creates a new blank container compression header with its TD block,
 * TD hash and TD key pool allocated.  Anything already created is released
 * again if a later allocation fails.
 *
 * Returns header ptr on success
 *         NULL on failure
 */
cram_block_compression_hdr *cram_new_compression_header(void) {
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));

    if (hdr == NULL)
        return NULL;

    hdr->TD_blk = cram_new_block(CORE, 0);
    if (hdr->TD_blk == NULL)
        goto fail_hdr;

    hdr->TD_hash = kh_init(m_s2i);
    if (hdr->TD_hash == NULL)
        goto fail_blk;

    hdr->TD_keys = string_pool_create(8192);
    if (hdr->TD_keys == NULL)
        goto fail_hash;

    return hdr;

    /* Unwind in reverse order of construction. */
 fail_hash:
    kh_destroy(m_s2i, hdr->TD_hash);
 fail_blk:
    cram_free_block(hdr->TD_blk);
 fail_hdr:
    free(hdr);
    return NULL;
}
/*
 * Releases a compression header and everything hanging off it: the
 * landmark array, preservation map, both encoding map hash tables
 * (including each entry's codec), the per data-series codecs, TL, and the
 * TD block / hash / key pool.
 *
 * Passing NULL is a no-op, matching cram_free_container() and
 * cram_free_slice_header().
 */
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
    int i;

    if (!hdr)
        return;

    free(hdr->landmark);

    if (hdr->preservation_map)
        kh_destroy(map, hdr->preservation_map);

    /* Each hash bucket is a singly linked chain of cram_map entries. */
    for (i = 0; i < CRAM_MAP_HASH; i++) {
        cram_map *m, *m2;
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
            m2 = m->next; /* fetch before m is freed */
            if (m->codec)
                m->codec->free(m->codec);
            free(m);
        }
    }

    for (i = 0; i < CRAM_MAP_HASH; i++) {
        cram_map *m, *m2;
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
            m2 = m->next;
            if (m->codec)
                m->codec->free(m->codec);
            free(m);
        }
    }

    for (i = 0; i < DS_END; i++) {
        if (hdr->codecs[i])
            hdr->codecs[i]->free(hdr->codecs[i]);
    }

    free(hdr->TL);

    if (hdr->TD_blk)
        cram_free_block(hdr->TD_blk);
    if (hdr->TD_hash)
        kh_destroy(m_s2i, hdr->TD_hash);
    if (hdr->TD_keys)
        string_pool_destroy(hdr->TD_keys);

    free(hdr);
}
/* ----------------------------------------------------------------------
* Slices and slice headers
*/
/* Frees a slice header and its block_content_ids array; NULL is accepted. */
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
    if (hdr != NULL) {
        free(hdr->block_content_ids); /* free(NULL) is a no-op */
        free(hdr);
    }
}
/*
 * Releases a slice together with everything it owns: the header block, the
 * data blocks listed by the slice header, the block_by_id lookup table,
 * the slice header, the per-slice scratch blocks, plain arrays (cigar,
 * crecs, features, TN) and the pair key pool and hashes.
 *
 * Passing NULL is a no-op.
 */
void cram_free_slice(cram_slice *s) {
    if (s == NULL)
        return;

    if (s->hdr_block)
        cram_free_block(s->hdr_block);

    if (s->block) {
        /* Without a header the number of blocks is unknown, so only the
         * array itself can be released. */
        if (s->hdr) {
            int i;
            for (i = 0; i < s->hdr->num_blocks; i++)
                cram_free_block(s->block[i]);
        }
        free(s->block);
    }

    free(s->block_by_id);

    if (s->hdr)
        cram_free_slice_header(s->hdr);

    {
        /* Scratch blocks, released in their original order. */
        cram_block *scratch[] = {
            s->seqs_blk,   s->qual_blk,   s->name_blk,   s->aux_blk,
            s->aux_OQ_blk, s->aux_BQ_blk, s->aux_FZ_blk, s->aux_oq_blk,
            s->aux_os_blk, s->aux_oz_blk, s->base_blk,   s->soft_blk,
        };
        size_t k;

        for (k = 0; k < sizeof(scratch) / sizeof(scratch[0]); k++) {
            if (scratch[k])
                cram_free_block(scratch[k]);
        }
    }

    free(s->cigar);
    free(s->crecs);
    free(s->features);
    free(s->TN);

    if (s->pair_keys)
        string_pool_destroy(s->pair_keys);

    if (s->pair[0])
        kh_destroy(m_s2i, s->pair[0]);
    if (s->pair[1])
        kh_destroy(m_s2i, s->pair[1]);

    free(s);
}
/*
 * Creates a new empty slice in memory, for subsequent writing to
 * disk.
 *
 * type  - stored as the slice header's content_type.
 * nrecs - number of cram_record slots allocated in crecs[].
 *
 * A partially built slice is released with cram_free_slice() on failure.
 *
 * Returns cram_slice ptr on success
 *         NULL on failure
 */
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
    cram_slice *s = calloc(1, sizeof(*s));

    if (s == NULL)
        return NULL;

    s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr));
    if (s->hdr == NULL)
        goto err;
    s->hdr->content_type = type;

    s->hdr_block = NULL;
    s->block = NULL;
    s->block_by_id = NULL;
    s->last_apos = 0;

    s->crecs = malloc(nrecs * sizeof(cram_record));
    if (s->crecs == NULL)
        goto err;

    s->cigar = NULL;
    s->cigar_alloc = 0;
    s->ncigar = 0;

    /* Per-slice scratch blocks, one per external data stream. */
    s->seqs_blk = cram_new_block(EXTERNAL, 0);
    if (s->seqs_blk == NULL)
        goto err;

    s->qual_blk = cram_new_block(EXTERNAL, DS_QS);
    if (s->qual_blk == NULL)
        goto err;

    s->name_blk = cram_new_block(EXTERNAL, DS_RN);
    if (s->name_blk == NULL)
        goto err;

    s->aux_blk = cram_new_block(EXTERNAL, DS_aux);
    if (s->aux_blk == NULL)
        goto err;

    s->base_blk = cram_new_block(EXTERNAL, DS_IN);
    if (s->base_blk == NULL)
        goto err;

    s->soft_blk = cram_new_block(EXTERNAL, DS_SC);
    if (s->soft_blk == NULL)
        goto err;

    s->features = NULL;
    s->nfeatures = s->afeatures = 0;

#ifndef TN_external
    s->TN = NULL;
    s->nTN = s->aTN = 0;
#endif

    // Volatile keys as we do realloc in dstring
    s->pair_keys = string_pool_create(8192);
    if (s->pair_keys == NULL)
        goto err;

    s->pair[0] = kh_init(m_s2i);
    if (s->pair[0] == NULL)
        goto err;

    s->pair[1] = kh_init(m_s2i);
    if (s->pair[1] == NULL)
        goto err;

#ifdef BA_external
    s->BA_len = 0;
#endif

    return s;

 err:
    cram_free_slice(s);
    return NULL;
}
/*
 * Loads an entire slice: the slice header block followed by the number of
 * data blocks the decoded header announces.  If every EXTERNAL block has a
 * content id in 0..1023 a direct lookup table (block_by_id) is built too.
 *
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
 * as slices contain references to objects in other slices.
 * To work around this while keeping the slice oriented outer loop
 * we read all slices and stitch them together into a fake large
 * slice instead.
 *
 * Returns cram_slice ptr on success
 *         NULL on failure
 */
cram_slice *cram_read_slice(cram_fd *fd) {
    cram_block *b = cram_read_block(fd);
    cram_slice *s = calloc(1, sizeof(*s));
    int i, n, max_id, min_id;

    if (!b || !s)
        goto err;

    s->hdr_block = b;

    /* Only slice header blocks are acceptable at this point. */
    if (b->content_type != MAPPED_SLICE &&
        b->content_type != UNMAPPED_SLICE) {
        fprintf(stderr, "Unexpected block of type %s\n",
                cram_content_type2str(b->content_type));
        goto err;
    }

    s->hdr = cram_decode_slice_header(fd, b);
    if (!s->hdr)
        goto err;

    n = s->hdr->num_blocks;
    s->block = calloc(n, sizeof(*s->block));
    if (!s->block)
        goto err;

    /* Read the data blocks, tracking the range of external content ids. */
    max_id = 0;
    min_id = INT_MAX;
    for (i = 0; i < n; i++) {
        cram_block *blk = cram_read_block(fd);

        s->block[i] = blk;
        if (!blk)
            goto err;

        if (blk->content_type != EXTERNAL)
            continue;

        if (blk->content_id > max_id)
            max_id = blk->content_id;
        if (blk->content_id < min_id)
            min_id = blk->content_id;
    }

    if (min_id >= 0 && max_id < 1024) {
        s->block_by_id = calloc(1024, sizeof(s->block[0]));
        if (!s->block_by_id)
            goto err;

        for (i = 0; i < n; i++) {
            if (s->block[i]->content_type == EXTERNAL)
                s->block_by_id[s->block[i]->content_id] = s->block[i];
        }
    }

    /* Initialise encoding/decoding tables */
    s->cigar = NULL;
    s->cigar_alloc = 0;
    s->ncigar = 0;

    s->seqs_blk = cram_new_block(EXTERNAL, 0);
    if (!s->seqs_blk)
        goto err;
    s->qual_blk = cram_new_block(EXTERNAL, DS_QS);
    if (!s->qual_blk)
        goto err;
    s->name_blk = cram_new_block(EXTERNAL, DS_RN);
    if (!s->name_blk)
        goto err;
    s->aux_blk = cram_new_block(EXTERNAL, DS_aux);
    if (!s->aux_blk)
        goto err;
    s->base_blk = cram_new_block(EXTERNAL, DS_IN);
    if (!s->base_blk)
        goto err;
    s->soft_blk = cram_new_block(EXTERNAL, DS_SC);
    if (!s->soft_blk)
        goto err;

    s->crecs = NULL;

    s->last_apos = s->hdr->ref_seq_start;

    return s;

 err:
    /* The header block is freed here directly, so it is detached from the
     * slice first to stop cram_free_slice() freeing it again. */
    if (b)
        cram_free_block(b);
    if (s) {
        s->hdr_block = NULL;
        cram_free_slice(s);
    }
    return NULL;
}
/* ----------------------------------------------------------------------
* CRAM file definition (header)
*/
/*
 * Reads a CRAM file definition structure: 26 bytes are read into the
 * structure starting at its magic field, the "CRAM" magic is verified and
 * major versions above 3 are rejected.  On success fd->first_container is
 * advanced by 26 and fd->last_slice reset.
 *
 * Returns file_def ptr on success
 *         NULL on failure
 */
cram_file_def *cram_read_file_def(cram_fd *fd) {
    cram_file_def *def = malloc(sizeof(*def));

    if (def == NULL)
        return NULL;

    if (hread(fd->fp, &def->magic[0], 26) != 26 ||
        memcmp(def->magic, "CRAM", 4) != 0)
        goto bad;

    if (def->major_version > 3) {
        fprintf(stderr, "CRAM version number mismatch\n"
                "Expected 1.x, 2.x or 3.x, got %d.%d\n",
                def->major_version, def->minor_version);
        goto bad;
    }

    fd->first_container += 26;
    fd->last_slice = 0;

    return def;

 bad:
    free(def);
    return NULL;
}
/*
 * Writes a cram_file_def structure to cram_fd: the 26 bytes starting at
 * def->magic are written to fd->fp.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
    if (hwrite(fd->fp, &def->magic[0], 26) != 26)
        return -1;

    return 0;
}
/* Frees a file definition; NULL is accepted since free(NULL) is a no-op. */
void cram_free_file_def(cram_file_def *def) {
    free(def);
}
/* ----------------------------------------------------------------------
* SAM header I/O
*/
/*
 * Reads the SAM header text and parses it with sam_hdr_parse_().
 *
 * CRAM 1.x stores a 32-bit length followed by the raw text.  Later
 * versions store the header in the first block of a dedicated container;
 * any further blocks and trailing padding in that container are read and
 * discarded so the stream is left at the next container.
 *
 * A negative stored length is rejected, and every error path releases the
 * buffers allocated so far.
 *
 * Returns SAM hdr ptr on success
 *         NULL on failure
 */
SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
    int32_t header_len;
    char *header;
    SAM_hdr *hdr;

    /* 1.1 onwards stores the header in the first block of a container */
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
        /* Length */
        if (-1 == int32_decode(fd, &header_len))
            return NULL;

        /* The length comes from the file; never trust it to be sane. */
        if (header_len < 0)
            return NULL;

        /* Alloc and read */
        if (NULL == (header = malloc((size_t)header_len + 1)))
            return NULL;

        *header = 0;
        if (header_len != hread(fd->fp, header, header_len)) {
            free(header);
            return NULL;
        }
        header[header_len] = '\0';

        fd->first_container += 4 + header_len;
    } else {
        cram_container *c = cram_read_container(fd);
        cram_block *b;
        int i, len;

        if (!c)
            return NULL;

        if (c->num_blocks < 1) {
            cram_free_container(c);
            return NULL;
        }

        if (!(b = cram_read_block(fd))) {
            cram_free_container(c);
            return NULL;
        }
        /* NOTE(review): the result of cram_uncompress_block() is not
         * checked here - confirm whether it can report failure. */
        cram_uncompress_block(b);

        /* Running total of bytes consumed from the container so far. */
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
            itf8_size(b->content_id) +
            itf8_size(b->uncomp_size) +
            itf8_size(b->comp_size);

        /* Extract header from 1st block */
        if (-1 == int32_get(b, &header_len) ||
            header_len < 0 ||
            b->uncomp_size - 4 < header_len) {
            cram_free_container(c);
            cram_free_block(b);
            return NULL;
        }
        if (NULL == (header = malloc((size_t)header_len + 1))) {
            cram_free_container(c);
            cram_free_block(b);
            return NULL;
        }
        memcpy(header, BLOCK_END(b), header_len);
        header[header_len]='\0';
        cram_free_block(b);

        /* Consume any remaining blocks */
        for (i = 1; i < c->num_blocks; i++) {
            if (!(b = cram_read_block(fd))) {
                free(header);
                cram_free_container(c);
                return NULL;
            }
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
                itf8_size(b->content_id) +
                itf8_size(b->uncomp_size) +
                itf8_size(b->comp_size);
            cram_free_block(b);
        }

        if (c->length && c->length > len) {
            // Consume padding
            char *pads = malloc(c->length - len);
            if (!pads) {
                free(header);
                cram_free_container(c);
                return NULL;
            }

            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
                free(pads);
                free(header);
                cram_free_container(c);
                return NULL;
            }
            free(pads);
        }

        cram_free_container(c);
    }

    /* Parse */
    hdr = sam_hdr_parse_(header, header_len);
    free(header);

    return hdr;
}
/*
 * Converts 'in' to a full pathname to store in out.
 * Out must be at least PATH_MAX bytes long.
 *
 * A relative 'in' is prefixed with the current working directory.  If 'in'
 * is already absolute, the working directory cannot be obtained, or the
 * combined path would not fit, 'in' is copied unchanged (truncated to
 * PATH_MAX-1 characters if necessary).
 */
static void full_path(char *out, char *in) {
    size_t cwd_len;

    if (*in != '/' && getcwd(out, PATH_MAX)) {
        cwd_len = strlen(out);

        if (cwd_len + 1 + strlen(in) < PATH_MAX) {
            sprintf(out + cwd_len, "/%.*s", (int)(PATH_MAX - cwd_len), in);
            // FIXME: cope with `pwd`/../../../foo.fa ?
            return;
        }
    }

    /* Absolute input, or no usable working directory: plain bounded copy. */
    strncpy(out, in, PATH_MAX);
    out[PATH_MAX-1] = 0;
}
/*
 * Writes a CRAM SAM header.
 *
 * Steps, in order:
 *  - writes the file definition first if fd->file_def has no major version
 *    recorded yet;
 *  - for CRAM 1.x, adds an "UNKNOWN" read-group when the header has none;
 *  - when references are available and fd->no_ref is not set, fills in a
 *    missing M5 tag on each @SQ line from the MD5 of the loaded reference
 *    and, if fd->ref_fn is set, updates UR to its full path;
 *  - rebuilds the header text and writes it, either as length + text
 *    (CRAM 1.x) or as a block inside a container with padding;
 *  - refreshes fd->refs from fd->header and flushes the output.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
int header_len;
/* v3+ pads with a second, blank block rather than padding the header block */
int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
/* Write CRAM MAGIC if not yet written. */
if (fd->file_def->major_version == 0) {
fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
if (0 != cram_write_file_def(fd, fd->file_def))
return -1;
}
/* 1.0 requires an UNKNOWN read-group */
if (CRAM_MAJOR_VERS(fd->version) == 1) {
if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
if (sam_hdr_add(hdr, "RG",
"ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
return -1;
}
/* Fix M5 strings */
if (fd->refs && !fd->no_ref) {
int i;
for (i = 0; i < hdr->nref; i++) {
SAM_hdr_type *ty;
char *ref;
if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
return -1;
if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
/* buf: raw 16-byte digest; buf2: 32 hex digits plus NUL */
char unsigned buf[16], buf2[33];
int j, rlen;
MD5_CTX md5;
if (!fd->refs ||
!fd->refs->ref_id ||
!fd->refs->ref_id[i]) {
return -1;
}
rlen = fd->refs->ref_id[i]->length;
MD5_Init(&md5);
ref = cram_get_ref(fd, i, 1, rlen);
if (NULL == ref) return -1;
rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
MD5_Update(&md5, ref, rlen);
MD5_Final(buf, &md5);
/* Give back the reference taken by cram_get_ref() above. */
cram_ref_decr(fd->refs, i);
/* Render the digest as lower-case hex. */
for (j = 0; j < 16; j++) {
buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
}
buf2[32] = 0;
if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
return -1;
}
if (fd->ref_fn) {
char ref_fn[PATH_MAX];
full_path(ref_fn, fd->ref_fn);
if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
return -1;
}
}
}
if (sam_hdr_rebuild(hdr))
return -1;
/* Length */
header_len = sam_hdr_length(hdr);
if (CRAM_MAJOR_VERS(fd->version) == 1) {
if (-1 == int32_encode(fd, header_len))
return -1;
/* Text data */
if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
return -1;
} else {
/* Create block(s) inside a container */
cram_block *b = cram_new_block(FILE_HEADER, 0);
cram_container *c = cram_new_container(0, 0);
int padded_length;
char *pads;
int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
if (!b || !c) {
if (b) cram_free_block(b);
if (c) cram_free_container(c);
return -1;
}
/* Block payload: 32-bit header length followed by the header text. */
int32_put(b, header_len);
BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
BLOCK_UPLEN(b);
// Compress header block if V3.0 and above
/* NOTE(review): the next three lines look truncated in this copy of the
 * file (text between '<' and a later '>' appears to be missing); restore
 * them from the upstream source before building. */
if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
int method = 1<use_bz2)
method |= 1<use_lzma)
method |= 1<level);
}
if (blank_block) {
/* Size of the header block as it will appear on disk. */
c->length = b->comp_size + 2 + 4*is_cram_3 +
itf8_size(b->content_id) +
itf8_size(b->uncomp_size) +
itf8_size(b->comp_size);
c->num_blocks = 2;
c->num_landmarks = 2;
if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
c->landmark[0] = 0;
c->landmark[1] = c->length;
// Plus extra storage for uncompressed secondary blank block
padded_length = MIN(c->length*.5, 10000);
c->length += padded_length + 2 + 4*is_cram_3 +
itf8_size(b->content_id) +
itf8_size(padded_length)*2;
} else {
// Pad the block instead.
c->num_blocks = 1;
c->num_landmarks = 1;
/* NOTE(review): unlike the sibling error paths, this return does not
 * free b and c. */
if (!(c->landmark = malloc(sizeof(*c->landmark))))
return -1;
c->landmark[0] = 0;
/* NOTE(review): c->length has not been assigned in this branch yet when
 * it is read here - confirm the intended padding size. */
padded_length = MAX(c->length*1.5, 10000) - c->length;
c->length = b->comp_size + padded_length +
2 + 4*is_cram_3 +
itf8_size(b->content_id) +
itf8_size(b->uncomp_size) +
itf8_size(b->comp_size);
if (NULL == (pads = calloc(1, padded_length))) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
BLOCK_APPEND(b, pads, padded_length);
BLOCK_UPLEN(b);
free(pads);
}
if (-1 == cram_write_container(fd, c)) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
if (-1 == cram_write_block(fd, b)) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
if (blank_block) {
/* Reuse b as the zero-filled, uncompressed padding block. */
BLOCK_RESIZE(b, padded_length);
memset(BLOCK_DATA(b), 0, padded_length);
BLOCK_SIZE(b) = padded_length;
BLOCK_UPLEN(b);
b->method = RAW;
if (-1 == cram_write_block(fd, b)) {
cram_free_block(b);
cram_free_container(c);
return -1;
}
}
cram_free_block(b);
cram_free_container(c);
}
/* Re-derive the reference tables from fd->header (note: not 'hdr'). */
if (-1 == refs_from_header(fd->refs, fd, fd->header))
return -1;
if (-1 == refs2id(fd->refs, fd->header))
return -1;
if (0 != hflush(fd->fp))
return -1;
RP("=== Finishing saving header ===\n");
return 0;
}
/* ----------------------------------------------------------------------
* The top-level cram opening, closing and option handling
*/
/*
 * Fills in the lookup tables stored inside the cram_fd. They could have been
 * global statics, but building them per file handle avoids awkward set-up
 * in a multi-threaded environment.
 *
 * Tables written: L1, L2, bam_flag_swap, cram_flag_swap and cram_sub_matrix.
 * The flag tables depend on fd->version, so that must be set beforehand.
 */
static void cram_init_tables(cram_fd *fd) {
    static const char upper[] = "ACGTN";
    static const char lower[] = "acgtn";
    /* Pairs of equivalent flag bits in the CRAM and BAM numbering. */
    const struct {
        int cram_bit;
        int bam_bit;
    } flag_pairs[] = {
        { CRAM_FPAIRED,      BAM_FPAIRED      },
        { CRAM_FPROPER_PAIR, BAM_FPROPER_PAIR },
        { CRAM_FUNMAP,       BAM_FUNMAP       },
        { CRAM_FREVERSE,     BAM_FREVERSE     },
        { CRAM_FREAD1,       BAM_FREAD1       },
        { CRAM_FREAD2,       BAM_FREAD2       },
        { CRAM_FSECONDARY,   BAM_FSECONDARY   },
        { CRAM_FQCFAIL,      BAM_FQCFAIL      },
        { CRAM_FDUP,         BAM_FDUP         },
    };
    const int npairs = sizeof(flag_pairs) / sizeof(flag_pairs[0]);
    int idx, k;

    /* L1: A,C,G,T (either case) -> 0..3, every other byte -> 4. */
    memset(fd->L1, 4, 256);
    for (k = 0; k < 4; k++) {
        fd->L1[(unsigned char)upper[k]] = k;
        fd->L1[(unsigned char)lower[k]] = k;
    }

    /* L2: A,C,G,T,N (either case) -> 0..4, every other byte -> 5. */
    memset(fd->L2, 5, 256);
    for (k = 0; k < 5; k++) {
        fd->L2[(unsigned char)upper[k]] = k;
        fd->L2[(unsigned char)lower[k]] = k;
    }

    if (CRAM_MAJOR_VERS(fd->version) != 1) {
        /* NOP: both flag tables are the identity mapping. */
        for (idx = 0; idx < 0x1000; idx++)
            fd->bam_flag_swap[idx] = idx;
        for (idx = 0; idx < 0x1000; idx++)
            fd->cram_flag_swap[idx] = idx;
    } else {
        /* Version 1: translate bit by bit between the two numberings. */
        for (idx = 0; idx < 0x200; idx++) {
            int bam = 0;
            for (k = 0; k < npairs; k++) {
                if (idx & flag_pairs[k].cram_bit)
                    bam |= flag_pairs[k].bam_bit;
            }
            fd->bam_flag_swap[idx] = bam;
        }
        for (idx = 0; idx < 0x1000; idx++) {
            int cram = 0;
            for (k = 0; k < npairs; k++) {
                if (idx & flag_pairs[k].bam_bit)
                    cram |= flag_pairs[k].cram_bit;
            }
            fd->cram_flag_swap[idx] = cram;
        }
    }

    /* Substitution matrix: start with 4 everywhere, then give every row the
     * plain A,C,G,T,N -> 0..4 column codes. */
    memset(fd->cram_sub_matrix, 4, 32*32);
    for (idx = 0; idx < 32; idx++) {
        for (k = 0; k < 5; k++)
            fd->cram_sub_matrix[idx][upper[k] & 0x1f] = k;
    }

    /* For each of the five reference-base rows: reset columns 0..19 to 3 and
     * then number the four entries of that base's CRAM_SUBST_MATRIX group
     * 0..3, in order. */
    for (idx = 0; idx < 5; idx++) {
        int row = upper[idx] & 0x1f;
        int col;

        for (col = 0; col < 20; col++)
            fd->cram_sub_matrix[row][col] = 3;

        for (k = 0; k < 4; k++)
            fd->cram_sub_matrix[row][CRAM_SUBST_MATRIX[4*idx + k] & 0x1f] = k;
    }
}
// Default version numbers for CRAM
// cram_dopen() combines these as major*256 + minor to seed fd->version for
// a file opened for writing.
static int major_version = 2;
static int minor_version = 1;
/*
 * Opens a CRAM file for read (mode "rb") or write ("wb").
 * The filename may be "-" to indicate stdin or stdout.
 *
 * Only the first character of 'mode' plus an optional binary marker are
 * handed to hopen(); the full mode string is forwarded to cram_dopen().
 *
 * Returns file handle on success
 *         NULL on failure.
 */
cram_fd *cram_open(const char *filename, const char *mode) {
    char fmode[3];
    hFILE *stream;
    cram_fd *cram;

    fmode[0] = mode[0];
    fmode[1] = '\0';
    fmode[2] = '\0';

    /* A second character of 'b' or 'c' both request a binary stream. */
    if (strlen(mode) > 1) {
        switch (mode[1]) {
        case 'b':
        case 'c':
            fmode[1] = 'b';
            break;
        default:
            break;
        }
    }

    stream = hopen(filename, fmode);
    if (stream == NULL)
        return NULL;

    cram = cram_dopen(stream, filename, mode);
    if (cram == NULL)
        hclose_abruptly(stream);

    return cram;
}
/* Opens an existing stream for reading or writing.
 *
 * fp       - already-open stream; it is stored in the returned handle. On
 *            failure it is NOT closed here (cram_open() closes it itself).
 * filename - used for the default read-name prefix (text after the last
 *            '/') and, when writing, copied into the file definition's
 *            20-byte file_id field.
 * mode     - first character selects reader ('r') or writer (anything
 *            else); the first decimal digit found, if any, sets the
 *            compression level (default 5).
 *
 * Returns file handle on success;
 *         NULL on failure, with everything allocated here released.
 */
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
    int i;
    char *cp;
    cram_fd *fd = calloc(1, sizeof(*fd));
    if (!fd)
        return NULL;

    fd->level = 5;
    for (i = 0; mode[i]; i++) {
        if (mode[i] >= '0' && mode[i] <= '9') {
            fd->level = mode[i] - '0';
            break;
        }
    }

    fd->fp = fp;
    fd->mode = *mode;
    fd->first_container = 0;

    if (fd->mode == 'r') {
        /* Reader */
        if (!(fd->file_def = cram_read_file_def(fd)))
            goto err;

        fd->version = fd->file_def->major_version * 256 +
            fd->file_def->minor_version;

        if (!(fd->header = cram_read_SAM_hdr(fd)))
            goto err;
    } else {
        /* Writer */
        cram_file_def *def = calloc(1, sizeof(*def));
        if (!def)
            goto err;   /* previously returned directly and leaked fd */

        fd->file_def = def;

        def->magic[0] = 'C';
        def->magic[1] = 'R';
        def->magic[2] = 'A';
        def->magic[3] = 'M';
        def->major_version = 0; // Indicator to write file def later.
        def->minor_version = 0;
        /* Fixed-width field: zero filled, and left without a terminating
         * NUL when filename is 20 characters or longer. */
        memset(def->file_id, 0, 20);
        strncpy(def->file_id, filename, 20);

        fd->version = major_version * 256 + minor_version;

        /* SAM header written later along with this file_def */
    }

    cram_init_tables(fd);

    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
    if (!fd->prefix)
        goto err;
    fd->first_base = fd->last_base = -1;
    fd->record_counter = 0;

    fd->ctr = NULL;
    fd->refs = refs_create();
    if (!fd->refs)
        goto err;
    fd->ref_id = -2;
    fd->ref = NULL;

    fd->decode_md = 0;
    fd->verbose = 0;
    fd->seqs_per_slice = SEQS_PER_SLICE;
    fd->slices_per_container = SLICE_PER_CNT;
    fd->embed_ref = 0;
    fd->no_ref = 0;
    fd->ignore_md5 = 0;
    fd->use_bz2 = 0;
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
    fd->use_lzma = 0;
    fd->multi_seq = -1;
    fd->unsorted = 0;
    fd->shared_ref = 0;

    fd->index = NULL;
    fd->own_pool = 0;
    fd->pool = NULL;
    fd->rqueue = NULL;
    fd->job_pending = NULL;
    fd->ooc = 0;
    fd->required_fields = INT_MAX;

    for (i = 0; i < DS_END; i++) {
        if (!(fd->m[i] = cram_new_metrics()))
            goto err;
    }

    fd->range.refid = -2; // no ref.
    fd->eof = 1; // See samtools issue #150
    fd->ref_fn = NULL;

    fd->bl = NULL;

    /* Initialise dummy refs from the @SQ headers */
    if (-1 == refs_from_header(fd->refs, fd, fd->header))
        goto err;

    return fd;

 err:
    /* fd came from calloc(), so members not reached yet are still NULL and
     * the clean-up below is safe from any failure point. The stream itself
     * is left open for the caller to deal with. */
    if (fd->file_def)
        cram_free_file_def(fd->file_def);
    if (fd->header)
        sam_hdr_free(fd->header);
    free(fd->prefix);
    if (fd->refs)
        refs_free(fd->refs);
    for (i = 0; i < DS_END; i++)
        free(fd->m[i]);
    free(fd);

    return NULL;
}
/*
 * Seek within a CRAM file.
 *
 * A real seek is tried first. When that fails and the request is a forward
 * relative one (SEEK_CUR with a non-negative offset), the bytes are read
 * and discarded instead.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd *fd, off_t offset, int whence) {
    char scratch[65536];
    off_t remaining;

    fd->ooc = 0;

    if (hseek(fd->fp, offset, whence) >= 0)
        return 0;

    /* Only a forward relative seek can be emulated by reading. */
    if (whence != SEEK_CUR || offset < 0)
        return -1;

    /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
    for (remaining = offset; remaining > 0; ) {
        int chunk = MIN(65536, remaining);

        if (hread(fd->fp, scratch, chunk) != chunk)
            return -1;
        remaining -= chunk;
    }

    return 0;
}
/*
 * Flushes a CRAM file.
 * Useful for when writing to stdout without wishing to close the stream.
 *
 * Nothing is done (and 0 is returned) unless the file is open for writing
 * and has a current container.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd *fd) {
    if (fd == NULL)
        return -1;

    if (fd->mode != 'w' || fd->ctr == NULL)
        return 0;

    /* Count the slice currently being filled before flushing. */
    if (fd->ctr->slice)
        fd->ctr->curr_slice++;

    return (cram_flush_container_mt(fd, fd->ctr) == -1) ? -1 : 0;
}
/*
* Closes a CRAM file.
*
* In write mode any pending container is flushed and an EOF block is
* written before the stream is closed; the handle and the objects it owns
* are then released.
*
* NOTE(review): each early "return -1" below exits before the final
* free(fd), so on failure the handle and its resources are not released
* by this function.
*
* Returns 0 on success
* -1 on failure
*/
int cram_close(cram_fd *fd) {
spare_bams *bl, *next;
int i;
if (!fd)
return -1;
/* Writer with a current container: count the open slice and flush. */
if (fd->mode == 'w' && fd->ctr) {
if(fd->ctr->slice)
fd->ctr->curr_slice++;
if (-1 == cram_flush_container_mt(fd, fd->ctr))
return -1;
}
/* Threaded: flush the pool, collect results, then tear down the locks
* and the results queue. */
if (fd->pool) {
t_pool_flush(fd->pool);
if (0 != cram_flush_result(fd))
return -1;
pthread_mutex_destroy(&fd->metrics_lock);
pthread_mutex_destroy(&fd->ref_lock);
pthread_mutex_destroy(&fd->bam_list_lock);
fd->ctr = NULL; // prevent double freeing
//fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
t_results_queue_destroy(fd->rqueue);
}
if (fd->mode == 'w') {
/* Write EOF block */
/* Fixed byte sequences: 38 bytes for major version 3 (includes the
* CRC32 fields marked below), 30 bytes for any other version. */
if (CRAM_MAJOR_VERS(fd->version) == 3) {
if (38 != hwrite(fd->fp,
"\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
"\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
"\x00\x01\x00" // Cont HDR
"\x05\xbd\xd9\x4f" // CRC32
"\x00\x01\x00\x06\x06" // Comp.HDR blk
"\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
"\xee\x63\x01\x4b", // CRC32
38))
return -1;
} else {
if (30 != hwrite(fd->fp,
"\x0b\x00\x00\x00\xff\xff\xff\xff"
"\x0f\xe0\x45\x4f\x46\x00\x00\x00"
"\x00\x01\x00\x00\x01\x00\x06\x06"
"\x01\x00\x01\x00\x01\x00", 30))
return -1;
}
}
/* Release the list of spare bam arrays; each array is scanned over
* seqs_per_slice * slices_per_container entries. */
for (bl = fd->bl; bl; bl = next) {
/* This 'i' shadows the function-level 'i' declared above. */
int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
next = bl->next;
for (i = 0; i < max_rec; i++) {
if (bl->bams[i])
bam_free(bl->bams[i]);
}
free(bl->bams);
free(bl);
}
/* Close the stream first; a close failure is reported to the caller. */
if (hclose(fd->fp) != 0)
return -1;
if (fd->file_def)
cram_free_file_def(fd->file_def);
if (fd->header)
sam_hdr_free(fd->header);
free(fd->prefix);
if (fd->ctr)
cram_free_container(fd->ctr);
if (fd->refs)
refs_free(fd->refs);
if (fd->ref_free)
free(fd->ref_free);
for (i = 0; i < DS_END; i++)
if (fd->m[i])
free(fd->m[i]);
if (fd->index)
cram_index_free(fd);
/* The pool is destroyed only when own_pool is set on this handle. */
if (fd->own_pool && fd->pool)
t_pool_destroy(fd->pool, 0);
free(fd);
return 0;
}
/*
* Returns 1 if we hit an EOF while reading.
*
* The value is fd->eof, returned unchanged. The prototype's documentation
* in cram_io.h lists 0 (not at end of file), 1 (expected EOF) and 2 (other
* EOF) as the possible results. fd is dereferenced without a NULL check.
*/
int cram_eof(cram_fd *fd) {
return fd->eof;
}
/*
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
 * Use this immediately after opening.
 *
 * Variadic front end: the option's argument(s) are packaged into a va_list
 * and passed on to cram_set_voption().
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
    va_list ap;
    int status;

    va_start(ap, opt);
    status = cram_set_voption(fd, opt, ap);
    va_end(ap);

    return status;
}
/*
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
 * Use this immediately after opening.
 *
 * 'args' must hold the value matching 'opt': an int for the numeric and
 * boolean options, a char * for CRAM_OPT_PREFIX, CRAM_OPT_REFERENCE and
 * CRAM_OPT_VERSION, a refs_t * for CRAM_OPT_SHARED_REF, a cram_range * for
 * CRAM_OPT_RANGE and a t_pool * for CRAM_OPT_THREAD_POOL.
 *
 * A NULL pointer for CRAM_OPT_PREFIX, CRAM_OPT_SHARED_REF, CRAM_OPT_RANGE
 * or CRAM_OPT_VERSION is rejected with -1 instead of being dereferenced.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
    refs_t *refs;

    if (!fd)
        return -1;

    switch (opt) {
    case CRAM_OPT_DECODE_MD:
        fd->decode_md = va_arg(args, int);
        break;

    case CRAM_OPT_PREFIX: {
        const char *prefix = va_arg(args, char *);
        char *copy;

        /* Duplicate first so the existing prefix survives a failure. */
        if (!prefix || !(copy = strdup(prefix)))
            return -1;
        free(fd->prefix);
        fd->prefix = copy;
        break;
    }

    case CRAM_OPT_VERBOSITY:
        fd->verbose = va_arg(args, int);
        break;

    case CRAM_OPT_SEQS_PER_SLICE:
        fd->seqs_per_slice = va_arg(args, int);
        break;

    case CRAM_OPT_SLICES_PER_CONTAINER:
        fd->slices_per_container = va_arg(args, int);
        break;

    case CRAM_OPT_EMBED_REF:
        fd->embed_ref = va_arg(args, int);
        break;

    case CRAM_OPT_NO_REF:
        fd->no_ref = va_arg(args, int);
        break;

    case CRAM_OPT_IGNORE_MD5:
        fd->ignore_md5 = va_arg(args, int);
        break;

    case CRAM_OPT_USE_BZIP2:
        fd->use_bz2 = va_arg(args, int);
        break;

    case CRAM_OPT_USE_RANS:
        fd->use_rans = va_arg(args, int);
        break;

    case CRAM_OPT_USE_LZMA:
        fd->use_lzma = va_arg(args, int);
        break;

    case CRAM_OPT_SHARED_REF:
        refs = va_arg(args, refs_t *);
        if (!refs)
            return -1;      /* would otherwise be dereferenced below */
        fd->shared_ref = 1;
        if (refs != fd->refs) {
            /* Drop our own reference set and take a counted share of
             * the caller's. */
            if (fd->refs)
                refs_free(fd->refs);
            fd->refs = refs;
            fd->refs->count++;
        }
        break;

    case CRAM_OPT_RANGE: {
        cram_range *range = va_arg(args, cram_range *);

        if (!range)
            return -1;
        fd->range = *range;
        return cram_seek_to_refpos(fd, &fd->range);
    }

    case CRAM_OPT_REFERENCE:
        return cram_load_reference(fd, va_arg(args, char *));

    case CRAM_OPT_VERSION: {
        int major, minor;
        char *s = va_arg(args, char *);

        if (!s) {
            fprintf(stderr, "Missing version string\n");
            return -1;
        }
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
            fprintf(stderr, "Malformed version string %s\n", s);
            return -1;
        }
        if (!((major == 1 &&  minor == 0) ||
              (major == 2 && (minor == 0 || minor == 1)) ||
              (major == 3 &&  minor == 0))) {
            fprintf(stderr, "Unknown version string; "
                    "use 1.0, 2.0, 2.1 or 3.0\n");
            return -1;
        }
        fd->version = major*256 + minor;

        /* Selecting version 3 or later also switches rANS on. */
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
            fd->use_rans = 1;
        break;
    }

    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
        fd->multi_seq = va_arg(args, int);
        break;

    case CRAM_OPT_NTHREADS: {
        int nthreads = va_arg(args, int);

        /* A private pool is only created for more than one thread. */
        if (nthreads > 1) {
            if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
                return -1;

            /* NOTE(review): the result of t_results_queue_init() is not
             * checked here. */
            fd->rqueue = t_results_queue_init();
            pthread_mutex_init(&fd->metrics_lock, NULL);
            pthread_mutex_init(&fd->ref_lock, NULL);
            pthread_mutex_init(&fd->bam_list_lock, NULL);
            fd->shared_ref = 1;
            fd->own_pool = 1;
        }
        break;
    }

    case CRAM_OPT_THREAD_POOL:
        /* Caller-supplied pool: own_pool is cleared below. */
        fd->pool = va_arg(args, t_pool *);
        if (fd->pool) {
            fd->rqueue = t_results_queue_init();
            pthread_mutex_init(&fd->metrics_lock, NULL);
            pthread_mutex_init(&fd->ref_lock, NULL);
            pthread_mutex_init(&fd->bam_list_lock, NULL);
        }
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
        fd->own_pool = 0;

        //fd->qsize = 1;
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
        //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
        break;

    case CRAM_OPT_REQUIRED_FIELDS:
        fd->required_fields = va_arg(args, int);
        break;

    default:
        fprintf(stderr, "Unknown CRAM option code %d\n", opt);
        return -1;
    }

    return 0;
}
htslib-1.2.1/cram/cram_io.h 0000664 0000000 0000000 00000041201 12464172677 0015522 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* Include cram.h instead.
*
* This is an internal part of the CRAM system and is automatically included
* when you #include cram.h.
*
* Implements the low level CRAM I/O primitives.
* This includes basic data types such as byte, int, ITF-8,
* maps, bitwise I/O, etc.
*/
#ifndef _CRAM_IO_H_
#define _CRAM_IO_H_
#ifdef __cplusplus
extern "C" {
#endif
#define ITF8_MACROS
#include
#include
/**@{ ----------------------------------------------------------------------
* ITF8 encoding and decoding.
*
* Also see the itf8_get and itf8_put macros.
*/
/*! INTERNAL: Converts two characters into an integer for use in switch{}
 *
 * The result is (a << 8) | b. Neither argument is masked, so a value of 'b'
 * wider than 8 bits overlaps the bits contributed by 'a'.
 */
#define CRAM_KEY(a,b) (((a)<<8)|((b)))
/*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
* *val.
*
* @return
* Returns the number of bytes read on success;
* -1 on failure
*/
int itf8_decode(cram_fd *fd, int32_t *val);
#ifndef ITF8_MACROS
/*! Reads an integer in ITF-8 encoding from 'cp' and stores it in
* *val.
*
* @return
* Returns the number of bytes read on success;
* -1 on failure
*/
int itf8_get(char *cp, int32_t *val_p);
/*! Stores a value to memory in ITF-8 format.
*
* @return
* Returns the number of bytes required to store the number.
* This is a maximum of 5 bytes.
*/
int itf8_put(char *cp, int32_t val);
#else
/*
* Macro implementations of the above
*
* All three are single expressions and expand their arguments several
* times, so arguments with side effects (e.g. "cp++") must not be used.
* The "uc" cast type is not defined in this header - presumably an
* unsigned char typedef; confirm against the header that provides it.
*
* itf8_get(c,v): decodes the value starting at byte buffer c into *v and
* evaluates to the number of bytes consumed (1 to 5). The length is chosen
* by comparing the first byte against 0x80, 0xc0, 0xe0 and 0xf0.
*
* itf8_put(c,v): writes v to byte buffer c and evaluates to the number of
* bytes stored (1 to 5); the length is chosen by testing which bits of v
* are set above 7, 14, 21 and 28 bits.
*
* itf8_size(v): evaluates to the number of bytes itf8_put would store for
* v (1 to 5) without writing anything.
*/
#define itf8_get(c,v) (((uc)(c)[0]<0x80)?(*(v)=(uc)(c)[0],1):(((uc)(c)[0]<0xc0)?(*(v)=(((uc)(c)[0]<<8)|(uc)(c)[1])&0x3fff,2):(((uc)(c)[0]<0xe0)?(*(v)=(((uc)(c)[0]<<16)|((uc)(c)[1]<<8)|(uc)(c)[2])&0x1fffff,3):(((uc)(c)[0]<0xf0)?(*(v)=(((uc)(c)[0]<<24)|((uc)(c)[1]<<16)|((uc)(c)[2]<<8)|(uc)(c)[3])&0x0fffffff,4):(*(v)=(((uc)(c)[0]&0x0f)<<28)|((uc)(c)[1]<<20)|((uc)(c)[2]<<12)|((uc)(c)[3]<<4)|((uc)(c)[4]&0x0f),5)))))
#define itf8_put(c,v) ((!((v)&~0x7f))?((c)[0]=(v),1):(!((v)&~0x3fff))?((c)[0]=((v)>>8)|0x80,(c)[1]=(v)&0xff,2):(!((v)&~0x1fffff))?((c)[0]=((v)>>16)|0xc0,(c)[1]=((v)>>8)&0xff,(c)[2]=(v)&0xff,3):(!((v)&~0xfffffff))?((c)[0]=((v)>>24)|0xe0,(c)[1]=((v)>>16)&0xff,(c)[2]=((v)>>8)&0xff,(c)[3]=(v)&0xff,4):((c)[0]=0xf0|(((v)>>28)&0xff),(c)[1]=((v)>>20)&0xff,(c)[2]=((v)>>12)&0xff,(c)[3]=((v)>>4)&0xff,(c)[4]=(v)&0xf,5))
#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
#endif
int ltf8_get(char *cp, int64_t *val_p);
int ltf8_put(char *cp, int64_t val);
/*! Pushes a value in ITF8 format onto the end of a block.
*
* This shouldn't be used for high-volume data as it is not the fastest
* method.
*
* @return
* Returns the number of bytes written
*/
int itf8_put_blk(cram_block *blk, int val);
/**@}*/
/**@{ ----------------------------------------------------------------------
* CRAM blocks - the dynamically growable data block. We have code to
* create, update, (un)compress and read/write.
*
* These are derived from the deflate_interlaced.c blocks, but with the
* CRAM extension of content types and IDs.
*/
/*! Allocates a new cram_block structure with a specified content_type and
* id.
*
* @return
* Returns block pointer on success;
* NULL on failure
*/
cram_block *cram_new_block(enum cram_content_type content_type,
int content_id);
/*! Reads a block from a cram file.
*
* @return
* Returns cram_block pointer on success;
* NULL on failure
*/
cram_block *cram_read_block(cram_fd *fd);
/*! Writes a CRAM block.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_write_block(cram_fd *fd, cram_block *b);
/*! Frees a CRAM block, deallocating internal data too.
*/
void cram_free_block(cram_block *b);
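/*! Uncompress a memory block using Zlib.
*
* @return
* Returns a pointer to the uncompressed data on success (the function
* returns char *, not a status code; the length is presumably reported
* through *size);
* presumably NULL on failure
*/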
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
/*! Uncompresses a CRAM block, if compressed.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_uncompress_block(cram_block *b);
/*! Compresses a block.
*
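* Compresses a block using one of two different zlib strategies. If we only
* want one choice set strat2 to be -1. (NOTE: the prototype below takes
* 'method' and 'level' arguments and has no 'strat2' parameter, so this
* paragraph appears to describe an earlier interface.)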
*
* The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
* or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
* significantly faster.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
int method, int level);
cram_metrics *cram_new_metrics(void);
char *cram_block_method2str(enum cram_block_method m);
char *cram_content_type2str(enum cram_content_type t);
/* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
/* All of these expand their block argument more than once, so it should be
* a plain expression without side effects. */
/* Block size and data pointer. */
/* BLOCK_SIZE is the block's 'byte' member and is usable as an lvalue. */
#define BLOCK_SIZE(b) ((b)->byte)
#define BLOCK_DATA(b) ((b)->data)
/* Returns the address one past the end of the block */
#define BLOCK_END(b) (&(b)->data[(b)->byte])
/* Request block to be at least 'l' bytes long */
/* Grows 'alloc' (1024 to start with, then x1.5 each step) until it is
* strictly greater than 'l', calling realloc at every step.
* NOTE(review): the realloc result is assigned straight back to 'data'
* and is not checked, so an allocation failure is not reported here. */
#define BLOCK_RESIZE(b,l) \
do { \
while((b)->alloc <= (l)) { \
(b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024; \
(b)->data = realloc((b)->data, (b)->alloc); \
} \
} while(0)
/* Ensure the block can hold at least another 'l' bytes */
#define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
/* Append string 's' of length 'l' */
/* Copies exactly 'l' bytes with memcpy; no terminator is added. */
#define BLOCK_APPEND(b,s,l) \
do { \
BLOCK_GROW((b),(l)); \
memcpy(BLOCK_END((b)), (s), (l)); \
BLOCK_SIZE((b)) += (l); \
} while (0)
/* Append as single character 'c' */
#define BLOCK_APPEND_CHAR(b,c) \
do { \
BLOCK_GROW((b),1); \
(b)->data[(b)->byte++] = (c); \
} while (0)
/* Append a single unsigned integer */
/* The value is written as decimal text via append_uint32(); room for 11
* more bytes is requested first. The macro declares a local named 'cp'. */
#define BLOCK_APPEND_UINT(b,i) \
do { \
unsigned char *cp; \
BLOCK_GROW((b),11); \
cp = &(b)->data[(b)->byte]; \
(b)->byte += append_uint32(cp, (i)) - cp; \
} while (0)
/*
 * Writes 'i' as unsigned decimal text at 'cp', with no leading zeros and no
 * terminating NUL (zero is written as the single character '0').
 *
 * Returns the address just past the last digit written. At most 10 bytes
 * are stored.
 */
static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
    unsigned char *end, *out;
    int ndigits;

    /* Work out the width first so the digits can be stored in place. */
    if      (i < 10)         ndigits = 1;
    else if (i < 100)        ndigits = 2;
    else if (i < 1000)       ndigits = 3;
    else if (i < 10000)      ndigits = 4;
    else if (i < 100000)     ndigits = 5;
    else if (i < 1000000)    ndigits = 6;
    else if (i < 10000000)   ndigits = 7;
    else if (i < 100000000)  ndigits = 8;
    else if (i < 1000000000) ndigits = 9;
    else                     ndigits = 10;

    end = cp + ndigits;

    /* Fill from the least significant digit backwards. */
    for (out = end; out != cp; i /= 10)
        *--out = (unsigned char)('0' + i % 10);

    return end;
}
/*
 * Writes 'i' at 'cp' as exactly nine decimal characters, zero padded on the
 * left, with no terminating NUL. Intended for values below 1000000000; the
 * leading character is computed as i / 100000000 + '0' whatever the value.
 *
 * Returns the address just past the ninth character.
 */
static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
    uint32_t place;

    /* Peel off one digit per decimal place, most significant first. */
    for (place = 100000000; place != 0; place /= 10) {
        *cp++ = i / place + '0';
        i %= place;
    }

    return cp;
}
/*
 * Writes 'i' as unsigned decimal text at 'cp' (no terminating NUL) and
 * returns the address just past the last digit.
 *
 * Values that fit in 32 bits go straight to append_uint32(). Larger values
 * are split into groups of nine decimal digits: the leading group is
 * written without padding by append_uint32() and every following group
 * zero-padded by append_sub32().
 */
static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
    const uint64_t billion = 1000000000;
    uint64_t upper, lower;

    if (i <= 0xffffffff)
        return append_uint32(cp, i);

    upper = i / billion;
    lower = i % billion;

    if (upper > billion) {
        /* Three groups: leading digits, middle nine, final nine. */
        cp = append_uint32(cp, upper / billion);
        cp = append_sub32(cp, upper % billion);
    } else {
        /* Two groups: leading digits, final nine. */
        cp = append_uint32(cp, upper);
    }

    return append_sub32(cp, lower);
}
/* Sets both comp_size and uncomp_size of the block to its current byte
 * count (BLOCK_SIZE). The expansion is a bare assignment expression with
 * no enclosing parentheses. */
#define BLOCK_UPLEN(b) \
(b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
/**@}*/
/**@{ ----------------------------------------------------------------------
* Reference sequence handling
*/
/*! Loads a reference set from fn and stores in the cram_fd.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_load_reference(cram_fd *fd, char *fn);
/*! Generates a lookup table in refs based on the SQ headers in SAM_hdr.
*
* Indexes references by the order they appear in a BAM file. This may not
* necessarily be the same order they appear in the fasta reference file.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int refs2id(refs_t *r, SAM_hdr *bfd);
void refs_free(refs_t *r);
/*! Returns a portion of a reference sequence from start to end inclusive.
*
* The returned pointer is owned by the cram_file fd and should not be freed
* by the caller. It is valid only until the next cram_get_ref is called
* with the same fd parameter (so is thread-safe if given multiple files).
*
* To return the entire reference sequence, specify start as 1 and end
* as 0.
*
* @return
* Returns reference on success;
* NULL on failure
*/
char *cram_get_ref(cram_fd *fd, int id, int start, int end);
void cram_ref_incr(refs_t *r, int id);
void cram_ref_decr(refs_t *r, int id);
/**@}*/
/**@{ ----------------------------------------------------------------------
* Containers
*/
/*! Creates a new container, specifying the maximum number of slices
* and records permitted.
*
* @return
* Returns cram_container ptr on success;
* NULL on failure
*/
cram_container *cram_new_container(int nrec, int nslice);
void cram_free_container(cram_container *c);
/*! Reads a container header.
*
* @return
* Returns cram_container on success;
* NULL on failure or no container left (fd->err == 0).
*/
cram_container *cram_read_container(cram_fd *fd);
/*! Writes a container structure.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_write_container(cram_fd *fd, cram_container *h);
/*! Flushes a container to disk.
*
* Flushes a completely or partially full container to disk, writing
* container structure, header and blocks. This also calls the encoder
* functions.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_flush_container(cram_fd *fd, cram_container *c);
int cram_flush_container_mt(cram_fd *fd, cram_container *c);
/**@}*/
/**@{ ----------------------------------------------------------------------
* Compression headers; the first part of the container
*/
/*! Creates a new blank container compression header
*
* @return
* Returns header ptr on success;
* NULL on failure
*/
cram_block_compression_hdr *cram_new_compression_header(void);
/*! Frees a cram_block_compression_hdr */
void cram_free_compression_header(cram_block_compression_hdr *hdr);
/**@}*/
/**@{ ----------------------------------------------------------------------
* Slices and slice headers
*/
/*! Frees a slice header */
void cram_free_slice_header(cram_block_slice_hdr *hdr);
/*! Frees a slice */
void cram_free_slice(cram_slice *s);
/*! Creates a new empty slice in memory, for subsequent writing to
* disk.
*
* @return
* Returns cram_slice ptr on success;
* NULL on failure
*/
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
/*! Loads an entire slice.
*
* FIXME: In 1.0 the native unit of slices within CRAM is broken
* as slices contain references to objects in other slices.
* To work around this while keeping the slice oriented outer loop
* we read all slices and stitch them together into a fake large
* slice instead.
*
* @return
* Returns cram_slice ptr on success;
* NULL on failure
*/
cram_slice *cram_read_slice(cram_fd *fd);
/**@}*/
/**@{ ----------------------------------------------------------------------
* CRAM file definition (header)
*/
/*! Reads a CRAM file definition structure.
*
* @return
* Returns file_def ptr on success;
* NULL on failure
*/
cram_file_def *cram_read_file_def(cram_fd *fd);
/*! Writes a cram_file_def structure to cram_fd.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_write_file_def(cram_fd *fd, cram_file_def *def);
/*! Frees a cram_file_def structure. */
void cram_free_file_def(cram_file_def *def);
/**@}*/
/**@{ ----------------------------------------------------------------------
* SAM header I/O
*/
/*! Reads the SAM header from the first CRAM data block.
*
* Also performs minimal parsing to extract read-group
* and sample information.
*
* @return
* Returns SAM hdr ptr on success;
* NULL on failure
*/
SAM_hdr *cram_read_SAM_hdr(cram_fd *fd);
/*! Writes a CRAM SAM header.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr);
/**@}*/
/**@{ ----------------------------------------------------------------------
* The top-level cram opening, closing and option handling
*/
/*! Opens a CRAM file for read (mode "rb") or write ("wb").
*
* The filename may be "-" to indicate stdin or stdout.
*
* @return
* Returns file handle on success;
* NULL on failure.
*/
cram_fd *cram_open(const char *filename, const char *mode);
/*! Opens an existing stream for reading or writing.
*
* @return
* Returns file handle on success;
* NULL on failure.
*/
cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
/*! Closes a CRAM file.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_close(cram_fd *fd);
/*
* Seek within a CRAM file.
*
* Returns 0 on success
* -1 on failure
*/
int cram_seek(cram_fd *fd, off_t offset, int whence);
/*
* Flushes a CRAM file.
* Useful for when writing to stdout without wishing to close the stream.
*
* Returns 0 on success
* -1 on failure
*/
int cram_flush(cram_fd *fd);
/*! Checks for end of file on a cram_fd stream.
*
* @return
* Returns 0 if not at end of file
* 1 if we hit an expected EOF (end of range or EOF block)
* 2 for other EOF (end of stream without EOF block)
*/
int cram_eof(cram_fd *fd);
/*! Sets options on the cram_fd.
*
* See CRAM_OPT_* definitions in cram_structs.h.
* Use this immediately after opening.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_set_option(cram_fd *fd, enum cram_option opt, ...);
/*! Sets options on the cram_fd.
*
* See CRAM_OPT_* definitions in cram_structs.h.
* Use this immediately after opening.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args);
/*!
* Attaches a header to a cram_fd.
*
* This should be used when creating a new cram_fd for writing where
* we have an SAM_hdr already constructed (eg from a file we've read
* in).
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
#ifdef __cplusplus
}
#endif
#endif /* _CRAM_IO_H_ */
htslib-1.2.1/cram/cram_samtools.c 0000664 0000000 0000000 00000011431 12464172677 0016751 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2010-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include
#include
#include
#include "cram/cram.h"
#include "htslib/sam.h"
/*---------------------------------------------------------------------------
* Samtools compatibility portion
*/
/*
 * Fills in the BAM record *bp from its individual fields, enlarging the
 * record's data buffer when it is too small.
 *
 * The data area is laid out as: query name plus NUL, 'ncigar' 32-bit CIGAR
 * words, the sequence packed two bases per byte through the table L
 * (bytes without an entry become 15), then 'len' quality bytes (all 0xff
 * when 'qual' is NULL). A further 'extra_len' bytes are counted in the
 * record length but not written here.
 *
 * 'pos' and 'mpos' are stored minus one; the bin is computed from the
 * unmodified 'pos' and 'end' via bam_reg2bin().
 *
 * Returns 0 on success;
 *        -1 if the data buffer could not be enlarged, in which case the
 *           record is left untouched and still owns its previous buffer.
 */
int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
                      const char *qname, size_t qname_len,
                      int flag,
                      int rname,      // Ref ID
                      int pos,
                      int end,        // aligned start/end coords
                      int mapq,
                      uint32_t ncigar, const uint32_t *cigar,
                      int mrnm,       // Mate Ref ID
                      int mpos,
                      int isize,
                      int len,
                      const char *seq,
                      const char *qual) {
    static const char L[256] = {
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15,
        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
        15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15,
        15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
    };
    bam1_t *b = (bam1_t *)*bp;
    uint8_t *cp;
    int i, bam_len;

    //b->l_aux = extra_len; // we fill this out later

    bam_len = qname_len + 1 + ncigar*4 + (len+1)/2 + len + extra_len;
    if (b->m_data < bam_len) {
        /* Grow through temporaries so that a failed realloc neither leaks
         * the old buffer nor leaves m_data describing memory we don't
         * have. */
        int new_size = bam_len;
        uint8_t *new_data;

        kroundup32(new_size);
        new_data = (uint8_t*)realloc(b->data, new_size);
        if (!new_data)
            return -1;
        b->data = new_data;
        b->m_data = new_size;
    }
    b->l_data = bam_len;

    b->core.tid     = rname;
    b->core.pos     = pos-1;
    b->core.bin     = bam_reg2bin(pos, end);
    b->core.qual    = mapq;
    b->core.l_qname = qname_len+1;
    b->core.flag    = flag;
    b->core.n_cigar = ncigar;
    b->core.l_qseq  = len;
    b->core.mtid    = mrnm;
    b->core.mpos    = mpos-1;
    b->core.isize   = isize;

    cp = b->data;

    /* Query name, always NUL terminated. */
    strncpy((char *)cp, qname, qname_len);
    cp[qname_len] = 0;
    cp += qname_len+1;

    memcpy(cp, cigar, ncigar*4);
    cp += ncigar*4;

    /* Two bases per byte, first base in the high nibble. */
    for (i = 0; i+1 < len; i+=2) {
        *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]];
    }
    /* Odd length: the final base sits alone in the high nibble. */
    if (i < len)
        *cp++ = L[(uc)seq[i]]<<4;

    if (qual)
        memcpy(cp, qual, len);
    else
        memset(cp, '\xff', len);

    return 0;
}
/*
 * Builds a newly allocated bam_hdr_t from a CRAM SAM_hdr: the header text
 * is copied (and NUL terminated) and one target name/length pair is created
 * for each of the h->nref references.
 *
 * Returns the new header on success (the caller owns it);
 *         NULL if any allocation fails, with the partial header released.
 */
bam_hdr_t *cram_header_to_bam(SAM_hdr *h) {
    int i;
    bam_hdr_t *header = bam_hdr_init();

    if (!header)
        return NULL;

    header->l_text = ks_len(&h->text);
    header->text = malloc(header->l_text+1);
    if (!header->text)
        goto err;
    memcpy(header->text, ks_str(&h->text), header->l_text);
    header->text[header->l_text] = 0;

    header->n_targets = h->nref;
    header->target_name = (char **)calloc(header->n_targets,
                                          sizeof(char *));
    header->target_len = (uint32_t *)calloc(header->n_targets,
                                            sizeof(*header->target_len));
    /* calloc(0, ...) may legitimately return NULL, so only treat NULL as a
     * failure when there are targets to store. */
    if (header->n_targets > 0 &&
        (!header->target_name || !header->target_len))
        goto err;

    for (i = 0; i < h->nref; i++) {
        header->target_name[i] = strdup(h->ref[i].name);
        if (!header->target_name[i])
            goto err;
        header->target_len[i] = h->ref[i].len;
    }

    return header;

 err:
    /* Release target_len ourselves when target_name is missing, in case
     * the destroy routine only visits it alongside the name array. Entries
     * of target_name not yet filled are NULL thanks to calloc. */
    if (!header->target_name) {
        free(header->target_len);
        header->target_len = NULL;
    }
    bam_hdr_destroy(header);
    return NULL;
}
/*
* Converts a BAM header into a CRAM SAM_hdr by handing the header's text
* buffer and its length (h->text, h->l_text) to sam_hdr_parse_().
*
* Returns whatever sam_hdr_parse_() returns. 'h' is dereferenced without a
* NULL check.
*/
SAM_hdr *bam_header_to_cram(bam_hdr_t *h) {
return sam_hdr_parse_(h->text, h->l_text);
}
htslib-1.2.1/cram/cram_samtools.h 0000664 0000000 0000000 00000006641 12464172677 0016765 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2010-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CRAM_SAMTOOLS_H_
#define _CRAM_SAMTOOLS_H_
/*
 * Samtools compatible API.
 *
 * Thin accessor macros that map the older samtools-style names onto the
 * fields and helpers of the htslib BAM record.  In every macro "b" is a
 * pointer to a record and is evaluated exactly once.
 */

/* Size of the record's data block.  The getter and the setter operate on
 * the same field (l_data). */
#define bam_blk_size(b) ((b)->l_data)
#define bam_set_blk_size(b,v) ((b)->l_data = (v))

/* Direct accessors for members of the record's "core" structure. */
#define bam_ref(b) (b)->core.tid
#define bam_pos(b) (b)->core.pos
#define bam_mate_pos(b) (b)->core.mpos
#define bam_mate_ref(b) (b)->core.mtid
#define bam_ins_size(b) (b)->core.isize
#define bam_seq_len(b) (b)->core.l_qseq
#define bam_cigar_len(b) (b)->core.n_cigar
#define bam_flag(b) (b)->core.flag
#define bam_bin(b) (b)->core.bin
#define bam_map_qual(b) (b)->core.qual
#define bam_name_len(b) (b)->core.l_qname

/* Accessors that forward to the htslib bam_get_*() helpers. */
#define bam_name(b) bam_get_qname((b))
#define bam_qual(b) bam_get_qual((b))
#define bam_seq(b) bam_get_seq((b))
#define bam_cigar(b) bam_get_cigar((b))
#define bam_aux(b) bam_get_aux((b))

/* Record life-cycle: bam_dup() copies b into a record freshly created by
 * bam_init1(); bam_free() forwards to bam_destroy1(). */
#define bam_dup(b) bam_copy1(bam_init1(), (b))
#define bam_free(b) bam_destroy1((b))

/* Forwards to hts_reg2bin() with the fixed trailing arguments 14 and 5. */
#define bam_reg2bin(beg,end) hts_reg2bin((beg),(end),14,5)
#include "htslib/sam.h"
/*
 * CIGAR operation codes under alternative names.  Every enumerator takes the
 * value of the htslib BAM_C* constant it is initialised from; the last two
 * rename BAM_CEQUAL and BAM_CDIFF to BAM_CBASE_MATCH and BAM_CBASE_MISMATCH.
 */
enum cigar_op {
BAM_CMATCH_=BAM_CMATCH,
BAM_CINS_=BAM_CINS,
BAM_CDEL_=BAM_CDEL,
BAM_CREF_SKIP_=BAM_CREF_SKIP,
BAM_CSOFT_CLIP_=BAM_CSOFT_CLIP,
BAM_CHARD_CLIP_=BAM_CHARD_CLIP,
BAM_CPAD_=BAM_CPAD,
BAM_CBASE_MATCH=BAM_CEQUAL,
BAM_CBASE_MISMATCH=BAM_CDIFF
};
/* The CRAM code refers to an htslib BAM record (bam1_t) as bam_seq_t. */
typedef bam1_t bam_seq_t;
#include "cram/sam_header.h"
/* Converts a SAM_hdr into a newly built bam_hdr_t. */
bam_hdr_t *cram_header_to_bam(SAM_hdr *h);
/* Converts a bam_hdr_t into a SAM_hdr by parsing the header's text. */
SAM_hdr *bam_header_to_cram(bam_hdr_t *h);
/*
 * Fills in the BAM record *bp from the individual alignment fields given.
 * NOTE(review): the implementation is not visible from this header; the
 * parameter meanings below are taken from their names and inline comments
 * only -- confirm against the definition (including the return convention
 * and whether *bp may be reallocated).
 */
int bam_construct_seq(bam_seq_t **bp, size_t extra_len,
const char *qname, size_t qname_len,
int flag,
int rname, // Ref ID
int pos,
int end, // aligned start/end coords
int mapq,
uint32_t ncigar, const uint32_t *cigar,
int mrnm, // Mate Ref ID
int mpos,
int isize,
int len,
const char *seq,
const char *qual);
#endif /* _CRAM_SAMTOOLS_H_ */
htslib-1.2.1/cram/cram_stats.c 0000664 0000000 0000000 00000027632 12464172677 0016260 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "cram/cram.h"
#include "cram/os.h"
/*
 * Allocates a new cram_stats object with every byte zeroed.
 *
 * Returns the calloc() result: a pointer to the object, or NULL when the
 * allocation failed.
 */
cram_stats *cram_stats_create(void) {
    cram_stats *fresh = calloc(1, sizeof(cram_stats));

    return fresh;
}
/*
 * Records one occurrence of val in st.
 *
 * Values in the range [0, MAX_STAT_VAL) are counted directly in
 * st->freqs[]; every other value is counted in the hash st->h, which is
 * created on first use.
 *
 * st->nsamp is incremented only when the occurrence was actually recorded.
 * If the hash cannot be created or grown the value is dropped, a message is
 * written to stderr and the counters are left untouched, so st->nsamp always
 * equals the sum of the stored frequencies.
 */
void cram_stats_add(cram_stats *st, int32_t val) {
    khint_t k;
    int r;

    if (val >= 0 && val < MAX_STAT_VAL) {
        st->freqs[val]++;
        st->nsamp++;
        return;
    }

    if (!st->h) {
        st->h = kh_init(m_i2i);
        if (!st->h) {
            // Without a table kh_put() would dereference NULL.
            fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
            return;
        }
    }

    k = kh_put(m_i2i, st->h, val, &r);
    if (r == -1) {
        // Insertion failed: k does not refer to a usable bucket.
        fprintf(stderr, "Failed to add val %d to cram_stats\n", val);
        return;
    }

    if (r == 0)
        kh_val(st->h, k)++;      // key already present
    else
        kh_val(st->h, k) = 1;    // key newly inserted

    st->nsamp++;
}
/*
 * Removes one occurrence of val from st.
 *
 * Values in [0, MAX_STAT_VAL) are decremented in st->freqs[] (asserting the
 * count does not go negative).  Other values are looked up in the hash
 * st->h; an entry whose count reaches zero is deleted from the hash.
 *
 * If the value is not held in the hash (or there is no hash) a message is
 * written to stderr and st->nsamp is restored to its previous value.
 */
void cram_stats_del(cram_stats *st, int32_t val) {
    khint_t slot;

    st->nsamp--;

    //assert(val >= 0);
    if (val >= 0 && val < MAX_STAT_VAL) {
        st->freqs[val] -= 1;
        assert(st->freqs[val] >= 0);
        return;
    }

    if (st->h != NULL) {
        slot = kh_get(m_i2i, st->h, val);
        if (slot != kh_end(st->h)) {
            kh_val(st->h, slot) -= 1;
            if (kh_val(st->h, slot) == 0)
                kh_del(m_i2i, st->h, slot);
            return;
        }
    }

    // Nothing was removed: report it and undo the sample-count decrement.
    fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
    st->nsamp++;
}
/*
 * Writes the contents of st to stderr: a "cram_stats:" heading followed by
 * one tab-separated "value, count" line for each non-zero entry of
 * st->freqs[] and then for each entry present in the hash st->h.
 */
void cram_stats_dump(cram_stats *st) {
    int val;
    khint_t it;

    fprintf(stderr, "cram_stats:\n");

    for (val = 0; val < MAX_STAT_VAL; val++) {
        if (st->freqs[val] != 0)
            fprintf(stderr, "\t%d\t%d\n", val, st->freqs[val]);
    }

    if (st->h == NULL)
        return;

    for (it = kh_begin(st->h); it != kh_end(st->h); it++) {
        if (kh_exist(st->h, it))
            fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, it), kh_val(st->h, it));
    }
}
#if 1
/*
 * Returns the number of bits needed to represent v, i.e. the 1-based
 * position of its highest set bit: 1 for v == 1, 8 for v == 255, 9 for
 * v == 256.  Zero yields 1.
 *
 * v is treated as an unsigned 32-bit quantity, so any negative value
 * yields 32.  Working on an unsigned copy keeps the right shifts well
 * defined for every input.
 */
static int nbits(int v) {
    // Standard De Bruijn log2 lookup table with 1 added to every entry.
    static const int MultiplyDeBruijnBitPosition[32] = {
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
    };
    uint32_t u = (uint32_t)v;

    // Smear the top set bit into every lower position.
    u |= u >> 1;
    u |= u >> 2;
    u |= u >> 4;
    u |= u >> 8;
    u |= u >> 16;

    // DeBruijn magic to find top bit
    return MultiplyDeBruijnBitPosition[(uint32_t)(u * 0x07C4ACDDU) >> 27];
}
#endif
/*
 * Picks an encoding for the values accumulated in st.
 *
 * Walks st->freqs[] and the hash st->h to count the distinct values (stored
 * in st->nvals) and the total number of samples, which is asserted to equal
 * st->nsamp.  The choice is then a simple heuristic:
 *
 *   - at most one distinct value          => E_HUFFMAN
 *   - more than 256 samples in total      => E_EXTERNAL
 *   - otherwise, fewer than 200 distinct  => E_HUFFMAN, else E_BETA
 *
 * When fd->verbose > 1 the value range and the two counts are written to
 * stderr.
 *
 * No per-codec size estimate is computed, so the statistics are gathered
 * with plain counters and nothing is allocated.
 *
 * Returns the codec to use.
 */
enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
    int nvals = 0, ntot = 0, max_val = 0, min_val = INT_MAX;
    int i;

    //cram_stats_dump(st);

    /* Values counted directly in the frequency table */
    for (i = 0; i < MAX_STAT_VAL; i++) {
        if (!st->freqs[i])
            continue;

        ntot += st->freqs[i];
        if (max_val < i) max_val = i;
        if (min_val > i) min_val = i;
        nvals++;
    }

    /* Values counted in the hash */
    if (st->h) {
        khint_t k;

        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
            int v;

            if (!kh_exist(st->h, k))
                continue;

            v = kh_key(st->h, k);
            ntot += kh_val(st->h, k);
            if (max_val < v) max_val = v;
            if (min_val > v) min_val = v;
            nvals++;
        }
    }

    st->nvals = nvals;
    assert(ntot == st->nsamp);

    if (nvals <= 1)
        return E_HUFFMAN;

    if (fd->verbose > 1)
        fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
                min_val, max_val, nvals, ntot);

    /* Plenty of samples: leave it to the external block compressor. */
    if (ntot > 256)
        return E_EXTERNAL;

    /*
     * Avoid complex stats for now, just do heuristic of HUFFMAN for small
     * alphabets and BETA for anything large.
     */
    return nvals < 200 ? E_HUFFMAN : E_BETA;
}
/*
 * Releases st: destroys the hash st->h when one exists, then frees the
 * cram_stats object itself.
 */
void cram_stats_free(cram_stats *st) {
    khash_t(m_i2i) *overflow = st->h;

    if (overflow != NULL) {
        kh_destroy(m_i2i, overflow);
    }

    free(st);
}
htslib-1.2.1/cram/cram_stats.h 0000664 0000000 0000000 00000004347 12464172677 0016263 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CRAM_STATS_H_
#define _CRAM_STATS_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Allocates a zero-filled cram_stats; returns NULL if the allocation fails. */
cram_stats *cram_stats_create(void);
/* Records one occurrence of val in st. */
void cram_stats_add(cram_stats *st, int32_t val);
/*
 * Removes one occurrence of val from st.  A message is written to stderr
 * when val lies outside the directly counted range and is not held in st.
 */
void cram_stats_del(cram_stats *st, int32_t val);
/* Writes every value with a non-zero count, and its count, to stderr. */
void cram_stats_dump(cram_stats *st);
/* Frees st together with its hash table, if it has one. */
void cram_stats_free(cram_stats *st);
/*
 * Picks the encoding to use for the values accumulated in st, and stores
 * the number of distinct values in st->nvals.
 *
 * The implementation currently chooses by a simple heuristic on the number
 * of distinct values and the number of samples rather than by estimated
 * encoded sizes.
 *
 * FIXME: we could reuse some of the code here for the actual encoding
 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
 *
 * Returns the best codec to use.
 */
enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st);
#ifdef __cplusplus
}
#endif
#endif
htslib-1.2.1/cram/cram_structs.h 0000664 0000000 0000000 00000053422 12464172677 0016632 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2012-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _CRAM_STRUCTS_H_
#define _CRAM_STRUCTS_H_
#ifdef __cplusplus
extern "C" {
#endif
/*
* Defines in-memory structs for the basic file-format objects in the
* CRAM format.
*
* The basic file format is:
* File-def SAM-hdr Container Container ...
*
* Container:
* Service-block data-block data-block ...
*
* Multiple blocks in a container are grouped together as slices,
* also sometimes referred to as landmarks in the spec.
*/
#include
#include "cram/thread_pool.h"
#include "cram/string_alloc.h"
#include "htslib/khash.h"
// Generic hash-map integer -> integer (instantiates the khash type "m_i2i")
KHASH_MAP_INIT_INT(m_i2i, int)
// Generic hash-set of integers (existence only; khash type "s_i2i")
KHASH_SET_INIT_INT(s_i2i)
// For brevity
typedef unsigned char uc;
/*
 * A union for the preservation map. Required for khash.
 *
 * A value is either an integer (i) or a character pointer (p); the union
 * itself does not record which member is in use.
 */
typedef union {
int i;
char *p;
} pmap_t;
// Generates static functions here which isn't ideal, but we have no way
// currently to declare the kh_map_t structure here without also declaring a
// duplicate in the .c files due to the nature of the KHASH macros.
//
// Instantiates the khash type "map": string key -> pmap_t value.
KHASH_MAP_INIT_STR(map, pmap_t)
// Forward declaration; only pointers to hFILE are used in this header.
struct hFILE;
// NOTE(review): presumably the default values for cram_fd.seqs_per_slice and
// cram_fd.slices_per_container -- confirm where these are applied.
#define SEQS_PER_SLICE 10000
#define SLICE_PER_CNT 1
// 20 characters, i.e. the same number of entries as the 5x4
// substitution_matrix in cram_block_compression_hdr.
// NOTE(review): presumably the default matrix -- confirm at the point of use.
#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
// Number of entries in cram_stats.freqs[].
#define MAX_STAT_VAL 1024
//#define MAX_STAT_VAL 16
/*
 * Frequency statistics for one series of integer values: a directly indexed
 * table of counts plus an integer -> integer hash of counts.
 */
typedef struct {
int freqs[MAX_STAT_VAL]; // freqs[v] is the count for value v
khash_t(m_i2i) *h; // value -> count hash; may be NULL
int nsamp; // total number of values added
int nvals; // total number of unique values added
} cram_stats;
/*
 * Identifiers for the available encodings (codecs).
 *
 * NB: matches java impl, not the spec
 */
enum cram_encoding {
E_NULL = 0,
E_EXTERNAL = 1,
E_GOLOMB = 2,
E_HUFFMAN = 3,
E_BYTE_ARRAY_LEN = 4,
E_BYTE_ARRAY_STOP = 5,
E_BETA = 6,
E_SUBEXP = 7,
E_GOLOMB_RICE = 8,
E_GAMMA = 9
};
/*
 * Kinds of value a codec can handle.
 * NOTE(review): inferred from the enumerator names -- confirm against the
 * codec implementation.
 */
enum cram_external_type {
E_INT = 1,
E_LONG = 2,
E_BYTE = 3,
E_BYTE_ARRAY = 4,
E_BYTE_ARRAY_BLOCK = 5,
};
/*
 * External IDs used by this implementation (only assumed during writing).
 *
 * DS_CORE and the DS_aux* values are numbered explicitly (0..9); all later
 * values follow on implicitly.  DS_END is the number of IDs and is used to
 * size per-data-series arrays such as cram_container.stats[] and
 * cram_fd.m[].
 */
enum cram_DS_ID {
DS_CORE = 0,
DS_aux = 1, // aux_blk
DS_aux_OQ = 2,
DS_aux_BQ = 3,
DS_aux_BD = 4,
DS_aux_BI = 5,
DS_aux_FZ = 6, // also ZM:B
DS_aux_oq = 7, // other qualities
DS_aux_os = 8, // other sequences
DS_aux_oz = 9, // other strings
DS_ref,
DS_RN, // name_blk
DS_QS, // qual_blk
DS_IN, // base_blk
DS_SC, // soft_blk
DS_BF, // start loop
DS_CF,
DS_AP,
DS_RG,
DS_MQ,
DS_NS,
DS_MF,
DS_TS,
DS_NP,
DS_NF,
DS_RL,
DS_FN,
DS_FC,
DS_FP,
DS_DL,
DS_BA,
DS_BS,
DS_TL,
DS_RI,
DS_RS,
DS_PD,
DS_HC,
DS_BB,
DS_QQ,
DS_TN, // end loop
DS_RN_len,
DS_SC_len,
DS_BB_len,
DS_QQ_len,
DS_TC, // CRAM v1.0 tags
DS_TM, // test
DS_TV, // test
DS_END,
};
/* "File Definition Structure": the fixed-size record at the start of a file */
typedef struct {
char magic[4];
uint8_t major_version;
uint8_t minor_version;
char file_id[20]; // Filename or SHA1 checksum
} cram_file_def;
/* Split a combined version number: major in bits 8 and up, minor in the
 * low 8 bits. */
#define CRAM_MAJOR_VERS(v) ((v) >> 8)
#define CRAM_MINOR_VERS(v) ((v) & 0xff)
struct cram_slice;
/*
 * Compression method of a block.  RANS and RANS0 share the value 4; RANS1
 * and GZIP_RLE are internal values that are not stored as such (see the
 * per-enumerator notes).
 */
enum cram_block_method {
ERROR = -1,
RAW = 0,
GZIP = 1,
BZIP2 = 2,
LZMA = 3,
RANS = 4, // Generic; either order
RANS0 = 4,
RANS1 = 10, // Not externalised; stored as RANS (generic)
GZIP_RLE = 11, // NB: not externalised in CRAM
};
/* Content type of a block; CT_ERROR (-1) is the error value. */
enum cram_content_type {
CT_ERROR = -1,
FILE_HEADER = 0,
COMPRESSION_HEADER = 1,
MAPPED_SLICE = 2,
UNMAPPED_SLICE = 3, // CRAM V1.0 only
EXTERNAL = 4,
CORE = 5,
};
/*
 * Compression metrics: bookkeeping for trial compressions with several
 * methods (gzip with and without RLE, rANS order 0/1, bzip2, lzma) and the
 * method chosen as a result.
 */
typedef struct {
// number of trials and time to next trial
int trial;
int next_trial;
// aggregate sizes during trials
int sz_gz_rle;
int sz_gz_def;
int sz_rans0;
int sz_rans1;
int sz_bzip2;
int sz_lzma;
// resultant method from trials
int method;
int strat;
// Revisions of method, to allow culling of continually failing ones.
int gz_rle_cnt;
int gz_def_cnt;
int rans0_cnt;
int rans1_cnt;
int bzip2_cnt;
int lzma_cnt;
int revised_method;
// NOTE(review): per-method adjustment factors; exact meaning not visible
// from this header -- confirm where they are applied.
double gz_rle_extra;
double gz_def_extra;
double rans0_extra;
double rans1_extra;
double bzip2_extra;
double lzma_extra;
} cram_metrics;
/*
 * Block: one unit of (possibly compressed) data together with its
 * compression method, content type/id, sizes and CRC, plus a cursor for
 * byte and bit level I/O over the data buffer.
 */
typedef struct {
enum cram_block_method method, orig_method;
enum cram_content_type content_type;
int32_t content_id;
int32_t comp_size;
int32_t uncomp_size;
uint32_t crc32;
int32_t idx; /* offset into data */
unsigned char *data;
// For bit I/O
size_t alloc;
size_t byte;
int bit;
} cram_block;
struct cram_codec; /* defined in cram_codecs.h */
struct cram_map;
/* Number of buckets in the encoding-map hash tables; a power of two so that
 * CRAM_MAP() can reduce with a bit mask. */
#define CRAM_MAP_HASH 32
/* Bucket index for the pair (a,b): (a*3 + b) masked to [0, CRAM_MAP_HASH). */
#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
/*
 * Compression header block: reference span and record counts, flags from
 * the preservation map, the substitution matrix, the tag dictionary and the
 * encoding maps / codecs for each data series.
 */
typedef struct {
int32_t ref_seq_id;
int32_t ref_seq_start;
int32_t ref_seq_span;
int32_t num_records;
int32_t num_landmarks;
int32_t *landmark;
/* Flags from preservation map */
int mapped_qs_included;
int unmapped_qs_included;
int unmapped_placed;
int qs_included;
int read_names_included;
int AP_delta;
// indexed by ref-base and subst. code
char substitution_matrix[5][4];
// TD Dictionary as a concatenated block
cram_block *TD_blk; // Tag Dictionary
int nTL; // number of TL entries in TD
unsigned char **TL; // array of size nTL, pointer into TD_blk.
khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
string_alloc_t *TD_keys; // Pooled keys for TD hash.
khash_t(map) *preservation_map;
// Hash tables of CRAM_MAP_HASH buckets; entries chain via cram_map.next
struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
struct cram_codec *codecs[DS_END]; // one slot per cram_DS_ID value
char *uncomp; // A single block of uncompressed data
size_t uncomp_size, uncomp_alloc;
unsigned int data_series; // See cram_fields enum below
} cram_block_compression_hdr;
/*
 * One entry of an encoding map: a key, its encoding and codec, and the
 * location of its data.  Entries in the same hash bucket are linked through
 * "next".
 */
typedef struct cram_map {
int key; /* 0xe0 + 3 bytes */
enum cram_encoding encoding;
int offset; /* Offset into a single block of memory */
int size; /* Size */
struct cram_codec *codec;
struct cram_map *next; // for noddy internal hash
} cram_map;
/*
 * Mapped or unmapped slice header block.  The ref_* fields only apply when
 * content_type is MAPPED_SLICE.
 */
typedef struct {
enum cram_content_type content_type;
int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
int32_t num_records;
int64_t record_counter;
int32_t num_blocks;
int32_t num_content_ids;
int32_t *block_content_ids;
int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
unsigned char md5[16];
} cram_block_slice_hdr;
struct ref_entry;
/*
 * Container.
 *
 * Conceptually a container is split into slices, and slices into blocks.
 * However on disk it's just a list of blocks and we need to query the
 * block types to identify the start/end points of the slices.
 *
 * OR... are landmarks the start/end points of slices?
 *
 * The first group of fields is the container header; the remainder is
 * working state used while constructing and encoding a container.
 */
typedef struct {
int32_t length;
int32_t ref_seq_id;
int32_t ref_seq_start;
int32_t ref_seq_span;
int64_t record_counter;
int64_t num_bases;
int32_t num_records;
int32_t num_blocks;
int32_t num_landmarks;
int32_t *landmark;
/* Size of container header above */
size_t offset;
/* Compression header is always the first block? */
cram_block_compression_hdr *comp_hdr;
cram_block *comp_hdr_block;
/* For construction purposes */
int max_slice, curr_slice; // maximum number of slices
int max_rec, curr_rec; // current and max recs per slice
int max_c_rec, curr_c_rec; // current and max recs per container
int slice_rec; // rec no. for start of this slice
int curr_ref; // current ref ID. -2 for no previous
int last_pos; // last record position
struct cram_slice **slices, *slice;
int pos_sorted; // boolean, 1=>position sorted data
int max_apos; // maximum position, used if pos_sorted==0
int last_slice; // number of reads in last slice (0 for 1st)
int multi_seq; // true if packing multi seqs per cont/slice
int unsorted; // true if AP_delta is 0.
/* Copied from fd before encoding, to allow multi-threading */
int ref_start, first_base, last_base, ref_id, ref_end;
char *ref;
//struct ref_entry *ref;
/* For multi-threading */
bam_seq_t **bams;
/* Statistics for encoding */
cram_stats *stats[DS_END]; // one slot per cram_DS_ID value
khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
int *refs_used; // array of frequency of ref seq IDs
uint32_t crc32; // CRC32
} cram_container;
/*
 * A single cram record
 *
 * Variable-length data (name, sequence, qualities, aux tags, cigar,
 * features) is not stored here; the record holds indices into buffers owned
 * by the slice "s".  The two-letter codes in the comments name the data
 * series each field corresponds to.
 */
typedef struct {
struct cram_slice *s; // Filled out by cram_decode only
int32_t ref_id; // fixed for all recs in slice?
int32_t flags; // BF
int32_t cram_flags; // CF
int32_t len; // RL
int32_t apos; // AP
int32_t rg; // RG
int32_t name; // RN; idx to s->names_blk
int32_t name_len;
int32_t mate_line; // index to another cram_record
int32_t mate_ref_id;
int32_t mate_pos; // NP
int32_t tlen; // TS
// Auxiliary data
int32_t ntags; // TC
int32_t aux; // idx to s->aux_blk
int32_t aux_size; // total size of packed ntags in aux_blk
#ifndef TN_external
int32_t TN_idx; // TN; idx to s->TN;
#else
int32_t tn; // idx to s->tn_blk
#endif
int TL;
int32_t seq; // idx to s->seqs_blk
int32_t qual; // idx to s->qual_blk
int32_t cigar; // idx to s->cigar
int32_t ncigar;
int32_t aend; // alignment end
int32_t mqual; // MQ
int32_t feature; // idx to s->feature
int32_t nfeature; // number of features
int32_t mate_flags; // MF
} cram_record;
// Accessor macros as an analogue of the bam ones.
//
// In every macro "c" is a pointer to a cram_record.  The macros that yield
// an address dereference c->s, so the record's owning slice must be set.
// Each macro refers only to its own argument, never to a variable of the
// caller.
#define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
#define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
#define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
#define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
#define cram_seqi(c,i) (cram_seq((c))[(i)])
#define cram_name_len(c) ((c)->name_len)
#define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
#define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
// Address of the record's first CIGAR element within the slice's cigar array.
#define cram_cigar(c) (&((c)->s->cigar)[(c)->cigar])
/*
 * A feature is a base difference, used for the sequence reference encoding.
 * (We generate these internally when writing CRAM.)
 *
 * The anonymous union holds one small struct per feature kind.  Every
 * variant starts with the same two members, pos and code, followed by the
 * payload specific to that kind.
 * NOTE(review): "code" presumably identifies which variant is in use --
 * confirm against the encoder/decoder.
 */
typedef struct {
union {
struct {
int pos;
int code;
int base; // substitution code
} X;
struct {
int pos;
int code;
int base; // actual base & qual
int qual;
} B;
struct {
int pos;
int code;
int seq_idx; // index to s->seqs_blk
int len;
} b;
struct {
int pos;
int code;
int qual;
} Q;
struct {
int pos;
int code;
int len;
int seq_idx; // soft-clip multiple bases
} S;
struct {
int pos;
int code;
int len;
int seq_idx; // insertion multiple bases
} I;
struct {
int pos;
int code;
int base; // insertion single base
} i;
struct {
int pos;
int code;
int len;
} D;
struct {
int pos;
int code;
int len;
} N;
struct {
int pos;
int code;
int len;
} P;
struct {
int pos;
int code;
int len;
} H;
};
} cram_feature;
/*
 * A slice is really just a set of blocks, but it
 * is the logical unit for decoding a number of
 * sequences.
 *
 * Besides the blocks it owns the decoded records and the growable buffers
 * (cigar, features, tag names) that those records index into.
 */
typedef struct cram_slice {
cram_block_slice_hdr *hdr;
cram_block *hdr_block;
cram_block **block;
cram_block **block_by_id;
/* State used during encoding/decoding */
int last_apos, max_apos;
/* Array of decoded cram records */
cram_record *crecs;
/* Dynamically growing buffers for data pointed
 * to by crecs[] array.
 */
uint32_t *cigar;
uint32_t cigar_alloc;
uint32_t ncigar;
cram_feature *features;
int nfeatures;
int afeatures; // allocated size of features
#ifndef TN_external
// TN field (Tag Name)
uint32_t *TN;
int nTN, aTN; // used and allocated size for TN[]
#else
cram_block *tn_blk;
int tn_id;
#endif
// For variable sized elements which are always external blocks.
cram_block *name_blk;
cram_block *seqs_blk;
cram_block *qual_blk;
cram_block *base_blk;
cram_block *soft_blk;
cram_block *aux_blk;
cram_block *aux_OQ_blk;
cram_block *aux_BQ_blk;
cram_block *aux_BD_blk;
cram_block *aux_BI_blk;
cram_block *aux_FZ_blk;
cram_block *aux_oq_blk;
cram_block *aux_os_blk;
cram_block *aux_oz_blk;
string_alloc_t *pair_keys; // Pooled keys for pair hash.
khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
char *ref; // slice of current reference
int ref_start; // start position of current reference;
int ref_end; // end position of current reference;
int ref_id;
} cram_slice;
/*-----------------------------------------------------------------------------
 * Consider moving reference handling to cram_refs.[ch]
 */
// from fa.fai / samtools faidx files
//
// One reference sequence: where to find it (file name, offset, line layout)
// and, when loaded, the sequence itself.
typedef struct ref_entry {
char *name;
char *fn;
int64_t length;
int64_t offset;
int bases_per_line;
int line_length;
int64_t count; // for shared references so we know to dealloc seq
char *seq;
} ref_entry;
// Instantiates the khash type "refs": string key -> ref_entry pointer.
KHASH_MAP_INIT_STR(refs, ref_entry*)
// References structure: the set of known reference sequences, indexed both
// by name and by ID, plus the currently open reference file.  The structure
// can be shared between several cram_fd (see count and lock).
typedef struct {
string_alloc_t *pool; // String pool for holding filenames and SN vals
khash_t(refs) *h_meta; // ref_entry*, index by name
ref_entry **ref_id; // ref_entry*, index by ID
int nref; // number of ref_entry
char *fn; // current file opened
BGZF *fp; // and the BGZF handle to go with it.
int count; // how many cram_fd sharing this refs struct
pthread_mutex_t lock; // Mutex for multi-threaded updating
ref_entry *last; // Last queried sequence
int last_id; // Used in cram_ref_decr_locked to delay free
} refs_t;
/*-----------------------------------------------------------------------------
 * CRAM index
 *
 * Detect format by number of entries per line.
 * 5 => 1.0 (refid, start, nseq, C offset, slice)
 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
 *
 * Indices are stored in a nested containment list, which is trivial to set
 * up as the indices are on sorted data so we're appending to the nclist
 * in sorted order. Basically if a slice entirely fits within a previous
 * slice then we append to that slice's list. This is done recursively.
 *
 * Lists are sorted on two dimensions: ref id + slice coords.
 *
 * The "1.0" / "1.1" notes on the fields below say which index format
 * provides each value.
 */
typedef struct cram_index {
int nslice, nalloc; // total number of slices
struct cram_index *e; // array of size nslice
int refid; // 1.0 1.1
int start; // 1.0 1.1
int end; // 1.1
int nseq; // 1.0 - undocumented
int slice; // 1.0 landmark index, 1.1 landmark value
int len; // 1.1 - size of slice in bytes
int64_t offset; // 1.0 1.1
} cram_index;
/* A region on one reference: reference ID plus start and end coordinates.
 * Used for cram_fd.range. */
typedef struct {
int refid;
int start;
int end;
} cram_range;
/*-----------------------------------------------------------------------------
 */
/* CRAM File handle */
/* Node of a singly linked list of bam_seq_t pointer arrays (see
 * cram_fd.bl, which is guarded by cram_fd.bam_list_lock). */
typedef struct spare_bams {
bam_seq_t **bams;
struct spare_bams *next;
} spare_bams;
/*
 * State of one open CRAM file: the underlying stream, header, current
 * container, cached reference, compression settings and metrics, user
 * options, lookup tables, the index and the multi-threading machinery.
 */
typedef struct cram_fd {
struct hFILE *fp;
int mode; // 'r' or 'w'
int version; // see CRAM_MAJOR_VERS() / CRAM_MINOR_VERS()
cram_file_def *file_def;
SAM_hdr *header;
char *prefix;
int64_t record_counter;
int err;
// Most recent compression header decoded
//cram_block_compression_hdr *comp_hdr;
//cram_block_slice_hdr *slice_hdr;
// Current container being processed.
cram_container *ctr;
// positions for encoding or decoding
int first_base, last_base;
// cached reference portion
refs_t *refs; // ref meta-data structure
char *ref, *ref_free; // current portion held in memory
int ref_id;
int ref_start;
int ref_end;
char *ref_fn; // reference fasta filename
// compression level and metrics
int level;
cram_metrics *m[DS_END]; // one slot per cram_DS_ID value
// options
int decode_md; // Whether to export MD and NM tags
int verbose;
int seqs_per_slice;
int slices_per_container;
int embed_ref;
int no_ref;
int ignore_md5;
int use_bz2;
int use_rans;
int use_lzma;
int shared_ref;
unsigned int required_fields;
cram_range range;
// lookup tables, stored here so we can be trivially multi-threaded
unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
unsigned int cram_flag_swap[0x1000];// bam -> cram flags
unsigned char L1[256]; // ACGT{*} ->0123{4}
unsigned char L2[256]; // ACGTN{*}->01234{5}
char cram_sub_matrix[32][32]; // base substitution codes
int index_sz;
cram_index *index; // array, sizeof index_sz
off_t first_container;
int eof;
int last_slice; // number of recs encoded in last slice
int multi_seq;
int unsorted;
int empty_container; // Marker for EOF block
// thread pool
int own_pool;
t_pool *pool;
t_results_queue *rqueue;
pthread_mutex_t metrics_lock;
pthread_mutex_t ref_lock;
spare_bams *bl;
pthread_mutex_t bam_list_lock;
void *job_pending;
int ooc; // out of containers.
} cram_fd;
// Translation of required fields to cram data series
//
// One bit per data series, so values can be OR-ed into a mask (see
// CRAM_CIGAR and CRAM_SEQ).  CRAM_ALL has all 31 bits set.
enum cram_fields {
CRAM_BF = 0x00000001,
CRAM_AP = 0x00000002,
CRAM_FP = 0x00000004,
CRAM_RL = 0x00000008,
CRAM_DL = 0x00000010,
CRAM_NF = 0x00000020,
CRAM_BA = 0x00000040,
CRAM_QS = 0x00000080,
CRAM_FC = 0x00000100,
CRAM_FN = 0x00000200,
CRAM_BS = 0x00000400,
CRAM_IN = 0x00000800,
CRAM_RG = 0x00001000,
CRAM_MQ = 0x00002000,
CRAM_TL = 0x00004000,
CRAM_RN = 0x00008000,
CRAM_NS = 0x00010000,
CRAM_NP = 0x00020000,
CRAM_TS = 0x00040000,
CRAM_MF = 0x00080000,
CRAM_CF = 0x00100000,
CRAM_RI = 0x00200000,
CRAM_RS = 0x00400000,
CRAM_PD = 0x00800000,
CRAM_HC = 0x01000000,
CRAM_SC = 0x02000000,
CRAM_BB = 0x04000000,
CRAM_BB_len = 0x08000000,
CRAM_QQ = 0x10000000,
CRAM_QQ_len = 0x20000000,
CRAM_aux= 0x40000000,
CRAM_ALL= 0x7fffffff,
};
// CRAM_CIGAR: mask of the cram_fields bits needed to produce CIGAR opcodes,
// but not necessarily the implications of them. Eg FC/FP may encode a base
// difference, but we don't need to know what it is for CIGAR.
// If we have a soft-clip or insertion, we do need SC/IN though to know how
// long that array is.
#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
// CRAM_SEQ: everything in CRAM_CIGAR plus BA, QS, BS, AP, BB and QQ.
// CRAM_RL is listed again here although CRAM_CIGAR already contains it;
// OR-ing the same bit twice is harmless.
#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ)
/* BF bitfields: one distinct bit per flag, from 1 (FDUP) up to 256 (FPAIRED) */
/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
#define CRAM_FPAIRED 256
#define CRAM_FPROPER_PAIR 128
#define CRAM_FUNMAP 64
#define CRAM_FREVERSE 32
#define CRAM_FREAD1 16
#define CRAM_FREAD2 8
#define CRAM_FSECONDARY 4
#define CRAM_FQCFAIL 2
#define CRAM_FDUP 1
/*
 * One-character string literals holding the byte values 1 to 9 (octal
 * escapes \001 .. \011).
 * NOTE(review): presumably the string forms of the auxiliary data series
 * identifiers -- confirm against the code that uses them.
 */
#define DS_aux_S "\001"
#define DS_aux_OQ_S "\002"
#define DS_aux_BQ_S "\003"
#define DS_aux_BD_S "\004"
#define DS_aux_BI_S "\005"
#define DS_aux_FZ_S "\006"
#define DS_aux_oq_S "\007"
#define DS_aux_os_S "\010"
#define DS_aux_oz_S "\011"
/* NOTE(review): presumably the mate-flag bits (mate reversed / mate unmapped)
 * -- confirm against the encoder/decoder. */
#define CRAM_M_REVERSE 1
#define CRAM_M_UNMAP 2
/* CF bitfields: bits 0, 1 and 2 respectively */
#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
#define CRAM_FLAG_DETACHED (1<<1)
#define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
#ifdef __cplusplus
}
#endif
#endif /* _CRAM_STRUCTS_H_ */
htslib-1.2.1/cram/files.c 0000664 0000000 0000000 00000004714 12464172677 0015216 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 1994, 1996-1997, 2000, 2003 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1 Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2 Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include "cram/misc.h"
#include
#include
/* Alliant's Concentrix is hugely deficient */
/* Define things we require in this program */
/* Methinks S_IFMT and S_IFDIR aren't defined in POSIX */
/*
 * Fallback file-type tests for systems whose headers do not provide them:
 * each masks the mode with S_IFMT and compares against the type constant.
 * They are only defined when the system has not already done so.
 */
#ifndef S_ISDIR
#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
#endif /*!S_ISDIR*/
#ifndef S_ISREG
#define S_ISREG(m) (((m)&S_IFMT) == S_IFREG)
#endif /*!S_ISREG*/
/*
 * Tests whether 'fn' names a directory.
 *
 * Returns non-zero if stat() succeeds and reports a directory;
 *         0 otherwise (including when stat() fails).
 */
int is_directory(char * fn)
{
    struct stat sb;

    if (stat(fn, &sb) == 0)
        return S_ISDIR(sb.st_mode);

    return 0;
}
/*
 * Tests whether 'fn' names a regular file.
 *
 * Returns non-zero if stat() succeeds and reports a regular file;
 *         0 otherwise (including when stat() fails).
 */
int is_file(char * fn)
{
    struct stat sb;

    if (stat(fn, &sb) == 0)
        return S_ISREG(sb.st_mode);

    return 0;
}
/*
 * Tests whether 'fn' can be stat()ed, whatever kind of object it is.
 *
 * Returns 1 if stat() succeeds;
 *         0 otherwise.
 */
int file_exists(char * fn)
{
    struct stat sb;
    int rc = stat(fn, &sb);

    return rc == 0;
}
/*
 * Reports the size of 'fn' in bytes as given by stat().
 *
 * Returns the st_size value converted to int;
 *         0 if stat() fails (indistinguishable from an empty file).
 */
int file_size(char * fn)
{
    struct stat sb;

    return stat(fn, &sb) == 0 ? sb.st_size : 0;
}
htslib-1.2.1/cram/mFILE.c 0000664 0000000 0000000 00000036506 12464172677 0015014 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2005-2006, 2008-2009, 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "cram/os.h"
#include "cram/mFILE.h"
#include "cram/vlen.h"
/*
* This file contains memory-based versions of the most commonly used
* (by io_lib) stdio functions.
*
* Actual file IO takes place either on opening or closing an mFILE.
*
* Coupled to this are a bunch of rather scary macros which can be obtained
* by including stdio_hack.h. It is recommended though that you use mFILE.h
* instead and replace fopen with mfopen (etc). This is more or less
* mandatory if you wish to use both FILE and mFILE structs in a single file.
*/
/*
 * Lazily created mFILE wrappers for the standard streams: index 0 is stdin,
 * 1 is stdout and 2 is stderr. An entry stays NULL until mstdin(), mstdout()
 * or mstderr() respectively is first called.
 */
static mFILE *m_channel[3]; /* stdin, stdout and stderr fakes */
/*
 * Reads the remainder of fp into memory. If 'fn' is non-NULL it is the
 * filename associated with fp; when it can be stat()ed its size is used to
 * allocate the buffer up front and to read it in a single call. Otherwise we
 * use successive reads, growing the buffer 8192 bytes at a time, until EOF.
 *
 * 'binary' only matters on Windows, where it selects the stream's
 * binary/text translation mode before reading.
 *
 * Returns a malloced buffer on success, holding *size bytes. The buffer is
 *         never NULL on success, even when *size is 0; the caller frees it.
 *         NULL on failure (out of memory, or the stream's error indicator
 *         is set after a read); *size is then set to 0.
 */
static char *mfload(FILE *fp, const char *fn, size_t *size, int binary) {
    struct stat sb;
    char *data = NULL;
    size_t allocated = 0, used = 0;
    size_t bufsize = 8192;  /* growth step; replaced by the file size if known */
    size_t expected = 0;    /* size reported by stat(), valid if have_size */
    int have_size = 0;

#ifdef _WIN32
    if (binary)
        _setmode(_fileno(fp), _O_BINARY);
    else
        _setmode(_fileno(fp), _O_TEXT);
#else
    (void)binary;
#endif

    if (fn && -1 != stat(fn, &sb)) {
        have_size = 1;
        expected = (size_t)sb.st_size;
        bufsize = expected;
        allocated = expected;
        /* Ask for at least one byte so an empty file still yields a buffer. */
        data = malloc(allocated ? allocated : 1);
        if (NULL == data)
            goto fail;
    }

    do {
        size_t len;

        if (used + bufsize > allocated) {
            /* Keep the old pointer until realloc is known to have worked. */
            char *grown = realloc(data, allocated + bufsize);
            if (NULL == grown)
                goto fail;
            data = grown;
            allocated += bufsize;
        }

        len = fread(data + used, 1, allocated - used, fp);
        used += len;

        /*
         * A read error never sets the EOF indicator, so without this check
         * the loop below would spin forever.
         */
        if (ferror(fp))
            goto fail;
    } while (!feof(fp) && (!have_size || used < expected));

    *size = used;
    return data;

 fail:
    free(data);
    *size = 0;
    return NULL;
}
/*
* Creates and returns m_channel[0].
* We initialise this on the first attempted read, which then slurps in
* all of stdin until EOF is met.
*/
mFILE *mstdin(void) {
if (m_channel[0])
return m_channel[0];
m_channel[0] = mfcreate(NULL, 0);
if (NULL == m_channel[0]) return NULL;
m_channel[0]->fp = stdin;
return m_channel[0];
}
static void init_mstdin(void) {
static int done_stdin = 0;
if (done_stdin)
return;
m_channel[0]->data = mfload(stdin, NULL, &m_channel[0]->size, 1);
m_channel[0]->mode = MF_READ;
done_stdin = 1;
}
/*
* Creates and returns m_channel[1]. This is the fake for stdout. It starts as
* an empty buffer which is physically written out only when mfflush or
* mfclose are called.
*/
mFILE *mstdout(void) {
if (m_channel[1])
return m_channel[1];
m_channel[1] = mfcreate(NULL, 0);
if (NULL == m_channel[1]) return NULL;
m_channel[1]->fp = stdout;
m_channel[1]->mode = MF_WRITE;
return m_channel[1];
}
/*
* Stderr as an mFILE.
* The code handles stderr by returning m_channel[2], but also checking
* for stderr in fprintf (the common usage of it) to auto-flush.
*/
mFILE *mstderr(void) {
if (m_channel[2])
return m_channel[2];
m_channel[2] = mfcreate(NULL, 0);
if (NULL == m_channel[2]) return NULL;
m_channel[2]->fp = stderr;
m_channel[2]->mode = MF_WRITE;
return m_channel[2];
}
/*
 * Wraps an existing memory buffer in a new mFILE.
 *
 * The buffer is adopted rather than copied: 'data' becomes the mFILE's own
 * storage (mfdestroy() will free it) and 'size' is taken as both its
 * allocated and its used length. The new mFILE has no FILE* attached, is
 * positioned at offset 0 and is open for both reading and writing.
 *
 * Returns NULL if the mFILE structure cannot be allocated.
 */
mFILE *mfcreate(char *data, int size) {
    mFILE *mf = malloc(sizeof *mf);

    if (mf != NULL) {
        mf->data      = data;
        mf->size      = size;
        mf->alloced   = size;
        mf->offset    = 0;
        mf->flush_pos = 0;
        mf->eof       = 0;
        mf->fp        = NULL;
        mf->mode      = MF_READ | MF_WRITE;
    }

    return mf;
}
/*
 * Re-targets an existing mFILE at a new buffer: the old buffer is freed,
 * 'data' (of 'size' bytes) is adopted in its place, and the file is rewound
 * with its EOF flag and flush position reset. fp and mode are left alone.
 */
void mfrecreate(mFILE *mf, char *data, int size) {
    free(mf->data);     /* free(NULL) is a no-op, so no guard is needed */

    mf->data      = data;
    mf->alloced   = size;
    mf->size      = size;
    mf->offset    = 0;
    mf->flush_pos = 0;
    mf->eof       = 0;
}
/*
 * Creates a new mFILE to contain the contents of the FILE pointer.
 * This mFILE is purely for in-memory operations and has no links to the
 * original FILE* it came from. It also doesn't close the FILE pointer.
 * Consider using mfreopen() if you need different behaviour.
 *
 * Returns mFILE * on success
 *         NULL on failure.
 */
mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp) {
    /* mfreopen() does the loading; we then cut the tie to the stream. */
    mFILE *mf = mfreopen(path, mode_str, fp);

    if (mf != NULL)
        mf->fp = NULL;

    return mf;
}
/*
 * Converts a FILE * to an mFILE *.
 * Use this for wrapper functions to turn external prototypes requiring
 * FILE * as an argument into internal code using mFILE *.
 *
 * mode_str is scanned for the following characters:
 *   r = read file contents (if truncated => don't read)
 *   w = write on close (and truncate)
 *   a = write, positioned at the end of the buffer
 *   b = binary
 *   x = position at same location as the original fp, don't seek on flush
 *   + = read and write; combined with 'a' the contents are loaded too
 *
 * Returns mFILE * on success
 *         NULL on failure (no r, w or a in mode_str, or out of memory).
 */
mFILE *mfreopen(const char *path, const char *mode_str, FILE *fp) {
    mFILE *mf;
    int mode = 0;
    const int has_w  = strchr(mode_str, 'w') != NULL;
    const int has_a  = strchr(mode_str, 'a') != NULL;
    const int binary = strchr(mode_str, 'b') != NULL;
    const int modex  = strchr(mode_str, 'x') != NULL;
    int do_read  = strchr(mode_str, 'r') != NULL;
    int do_write = has_w || has_a;

    if (do_read)
        mode |= MF_READ;
    if (has_w)
        mode |= MF_WRITE | MF_TRUNC;
    if (has_a)
        mode |= MF_WRITE | MF_APPEND;
    if (binary)
        mode |= MF_BINARY;
    if (strchr(mode_str, '+')) {
        do_write = 1;
        mode |= MF_READ | MF_WRITE;
        if (has_a)
            do_read = 1;
    }

    if (!do_read && !do_write) {
        fprintf(stderr, "Must specify either r, w or a for mode\n");
        return NULL;
    }

    mf = mfcreate(NULL, 0);
    if (NULL == mf)
        return NULL;

    /* Only slurp the existing contents when we are not truncating. */
    if (do_read && !(mode & MF_TRUNC)) {
        mf->data = mfload(fp, path, &mf->size, binary);
        mf->alloced = mf->size;
        if (!has_a)
            fseek(fp, 0, SEEK_SET);
    }

    mf->fp = fp;
    mf->mode = modex ? (mode | MF_MODEX) : mode;

    if (has_a) {
        /* Everything loaded so far is already on disk. */
        mf->flush_pos = mf->size;
        fseek(fp, 0, SEEK_END);
    }

    return mf;
}
/*
 * Opens a file with fopen() and wraps it in an mFILE via mfreopen(). If we
 * have read access (r or a+) then the entire file is loaded into memory.
 * With write access nothing is physically written until mfflush() or
 * mfclose() is called.
 *
 * Returns mFILE * on success
 *         NULL on failure; the FILE* opened here is closed again in that
 *         case so it does not leak, and errno is left as it was when
 *         mfreopen() failed.
 */
mFILE *mfopen(const char *path, const char *mode) {
    FILE *fp;
    mFILE *mf;

    if (NULL == (fp = fopen(path, mode)))
        return NULL;

    if (NULL == (mf = mfreopen(path, mode, fp))) {
        int saved_errno = errno;    /* fclose() may overwrite errno */
        fclose(fp);
        errno = saved_errno;
        return NULL;
    }

    return mf;
}
/*
 * Closes an mFILE: pending data is flushed, the attached FILE* (if any) is
 * closed, and the mFILE together with its buffer is freed.
 *
 * Stdout is handled by the flush, which writes to stdout if appropriate.
 *
 * Returns 0 on success
 *        -1 if mf is NULL.
 */
int mfclose(mFILE *mf) {
    /* mfdetach() flushes and closes the stream; it only fails for NULL. */
    if (0 != mfdetach(mf))
        return -1;

    mfdestroy(mf);
    return 0;
}
/*
 * Flushes the mFILE and closes the file pointer contained within it,
 * without destroying the in-memory data. Afterwards mf->fp is NULL.
 *
 * Returns 0 on success
 *        -1 if mf is NULL.
 */
int mfdetach(mFILE *mf) {
    FILE *stream;

    if (NULL == mf)
        return -1;

    mfflush(mf);

    stream = mf->fp;
    if (stream != NULL) {
        fclose(stream);
        mf->fp = NULL;
    }

    return 0;
}
/*
 * Frees an mFILE structure and its data buffer. Nothing is flushed and the
 * attached FILE*, if any, is not closed.
 *
 * Returns 0 on success
 *        -1 if mf is NULL.
 */
int mfdestroy(mFILE *mf) {
    if (NULL == mf)
        return -1;

    free(mf->data);     /* free(NULL) is a no-op */
    free(mf);

    return 0;
}
/*
 * Steals the data out of an mFILE. The mFILE itself is flushed, closed and
 * freed; only its buffer survives, and it is up to the caller to free that.
 * If size_out is not NULL, mf->size (as it was before the flush) is stored
 * in it. This is more-or-less the opposite of mfcreate().
 *
 * Returns the buffer, or NULL if mf is NULL.
 */
void *mfsteal(mFILE *mf, size_t *size_out) {
    void *stolen;

    if (NULL == mf)
        return NULL;

    /* Capture these first: flushing may reset the size. */
    if (size_out != NULL)
        *size_out = mf->size;
    stolen = mf->data;

    mfdetach(mf);

    /* Hide the buffer so that mfdestroy() does not free it. */
    mf->data = NULL;
    mfdestroy(mf);

    return stolen;
}
/*
 * Seek/tell functions. Nothing more than updating and reporting an
 * in-memory index. NB we can seek on stdin or stdout even provided we
 * haven't been flushing.
 *
 * 'whence' is SEEK_SET, SEEK_CUR or SEEK_END, as for fseek(). Seeking
 * beyond the end of the data is allowed. A successful seek clears the EOF
 * flag.
 *
 * Returns 0 on success
 *        -1 on failure with errno set to EINVAL: 'whence' is not recognised,
 *           or the resulting position would be negative or unrepresentable.
 *           The mFILE is left untouched in that case.
 */
int mfseek(mFILE *mf, long offset, int whence) {
    size_t base;

    switch (whence) {
    case SEEK_SET:
        base = 0;
        break;
    case SEEK_CUR:
        base = mf->offset;
        break;
    case SEEK_END:
        base = mf->size;
        break;
    default:
        errno = EINVAL;
        return -1;
    }

    if (offset < 0) {
        /*
         * mf->offset is unsigned, so a position before the start would wrap
         * round to a huge value. Computed this way to be safe for LONG_MIN.
         */
        size_t back = (size_t)(-(offset + 1)) + 1;
        if (back > base) {
            errno = EINVAL;
            return -1;
        }
        mf->offset = base - back;
    } else {
        if ((size_t)offset > (size_t)-1 - base) {
            errno = EINVAL;
            return -1;
        }
        mf->offset = base + (size_t)offset;
    }

    mf->eof = 0;
    return 0;
}
/* Reports the current in-memory position of mf, converted to long. */
long mftell(mFILE *mf) {
    long pos = mf->offset;

    return pos;
}
/* Moves mf back to the start of its data and clears its EOF flag. */
void mrewind(mFILE *mf) {
    mf->eof = 0;
    mf->offset = 0;
}
/*
 * mftruncate is not directly a translation of ftruncate as the latter
 * takes a file descriptor instead of a FILE *. It performs the analogous
 * role though: the mFILE's size is set to 'offset', or to the current file
 * offset if 'offset' is -1. A file offset left beyond the new size is
 * pulled back to it. The buffer itself is not resized.
 */
void mftruncate(mFILE *mf, long offset) {
    if (offset == -1)
        mf->size = mf->offset;
    else
        mf->size = offset;

    if (mf->size < mf->offset)
        mf->offset = mf->size;
}
/* Reports the EOF flag of mf: non-zero once it has been set. */
int mfeof(mFILE *mf) {
    int at_eof = mf->eof;

    return at_eof;
}
/*
 * mFILE read/write functions. Basically these turn fread/fwrite syntax
 * into memcpy statements, with appropriate memory handling for writing.
 *
 * mfread copies up to nmemb items of 'size' bytes each from the current
 * offset into ptr and advances the offset by the number of bytes copied.
 * Reading from the stdin mFILE first loads all of stdin into memory.
 *
 * The EOF flag is set whenever fewer bytes are available than were asked
 * for, including when the offset is already at or past the end of the data
 * (matching mfgetc and mfgets). A request for zero bytes returns 0 and
 * leaves the EOF flag alone.
 *
 * Returns the number of complete items read.
 */
size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
    size_t want, avail, len;

    if (mf == m_channel[0]) init_mstdin();

    /* Nothing requested: do not touch the stream state. */
    if (size == 0 || nmemb == 0)
        return 0;

    if (mf->size <= mf->offset) {
        mf->eof = 1;
        return 0;
    }
    avail = mf->size - mf->offset;

    /* If size * nmemb would overflow, the request exceeds any buffer. */
    want = nmemb > (size_t)-1 / size ? (size_t)-1 : size * nmemb;
    len = want <= avail ? want : avail;

    memcpy(ptr, &mf->data[mf->offset], len);
    mf->offset += len;

    if (len != want)
        mf->eof = 1;

    return len / size;
}
/*
 * Writes nmemb items of 'size' bytes each from ptr into the mFILE at the
 * current offset, growing the buffer as required. In append mode every
 * write is forced to the end of the data. If the offset lies beyond the
 * current end of the data (after a seek past the end), the gap is filled
 * with zero bytes so that no uninitialised memory is later flushed.
 *
 * Nothing reaches the underlying FILE* until mfflush()/mfclose().
 *
 * Returns nmemb on success
 *         0 if the mFILE is not open for writing or memory runs out.
 */
size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf) {
    size_t nbytes = size * nmemb;
    size_t needed;

    if (!(mf->mode & MF_WRITE))
        return 0;

    /* Append mode => forced all writes to end of file */
    if (mf->mode & MF_APPEND)
        mf->offset = mf->size;

    /* Make sure we have enough room, growing by doubling in one realloc */
    needed = mf->offset + nbytes;
    if (needed > mf->alloced) {
        size_t new_alloced = mf->alloced ? mf->alloced : 1024;
        void *new_data;

        while (new_alloced < needed) {
            if (new_alloced > (size_t)-1 / 2) {
                /* Doubling would wrap; settle for exactly what we need. */
                new_alloced = needed;
                break;
            }
            new_alloced *= 2;
        }

        new_data = realloc(mf->data, new_alloced);
        if (NULL == new_data) return 0;
        mf->alloced = new_alloced;
        mf->data = new_data;
    }

    /* Zero any gap between the old end of data and the write position */
    if (mf->offset > mf->size) {
        memset(mf->data + mf->size, 0, mf->offset - mf->size);
        if (mf->size < mf->flush_pos)
            mf->flush_pos = mf->size;
    }

    /* Record where we need to reflush from */
    if (mf->offset < mf->flush_pos)
        mf->flush_pos = mf->offset;

    /* Copy the data over */
    if (nbytes)
        memcpy(&mf->data[mf->offset], ptr, nbytes);
    mf->offset += nbytes;
    if (mf->size < mf->offset)
        mf->size = mf->offset;

    return nmemb;
}
/*
 * Reads one character from mf. Reading from the stdin mFILE first loads all
 * of stdin into memory.
 *
 * Returns the character as an unsigned char value on success
 *         -1 at the end of the data, after setting the EOF flag.
 */
int mfgetc(mFILE *mf) {
    if (mf == m_channel[0]) init_mstdin();

    if (mf->offset >= mf->size) {
        mf->eof = 1;
        return -1;
    }

    return (unsigned char)mf->data[mf->offset++];
}
/*
 * Pushes a character back onto mf by stepping the offset back one byte and
 * storing c there (the underlying buffer is overwritten).
 *
 * As with ungetc(), a successful push-back clears the EOF flag, and pushing
 * back EOF (-1) fails without changing anything. That makes it safe to pass
 * the result of a failed mfgetc() straight back in.
 *
 * Returns c on success
 *         -1 on failure (c is -1, or the offset is already 0); the mFILE is
 *            left unchanged.
 */
int mungetc(int c, mFILE *mf) {
    if (c == -1 || mf->offset == 0)
        return -1;

    mf->data[--mf->offset] = (char)c;
    mf->eof = 0;

    return c;
}
/*
 * Reads one line from mf into s: at most size-1 characters, stopping after
 * a newline (which is kept) or at the end of the data, where the EOF flag
 * is set. The result is always NUL terminated. Reading from the stdin mFILE
 * first loads all of stdin into memory.
 *
 * Returns s if at least one character was read
 *         NULL if nothing was read, or if s is NULL or size is not positive
 *              (in which case s is not written to at all).
 */
char *mfgets(char *s, int size, mFILE *mf) {
    int i = 0;

    /* Without room for even the terminator there is nothing we can store. */
    if (NULL == s || size <= 0)
        return NULL;

    if (mf == m_channel[0]) init_mstdin();

    while (i < size - 1) {
        if (mf->offset >= mf->size) {
            mf->eof = 1;
            break;
        }
        s[i] = mf->data[mf->offset++];
        if (s[i++] == '\n')
            break;
    }

    s[i] = 0;
    return i ? s : NULL;
}
/*
 * Flushes an mFILE. If this is a real open of a file in write mode then
 * mFILE->fp will be set. We then write out any new data in mFILE since the
 * last flush. We cannot tell what may have been modified as we don't keep
 * track of that, so we typically rewrite out the entire file contents between
 * the last flush_pos and the end of file.
 *
 * For stderr/stdout we also reset the offsets so we cannot modify things
 * we've already output.
 *
 * Returns 0 on success, or when there is no FILE* attached
 *        -1 if fwrite, fflush or ftruncate fails.
 */
int mfflush(mFILE *mf) {
/* A purely in-memory mFILE has nowhere to flush to */
if (!mf->fp)
return 0;
/* FIXME: only do this when opened in write mode */
if (mf == m_channel[1] || mf == m_channel[2]) {
if (mf->flush_pos < mf->size) {
size_t bytes = mf->size - mf->flush_pos;
if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
return -1;
if (0 != fflush(mf->fp))
return -1;
}
/* Stdout & stderr are non-seekable streams so throw away the data */
mf->offset = mf->size = mf->flush_pos = 0;
}
/*
 * only flush when opened in write mode.
 * mstdout()/mstderr() set MF_WRITE, so those two also pass through here,
 * but with size == flush_pos == 0 there is nothing left to write.
 */
if (mf->mode & MF_WRITE) {
if (mf->flush_pos < mf->size) {
size_t bytes = mf->size - mf->flush_pos;
/* Unless opened with 'x', reposition fp to where the unflushed data begins.
 * NOTE(review): the fseek result is not checked. */
if (!(mf->mode & MF_MODEX)) {
fseek(mf->fp, mf->flush_pos, SEEK_SET);
}
if (fwrite(mf->data + mf->flush_pos, 1, bytes, mf->fp) < bytes)
return -1;
if (0 != fflush(mf->fp))
return -1;
}
/* Cut the file off at the current position of fp; skipped when ftell
 * cannot report a position */
if (ftell(mf->fp) != -1 &&
ftruncate(fileno(mf->fp), ftell(mf->fp)) == -1)
return -1;
/* Everything up to size is now written out */
mf->flush_pos = mf->size;
}
return 0;
}
/*
 * printf() for an mFILE: formats into the in-memory buffer at the current
 * offset and advances the offset past the text written.
 *
 * vflen() is used to estimate how many additional bytes of storage are
 * needed. The formatting itself is done with vsnprintf(), bounded by the
 * space actually available, so a low estimate cannot overrun the buffer;
 * in that case the buffer is grown to the exact requirement and the text
 * is formatted again.
 *
 * Like mfwrite(), overwriting data before flush_pos moves flush_pos back so
 * that the change is included in the next flush. Output to the stderr
 * mFILE is flushed immediately.
 *
 * Returns the number of characters written (as vsnprintf reports it)
 *         -1 if memory runs out or the automatic stderr flush fails.
 */
int mfprintf(mFILE *mf, char *fmt, ...) {
    int ret;
    size_t est_length;
    va_list args;

    va_start(args, fmt);
    est_length = vflen(fmt, args);
    va_end(args);

    for (;;) {
        size_t avail;

        while (est_length + mf->offset > mf->alloced) {
            size_t new_alloced = mf->alloced ? mf->alloced * 2 : 1024;
            void * new_data    = realloc(mf->data, new_alloced);
            if (NULL == new_data) return -1;
            mf->alloced = new_alloced;
            mf->data    = new_data;
        }
        avail = mf->alloced - mf->offset;

        va_start(args, fmt);
        ret = vsnprintf(mf->data ? mf->data + mf->offset : NULL,
                        avail, fmt, args);
        va_end(args);

        /* Done on error, or when text and terminating NUL both fitted. */
        if (ret < 0 || (size_t)ret < avail)
            break;

        /* The estimate was too small: we now know the exact requirement. */
        est_length = (size_t)ret + 1;
    }

    if (ret > 0) {
        /* Record where we need to reflush from */
        if (mf->offset < mf->flush_pos)
            mf->flush_pos = mf->offset;

        mf->offset += ret;
        if (mf->size < mf->offset)
            mf->size = mf->offset;
    }

    if (mf->fp == stderr) {
        /* Auto-flush for stderr */
        if (0 != mfflush(mf)) return -1;
    }

    return ret;
}
/*
 * Converts an mFILE from binary to ascii mode by replacing all
 * cr-nl with nl. The conversion is done in place and the size shrinks by
 * one for every pair replaced. An empty mFILE stays empty.
 *
 * Primarily used on windows when we've uncompressed a binary file which
 * happens to be a text file (eg Experiment File). Previously we would have
 * seeked back to the start and used _setmode(fileno(fp), _O_TEXT).
 *
 * Side effect: resets offset and flush_pos back to the start.
 */
void mfascii(mFILE *mf) {
    size_t p1, p2;

    /*
     * The scan starts at index 1 so that it can look back one byte; for
     * empty data it must be skipped entirely, otherwise the size would
     * come out as 1.
     */
    if (mf->size > 0) {
        for (p1 = p2 = 1; p1 < mf->size; p1++, p2++) {
            if (mf->data[p1] == '\n' && mf->data[p1-1] == '\r') {
                p2--; /* delete the \r */
            }
            mf->data[p2] = mf->data[p1];
        }
        mf->size = p2;
    }

    mf->offset = mf->flush_pos = 0;
}
htslib-1.2.1/cram/mFILE.h 0000664 0000000 0000000 00000005667 12464172677 0015025 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2005-2006, 2008-2009 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _MFILE_H_
#define _MFILE_H_
#include
#ifdef __cplusplus
extern "C" {
#endif
/*
 * An in-memory file: the contents live in 'data', and reads, writes and
 * seeks operate on that buffer. 'fp', when set, is the real stream the
 * buffer was loaded from and/or is flushed to.
 */
typedef struct {
/* Underlying stdio stream, or NULL for a purely in-memory mFILE */
FILE *fp;
/* Buffer holding the file contents; freed by mfdestroy() */
char *data;
/* Number of bytes allocated for data */
size_t alloced;
/* Set to 1 when a read hits the end of the data; reported by mfeof() */
int eof;
int mode; /* open mode in MF_?? define bit pattern */
/* Number of bytes of data in use, i.e. the length of the file */
size_t size;
/* Current read/write position within data */
size_t offset;
/* Data in [flush_pos, size) has not been written to fp yet */
size_t flush_pos;
} mFILE;
/* Bit flags stored in mFILE.mode */
#define MF_READ 1
#define MF_WRITE 2
#define MF_APPEND 4
#define MF_BINARY 8
#define MF_TRUNC 16
#define MF_MODEX 32
/* Opening, closing and destroying */
mFILE *mfreopen(const char *path, const char *mode, FILE *fp);
mFILE *mfopen(const char *path, const char *mode);
int mfdetach(mFILE *mf);
int mfclose(mFILE *mf);
int mfdestroy(mFILE *mf);
/* Positioning and status */
int mfseek(mFILE *mf, long offset, int whence);
long mftell(mFILE *mf);
void mrewind(mFILE *mf);
void mftruncate(mFILE *mf, long offset);
int mfeof(mFILE *mf);
/* Reading and writing */
size_t mfread(void *ptr, size_t size, size_t nmemb, mFILE *mf);
size_t mfwrite(void *ptr, size_t size, size_t nmemb, mFILE *mf);
int mfgetc(mFILE *mf);
int mungetc(int c, mFILE *mf);
/* Moving memory buffers into and out of an mFILE */
mFILE *mfcreate(char *data, int size);
mFILE *mfcreate_from(const char *path, const char *mode_str, FILE *fp);
void mfrecreate(mFILE *mf, char *data, int size);
void *mfsteal(mFILE *mf, size_t *size_out);
char *mfgets(char *s, int size, mFILE *mf);
int mfflush(mFILE *mf);
int mfprintf(mFILE *mf, char *fmt, ...);
/* mFILE stand-ins for the standard streams, created on first use */
mFILE *mstdin(void);
mFILE *mstdout(void);
mFILE *mstderr(void);
/* Replaces cr-nl with nl in place */
void mfascii(mFILE *mf);
#ifdef __cplusplus
}
#endif
#endif /* _MFILE_H_ */
htslib-1.2.1/cram/md5.c 0000664 0000000 0000000 00000020415 12464172677 0014575 0 ustar 00root root 0000000 0000000 /*
* This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
* MD5 Message-Digest Algorithm (RFC 1321).
*
* Homepage:
* http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
*
* Author:
* Alexander Peslyak, better known as Solar Designer
*
* This software was written by Alexander Peslyak in 2001. No copyright is
* claimed, and the software is hereby placed in the public domain.
* In case this attempt to disclaim copyright and place the software in the
* public domain is deemed null and void, then the software is
* Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
* general public under the following terms:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* There's ABSOLUTELY NO WARRANTY, express or implied.
*
* (This is a heavily cut-down "BSD license".)
*
* This differs from Colin Plumb's older public domain implementation in that
* no exactly 32-bit integer data type is required (any 32-bit or wider
* unsigned integer data type will do), there's no compile-time endianness
* configuration, and the function prototypes match OpenSSL's. No code from
* Colin Plumb's implementation has been reused; this comment merely compares
* the properties of the two independent implementations.
*
* The primary goals of this implementation are portability and ease of use.
* It is meant to be fast, but not as fast as possible. Some known
* optimizations are not included to reduce source code size and avoid
* compile-time configuration.
*/
#ifndef HAVE_OPENSSL
#include
#include "md5.h"
/*
* The basic MD5 functions.
*
* F and G are optimized compared to their RFC 1321 definitions for
* architectures that lack an AND-NOT instruction, just like in Colin Plumb's
* implementation.
*/
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | ~(z)))
/*
* The MD5 transformation for all four rounds.
*
* STEP expands to three separate statements that update (a) in place: add
* f(b, c, d) + x + t, rotate left by s bits within 32 bits, then add (b).
* It is not wrapped in do { } while (0), so it must only be used where a
* sequence of statements is valid. The "& 0xffffffff" keeps the rotate
* correct when MD5_u32plus is wider than 32 bits.
*/
#define STEP(f, a, b, c, d, x, t, s) \
(a) += f((b), (c), (d)) + (x) + (t); \
(a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
(a) += (b);
/*
* SET reads 4 input bytes in little-endian byte order and stores them
* in a properly aligned word in host byte order.
*
* The check for little-endian architectures that tolerate unaligned
* memory accesses is just an optimization. Nothing will break if it
* doesn't work.
*
* Both macros rely on variables named 'ptr' (the input bytes) and, in the
* portable variant, 'ctx' being in scope at the point of use. GET(n)
* yields word n again: on the listed architectures by re-reading the input,
* otherwise from ctx->block[] where SET(n) stored it.
* NOTE(review): the fast path reads the byte buffer through a
* MD5_u32plus pointer without regard to alignment or aliasing.
*/
#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
#define SET(n) \
(*(MD5_u32plus *)&ptr[(n) * 4])
#define GET(n) \
SET(n)
#else
#define SET(n) \
(ctx->block[(n)] = \
(MD5_u32plus)ptr[(n) * 4] | \
((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
#define GET(n) \
(ctx->block[(n)])
#endif
/*
* This processes one or more 64-byte data blocks, but does NOT update
* the bit counters. There are no alignment requirements.
*
* 'size' is the number of bytes to process and must be a non-zero multiple
* of 64: the loop always runs once and only stops when size reaches
* exactly zero.
*
* Returns a pointer to the first byte after the data consumed.
*/
static void *body(MD5_CTX *ctx, void *data, unsigned long size)
{
unsigned char *ptr;
MD5_u32plus a, b, c, d;
MD5_u32plus saved_a, saved_b, saved_c, saved_d;
ptr = data;
/* Work on local copies of the running state */
a = ctx->a;
b = ctx->b;
c = ctx->c;
d = ctx->d;
do {
/* Remember the state at the start of this block */
saved_a = a;
saved_b = b;
saved_c = c;
saved_d = d;
/* Round 1: SET() decodes each of the 16 input words as it is first used */
STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
/* Round 2: from here on the words are fetched again with GET() */
STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
/* Round 3 */
STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)
/* Round 4 */
STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
/* Fold this block's result into the state it started from */
a += saved_a;
b += saved_b;
c += saved_c;
d += saved_d;
ptr += 64;
} while (size -= 64);
/* Store the updated state back into the context */
ctx->a = a;
ctx->b = b;
ctx->c = c;
ctx->d = d;
return ptr;
}
/*
 * Prepares ctx for a new digest: the byte counters are zeroed and the four
 * state words are set to their starting constants. The buffer and block
 * members are not touched.
 */
void MD5_Init(MD5_CTX *ctx)
{
    ctx->lo = 0;
    ctx->hi = 0;

    ctx->a = 0x67452301;
    ctx->b = 0xefcdab89;
    ctx->c = 0x98badcfe;
    ctx->d = 0x10325476;
}
/*
 * Feeds 'size' bytes at 'data' into the digest.
 *
 * The running byte count is kept as 29 bits in ctx->lo with the overflow in
 * ctx->hi. Input is consumed in 64-byte blocks: a partly filled
 * ctx->buffer left by an earlier call is topped up and processed first,
 * then whole blocks are processed straight from 'data', and whatever
 * remains (fewer than 64 bytes) is kept in ctx->buffer for the next call.
 */
void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size)
{
    const MD5_u32plus old_lo = ctx->lo;
    unsigned char *in = data;
    unsigned long pending, room;

    ctx->lo = (old_lo + size) & 0x1fffffff;
    if (ctx->lo < old_lo)
        ctx->hi++;
    ctx->hi += size >> 29;

    /* Bytes already waiting in the buffer from an earlier call */
    pending = old_lo & 0x3f;

    if (pending) {
        room = 64 - pending;

        if (size < room) {
            /* Still short of a full block: just stash the new bytes */
            memcpy(&ctx->buffer[pending], in, size);
            return;
        }

        memcpy(&ctx->buffer[pending], in, room);
        in += room;
        size -= room;
        body(ctx, ctx->buffer, 64);
    }

    if (size >= 64) {
        in = body(ctx, in, size & ~(unsigned long)0x3f);
        size &= 0x3f;
    }

    memcpy(ctx->buffer, in, size);
}
/*
 * Finishes the digest and writes the 16 result bytes to 'result'.
 *
 * The buffered data is terminated with a 0x80 byte and zero padded so that
 * the last 8 bytes of a block remain for the message length in bits
 * (an extra block is processed first if there is not enough room). The
 * length and the four state words are written out least significant byte
 * first. ctx is wiped afterwards, so it needs MD5_Init() before reuse.
 */
void MD5_Final(unsigned char *result, MD5_CTX *ctx)
{
    unsigned long used, available;
    int i;

    used = ctx->lo & 0x3f;
    ctx->buffer[used++] = 0x80;
    available = 64 - used;

    if (available < 8) {
        /* No room for the length: pad out this block and start another */
        memset(&ctx->buffer[used], 0, available);
        body(ctx, ctx->buffer, 64);
        used = 0;
        available = 64;
    }

    memset(&ctx->buffer[used], 0, available - 8);

    /* Convert the low part of the count from bytes to bits */
    ctx->lo <<= 3;
    for (i = 0; i < 4; i++) {
        ctx->buffer[56 + i] = ctx->lo >> (8 * i);
        ctx->buffer[60 + i] = ctx->hi >> (8 * i);
    }

    body(ctx, ctx->buffer, 64);

    for (i = 0; i < 4; i++) {
        result[i]      = ctx->a >> (8 * i);
        result[4 + i]  = ctx->b >> (8 * i);
        result[8 + i]  = ctx->c >> (8 * i);
        result[12 + i] = ctx->d >> (8 * i);
    }

    memset(ctx, 0, sizeof(*ctx));
}
#endif
htslib-1.2.1/cram/md5.h 0000664 0000000 0000000 00000002707 12464172677 0014606 0 ustar 00root root 0000000 0000000 #ifdef __cplusplus
extern "C" {
#endif
/*
* This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
* MD5 Message-Digest Algorithm (RFC 1321).
*
* Homepage:
* http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
*
* Author:
* Alexander Peslyak, better known as Solar Designer
*
* This software was written by Alexander Peslyak in 2001. No copyright is
* claimed, and the software is hereby placed in the public domain.
* In case this attempt to disclaim copyright and place the software in the
* public domain is deemed null and void, then the software is
* Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
* general public under the following terms:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* There's ABSOLUTELY NO WARRANTY, express or implied.
*
* See md5.c for more information.
*/
#ifdef HAVE_OPENSSL
#include
#elif !defined(_MD5_H)
#define _MD5_H
/* Any 32-bit or wider unsigned integer data type will do */
typedef unsigned int MD5_u32plus;
/* Working state of one MD5 computation */
typedef struct {
/* Count of bytes hashed so far: low 29 bits in lo, the remainder in hi */
MD5_u32plus lo, hi;
/* The four words of running digest state */
MD5_u32plus a, b, c, d;
/* Input bytes that do not yet make up a complete 64-byte block */
unsigned char buffer[64];
/* Decoded words of the current input block (used by the portable SET/GET) */
MD5_u32plus block[16];
} MD5_CTX;
/* Start a new digest */
extern void MD5_Init(MD5_CTX *ctx);
/* Add 'size' bytes at 'data' to the digest */
extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size);
/* Write the 16-byte digest to 'result' and wipe ctx */
extern void MD5_Final(unsigned char *result, MD5_CTX *ctx);
#endif
#ifdef __cplusplus
}
#endif
htslib-1.2.1/cram/misc.h 0000664 0000000 0000000 00000010322 12464172677 0015044 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 1994-1997, 2001-2002 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1 Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2 Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Copyright (c) 2003-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _misc_h
#define _misc_h
#include "cram/os.h"
#include
#include /* varargs needed for v*printf() prototypes */
#include
#ifdef __cplusplus
extern "C" {
#endif
/*
* This informs gcc that crash() doesn't return, so it doesn't need to
* concern itself that code paths going via crash could mean some variables
* being undefined and then issuing uninitialised variable warnings.
* This particularly affected convert.
*/
#ifdef __GNUC__
# define __NORETURN__ __attribute__ ((__noreturn__))
#else
# define __NORETURN__
#endif
/*
* Used for printf style argument checking. We can request a function such
* as vTcl_SetResult does argument checking, avoiding bugs with using
* %d and passing in a 64-bit record.
*/
#ifdef __GNUC__
# define __PRINTF_FORMAT__(a,b) __attribute__ ((format (printf, a, b)))
#else
# define __PRINTF_FORMAT__(a,b)
#endif
/*
 * File-system queries on the path 'fn'. Only the prototypes are visible
 * here.
 * NOTE(review): presumably is_directory() and is_file() return non-zero
 * when 'fn' names a directory or a regular file respectively -- confirm
 * against the implementation.
 * NOTE(review): file_size() returns int, so it cannot report a size
 * larger than INT_MAX.
 */
extern int is_directory(char * fn);
extern int is_file(char * fn);
extern int file_size(char * fn);
/*
 * Smaller / larger of two values.
 * The selected argument is evaluated twice, so do not pass expressions with
 * side effects (e.g. MIN(i++, j)). When the arguments mix signed and
 * unsigned types the comparison follows the usual arithmetic conversions.
 */
#define MIN(A,B) ( ( (A) < (B) ) ? (A) : (B) )
#define MAX(A,B) ( ( (A) > (B) ) ? (A) : (B) )
#ifdef __cplusplus
}
#endif
#endif /*_misc_h*/
htslib-1.2.1/cram/open_trace_file.c 0000664 0000000 0000000 00000024150 12464172677 0017226 0 ustar 00root root 0000000 0000000 /*
Author: James Bonfield
Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Copyright (c) 2008, 2009, 2013, 2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "cram/os.h"
#ifndef PATH_MAX
# define PATH_MAX 1024
#endif
#include "cram/open_trace_file.h"
#include "cram/misc.h"
#include "htslib/hfile.h"
/*
* Tokenises the search path splitting on colons (unix) or semicolons
* (windows).
* We also explicitly add a "./" to the end of the search path
*
* Returns: A new search path with items separated by nul chars. Two nul
* chars in a row represent the end of the tokenised path.
* Returns NULL for a failure.
*
* The returned data has been malloced. It is up to the caller to free this
* memory.
*/
char *tokenise_search_path(char *searchpath) {
char *newsearch;
unsigned int i, j;
size_t len;
#ifdef _WIN32
char path_sep = ';';
#else
char path_sep = ':';
#endif
if (!searchpath)
searchpath="";
newsearch = (char *)malloc((len = strlen(searchpath))+5);
if (!newsearch)
return NULL;
for (i = 0, j = 0; i < len; i++) {
/* "::" => ":". Used for escaping colons in http://foo */
if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') {
newsearch[j++] = ':';
i++;
continue;
}
/* Handle http:// and ftp:// too without :: */
if (path_sep == ':') {
if ((i == 0 || (i > 0 && searchpath[i-1] == ':')) &&
(!strncmp(&searchpath[i], "http:", 5) ||
!strncmp(&searchpath[i], "ftp:", 4) ||
!strncmp(&searchpath[i], "|http:", 6) ||
!strncmp(&searchpath[i], "|ftp:", 5) ||
!strncmp(&searchpath[i], "URL=http:", 9) ||
!strncmp(&searchpath[i], "URL=ftp:", 8))) {
do {
newsearch[j++] = searchpath[i];
} while (i 0) {
if (mfwrite(buf, len, 1, mf) <= 0) {
hclose_abruptly(hf);
mfdestroy(mf);
return NULL;
}
}
if (hclose(hf) < 0) {
mfdestroy(mf);
return NULL;
}
mrewind(mf);
return mf;
}
/*
 * Appends the first 'n' bytes of 'src' to 'path', a buffer of PATH_MAX+1
 * bytes that already holds '*used' bytes. No nul terminator is written.
 *
 * Returns 0 on success, advancing *used
 *        -1 if the bytes would not leave room for a terminating nul;
 *           nothing is written in that case.
 */
static int find_file_dir_append(char *path, size_t *used,
                                const char *src, size_t n) {
    if (n > (size_t)PATH_MAX - *used)
        return -1;
    memcpy(path + *used, src, n);
    *used += n;
    return 0;
}

/*
 * Searches for file in the directory 'dirname'. If it finds it, it opens
 * it.
 *
 * One trailing '/' on dirname is ignored. If 'file' is absolute, or dirname
 * is "." or "./", then 'file' is used as the path unchanged. Otherwise
 * dirname may hold %[0-9]*s expansions:
 *   "%s"  is replaced by all of the (remaining) file name;
 *   "%Ns" is replaced by at most the next N characters of the file name.
 * Any part of 'file' not consumed by an expansion is appended after a '/'.
 * A '%' sequence that does not end in 's' is copied literally, up to and
 * including the character that ended it. A zero or negative N is treated
 * like a plain "%s".
 *
 * Returns mFILE pointer if found
 *         NULL if not, or if the resulting path is longer than PATH_MAX
 */
static mFILE *find_file_dir(char *file, char *dirname) {
    char path[PATH_MAX+1];
    size_t used = 0;               /* bytes of path filled in so far */
    size_t len = strlen(dirname);
    char *dir_end;                 /* end of dirname, minus trailing '/' */
    char *cp;

    /* The len check stops an empty dirname from reading dirname[-1] */
    if (len > 0 && dirname[len-1] == '/')
        len--;
    dir_end = dirname + len;

    /* Special case for "./" or absolute filenames */
    if (*file == '/' || (len==1 && *dirname == '.')) {
        if (find_file_dir_append(path, &used, file, strlen(file)) < 0)
            return NULL;
    } else {
        /* Handle %[0-9]*s expansions, if required */
        while ((cp = strchr(dirname, '%'))) {
            char *endp;
            char *next;
            size_t n;
            long l = strtol(cp+1, &endp, 10);

            if (*endp != 's') {
                /*
                 * Not an expansion. Copy it literally, but never step over
                 * the nul terminator nor pick up the ignored trailing '/'.
                 */
                next = *endp ? endp+1 : endp;
                if (next > dir_end)
                    next = dir_end;
                if (find_file_dir_append(path, &used, dirname,
                                         (size_t)(next - dirname)) < 0)
                    return NULL;
                dirname = next;
                continue;
            }

            /* Literal text preceding the '%' */
            if (find_file_dir_append(path, &used, dirname,
                                     (size_t)(cp - dirname)) < 0)
                return NULL;

            /* The expansion itself: up to l characters taken from file */
            n = strlen(file);
            if (l > 0 && (unsigned long)l < n)
                n = (size_t)l;
            if (find_file_dir_append(path, &used, file, n) < 0)
                return NULL;
            file += n;

            dirname = endp+1;
        }

        /* Whatever is left of dirname after the last expansion */
        if (find_file_dir_append(path, &used, dirname,
                                 (size_t)(dir_end - dirname)) < 0)
            return NULL;

        /* Any part of file not used up by expansions goes after a '/' */
        if (*file) {
            if (find_file_dir_append(path, &used, "/", 1) < 0 ||
                find_file_dir_append(path, &used, file, strlen(file)) < 0)
                return NULL;
        }
    }
    path[used] = '\0';

    if (is_file(path)) {
        return mfopen(path, "rb");
    }
    return NULL;
}
/*
* ------------------------------------------------------------------------
* Public functions below.
*/
/*
 * Opens a trace file named 'file'.
 *
 * The locations are tried in this order, returning the first success:
 *
 * 1. Each element of 'path', which is a colon separated list split up by
 *    tokenise_search_path() (documented as also appending "./", i.e. the
 *    current directory). If 'path' is NULL it uses the RAWDATA environment
 *    variable instead. An element starting "URL=", "http:" or "ftp:" is
 *    passed to find_file_url() (without the "URL=" prefix); any other
 *    element is treated as a directory and passed to find_file_dir().
 *
 * 2. The directory part of 'relative_to', i.e. everything before its last
 *    '/'. This may (for example) be the name of an experiment file
 *    referencing the trace file, so that the trace file is picked up in the
 *    same directory as the experiment file. Relative_to may be supplied as
 *    NULL, and is skipped if it is longer than PATH_MAX.
 *
 * Returns a mFILE pointer when found.
 *         NULL otherwise.
 */
mFILE *open_path_mfile(char *file, char *path, char *relative_to) {
    char *newsearch;
    char *ele;
    mFILE *fp;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    /*
     * Step through the search path testing out each component.
     * We now look through each path element treating some prefixes as
     * special, otherwise we treat the element as a directory.
     */
    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        /*
         * '|' prefixing a path component indicates that we do not
         * wish to perform the compression extension searching in that
         * location.
         *
         * NB: this has been removed from the htslib implementation,
         * so the prefix is simply skipped.
         */
        char *ele2 = (*ele == '|') ? ele+1 : ele;

        if (0 == strncmp(ele2, "URL=", 4)) {
            fp = find_file_url(file, ele2+4);
        } else if (!strncmp(ele2, "http:", 5) ||
                   !strncmp(ele2, "ftp:", 4)) {
            fp = find_file_url(file, ele2);
        } else {
            fp = find_file_dir(file, ele2);
        }

        if (fp) {
            free(newsearch);
            return fp;
        }
    }

    free(newsearch);

    /* Look in the same location as the incoming 'relative_to' filename */
    if (relative_to) {
        char relative_path[PATH_MAX+1];
        size_t rlen = strlen(relative_to);
        char *cp;

        /* Too long to copy safely, and too long to be a usable path */
        if (rlen > PATH_MAX)
            return NULL;

        memcpy(relative_path, relative_to, rlen + 1);
        if ((cp = strrchr(relative_path, '/')))
            *cp = 0;
        if ((fp = find_file_dir(file, relative_path)))
            return fp;
    }

    return NULL;
}
htslib-1.2.1/cram/open_trace_file.h 0000664 0000000 0000000 00000011206 12464172677 0017231 0 ustar 00root root 0000000 0000000 /*
Author: James Bonfield
Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Copyright (c) 2008, 2009, 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _OPEN_TRACE_FILE_H_
#define _OPEN_TRACE_FILE_H_
#include "cram/mFILE.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
* Tokenises the search path splitting on colons (unix) or semicolons
* (windows).
* We also explicitly add a "./" to the end of the search path
*
* Returns: A new search path with items separated by nul chars. Two nul
* chars in a row represent the end of the tokenised path.
* Returns NULL for a failure.
*
* The returned data has been malloced. It is up to the caller to free this
* memory.
*/
char *tokenise_search_path(char *searchpath);
/*
* Opens a trace file named 'file'. This is initially looked for as a
* pathname relative to a file named "relative_to". This may (for
* example) be the name of an experiment file referencing the trace
* file. In this case by passing relative_to as the experiment file
* filename the trace file will be picked up in the same directory as
* the experiment file. Relative_to may be supplied as NULL.
*
* 'file' is looked for at relative_to, then the current directory, and then
* all of the locations listed in 'path' (which is a colon separated list).
* If 'path' is NULL it uses the RAWDATA environment variable instead.
*
* Returns a mFILE pointer when found.
* NULL otherwise.
*/
mFILE *open_path_mfile(char *file, char *path, char *relative_to);
/*
* Returns a mFILE containing the entire contents of the url;
* NULL on failure.
*/
mFILE *find_file_url(char *file, char *url);
#ifdef __cplusplus
}
#endif
#endif /* _OPEN_TRACE_FILE_H_ */
htslib-1.2.1/cram/os.h 0000664 0000000 0000000 00000022254 12464172677 0014541 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 1993, 1995-2002 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1 Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2 Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Copyright (c) 2004, 2006, 2009-2011, 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* File: os.h
*
* Author:
* MRC Laboratory of Molecular Biology
* Hills Road
* Cambridge CB2 2QH
* United Kingdom
*
* Description: operating system specific type definitions
*
*/
#ifndef _OS_H_
#define _OS_H_
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
/*-----------------------------------------------------------------------------
* Detection of endianness. The main part of this is done in autoconf, but
* for the case of MacOS FAT binaries we fall back on auto-sensing based on
* processor type too.
*/
/* Set by autoconf */
/*
 * NOTE(review): SP_LITTLE_ENDIAN is defined unconditionally here, so the
 * "#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)" auto-detection
 * section that follows is never compiled. A big-endian build presumably
 * needs this line changed by the build system -- confirm.
 */
#define SP_LITTLE_ENDIAN
/* Mac FAT binaries or unknown. Auto detect based on CPU type */
#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)
/*
* x86 equivalents
*/
#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686)
# if defined(SP_BIG_ENDIAN)
# undef SP_BIG_ENDIAN
# endif
# define SP_LITTLE_ENDIAN
#endif
/*
* DEC Alpha
*/
#if defined(__alpha__) || defined(__alpha)
# if defined(SP_LITTLE_ENDIAN)
# undef SP_LITTLE_ENDIAN
# endif
# define SP_BIG_ENDIAN
#endif
/*
* SUN Sparc
*/
#if defined(__sparc__) || defined(__sparc)
# if defined(SP_LITTLE_ENDIAN)
# undef SP_LITTLE_ENDIAN
# endif
# define SP_BIG_ENDIAN
#endif
/*
* PowerPC
*/
#if defined(__ppc__) || defined(__ppc)
# if defined(SP_LITTLE_ENDIAN)
# undef SP_LITTLE_ENDIAN
# endif
# define SP_BIG_ENDIAN
#endif
/* Some catch-alls */
#if defined(__LITTLE_ENDIAN__) || defined(__LITTLEENDIAN__)
# define SP_LITTLE_ENDIAN
#endif
#if defined(__BIG_ENDIAN__) || defined(__BIGENDIAN__)
# define SP_BIG_ENDIAN
#endif
#if defined(SP_BIG_ENDIAN) && defined(SP_LITTLE_ENDIAN)
# error Both BIG and LITTLE endian defined. Fix os.h and/or Makefile
#endif
#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN)
# error Neither BIG nor LITTLE endian defined. Fix os.h and/or Makefile
#endif
#endif
/*-----------------------------------------------------------------------------
* Allow for unaligned memory access. This is used in BAM code as the packed
* structure has 4-byte cigar ints after the variable length name.
*
* Consider using AX_CHECK_ALIGNED_ACCESS_REQUIRED in autoconf.
*/
#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686)
# define ALLOW_UAC
#endif
/*-----------------------------------------------------------------------------
* Byte swapping macros
*/
/*
* Our new swap runs at the same speed on Ultrix, but substantially faster
* (300% for swap_int4, ~50% for swap_int2) on an Alpha (due to the lack of
* decent 'char' support).
*
* They also have the ability to swap in situ (src == dst). Newer code now
* relies on this so don't change back!
*/
/*
 * Reverses the byte order of a 64-bit value.
 * The argument is parenthesised at every use so that operators binding
 * more loosely than '&' (e.g. '|', '^', '?:') are safe in the argument.
 * It is still evaluated eight times, so it must have no side effects.
 */
#define iswap_int8(x) \
(((((x)) & 0x00000000000000ffLL) << 56) + \
((((x)) & 0x000000000000ff00LL) << 40) + \
((((x)) & 0x0000000000ff0000LL) << 24) + \
((((x)) & 0x00000000ff000000LL) << 8) + \
((((x)) & 0x000000ff00000000LL) >> 8) + \
((((x)) & 0x0000ff0000000000LL) >> 24) + \
((((x)) & 0x00ff000000000000LL) >> 40) + \
((((x)) & 0xff00000000000000LL) >> 56))
/*
 * Reverses the byte order of a 32-bit value.
 * The argument is parenthesised at every use so that operators binding
 * more loosely than '&' (e.g. '|', '^', '?:') are safe in the argument.
 * It is still evaluated four times, so it must have no side effects.
 */
#define iswap_int4(x) \
((((x) & 0x000000ff) << 24) + \
(((x) & 0x0000ff00) << 8) + \
(((x) & 0x00ff0000) >> 8) + \
(((x) & 0xff000000) >> 24))
/*
 * Swaps the two low bytes of a 16-bit value.
 * The argument is parenthesised at every use so that operators binding
 * more loosely than '&' (e.g. '|', '^', '?:') are safe in the argument.
 * It is still evaluated twice, so it must have no side effects.
 */
#define iswap_int2(x) \
((((x) & 0x00ff) << 8) + \
(((x) & 0xff00) >> 8))
/*
* Linux systems may use byteswap.h to get assembly versions of byte-swap
* on intel systems. This can be as trivial as the bswap opcode, which works
* out at over 2-times faster than iswap_int4 above.
*/
#if 0
#if defined(__linux__)
# include
# undef iswap_int8
# undef iswap_int4
# undef iswap_int2
# define iswap_int8 bswap_64
# define iswap_int4 bswap_32
# define iswap_int2 bswap_16
#endif
#endif
/*
* Macros to specify that data read in is of a particular endianness.
* The macros here swap to the appropriate order for the particular machine
* running the macro and return the new answer. These may also be used when
* writing to a file to specify that we wish to write in (eg) big endian
* format.
*
* This leads to efficient code as most of the time these macros are
* trivial.
*/
#ifdef SP_BIG_ENDIAN
#define le_int4(x) iswap_int4((x))
#define le_int2(x) iswap_int2((x))
#endif
#ifdef SP_LITTLE_ENDIAN
#define le_int4(x) (x)
#define le_int2(x) (x)
#endif
/*-----------------------------------------------------------------------------
* <inttypes.h> format macro definitions, in case they're not present
*/
#ifndef PRId64
/*
 * Fallback printf conversion macros, used only when PRId64 is not already
 * defined.
 * NOTE(review): the 64-bit variants use the "l" length modifier, which is
 * only correct where 'long' is 64 bits wide -- confirm for any platform
 * that actually reaches this fallback.
 */
#define __PRI64__ "l"
#define PRId64 __PRI64__ "d"
#define PRId32 "d"
#define PRId16 "d"
#define PRId8 "d"
#define PRIu64 __PRI64__ "u"
#define PRIu32 "u"
#define PRIu16 "u"
#define PRIu8 "u"
#endif
/*-----------------------------------------------------------------------------
* Operating system specifics.
* These ought to be done by autoconf, but are legacy code.
*/
/*
* SunOS 4.x
* Even though we use the ANSI gcc, we make use of the standard SunOS 4.x
* libraries and include files, which are non-ansi
*/
#if defined(__sun__) && !defined(__svr4__)
#define SEEK_SET 0
#define SEEK_CUR 1
#define SEEK_END 2
#endif
/*
* Microsoft Visual C++
* Windows
*/
#if defined(_MSC_VER)
#define popen _popen
#define pclose _pclose
#define ftruncate(fd,len) _chsize(fd,len)
#endif
/*
* Microsoft Windows running MinGW
*/
#if defined(__MINGW32__)
/* #define mkdir(filename,mode) mkdir((filename)) */
#define sysconf(x) 512
#define ftruncate(fd,len) _chsize(fd,len)
#endif
/* Generic WIN32 API issues */
#ifdef _WIN32
# ifndef HAVE_FSEEKO
# if __MSVCRT_VERSION__ >= 0x800
/* if you have MSVCR80 installed then you can use these definitions: */
# define off_t __int64
# define fseeko _fseeki64
# define ftello _ftelli64
# else
/* otherwise we're stuck with 32-bit file support */
# define off_t long
# define fseeko fseek
# define ftello ftell
# endif
# endif /* !HAVE_FSEEKO */
#endif /* _WIN32 */
#ifdef __cplusplus
}
#endif
#endif /*_OS_H_*/
htslib-1.2.1/cram/pooled_alloc.c 0000664 0000000 0000000 00000010073 12464172677 0016543 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2009 Genome Research Ltd.
Author: Rob Davies
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include "cram/pooled_alloc.h"
//#define TEST_MAIN
/*
 * Target size in bytes of each pool; new_pool() allocates the largest
 * whole number of items that fits. Parenthesised so that the macro can be
 * used safely inside any expression (e.g. as a divisor).
 */
#define PSIZE (1024*1024)
/*
 * Creates an allocator that hands out items of a single fixed size.
 *
 * 'dsize' is the requested item size in bytes. It is rounded up to a
 * multiple of sizeof(void *), with a minimum of sizeof(void *), because a
 * freed item stores the free-list link in its own memory.
 *
 * No pool memory is allocated until the first pool_alloc() call.
 *
 * Returns the new allocator
 *         NULL if memory could not be allocated
 */
pool_alloc_t *pool_create(size_t dsize) {
    const size_t ptr_size = sizeof(void *);
    pool_alloc_t *alloc = (pool_alloc_t *)malloc(sizeof(*alloc));

    if (alloc == NULL)
        return NULL;

    /* Minimum size is a pointer, for free list */
    dsize = (dsize + ptr_size - 1) & ~(ptr_size - 1);

    alloc->dsize  = (dsize < ptr_size) ? ptr_size : dsize;
    alloc->npools = 0;
    alloc->pools  = NULL;
    alloc->free   = NULL;

    return alloc;
}
/*
 * Adds one more pool to the allocator 'p'.
 *
 * The pool holds as many items as fit in PSIZE bytes, but always at least
 * one: an item larger than PSIZE gets a pool of its own instead of a
 * zero-sized allocation.
 *
 * Returns the new, empty pool (the last entry of p->pools)
 *         NULL if memory could not be allocated. In that case p->npools is
 *         unchanged, so the allocator remains usable and destroyable.
 */
static pool_t *new_pool(pool_alloc_t *p) {
    size_t n = (PSIZE) / p->dsize;
    pool_t *pool;

    /* Item bigger than a whole pool: still make room for one */
    if (n == 0)
        n = 1;

    /* Grow the array via a temporary so p->pools survives a failure */
    pool = realloc(p->pools, (p->npools + 1) * sizeof(*p->pools));
    if (NULL == pool) return NULL;
    p->pools = pool;

    pool = &p->pools[p->npools];
    pool->pool = malloc(n * p->dsize);
    if (NULL == pool->pool) return NULL;
    pool->used = 0;

    /* Only count the pool once its storage exists */
    p->npools++;
    return pool;
}
/*
 * Frees the allocator 'p' together with every pool it owns. All items
 * obtained from pool_alloc() become invalid.
 *
 * Passing NULL is allowed and does nothing, as with free().
 */
void pool_destroy(pool_alloc_t *p) {
    size_t i;

    if (NULL == p)
        return;

    for (i = 0; i < p->npools; i++) {
        free(p->pools[i].pool);
    }
    free(p->pools);
    free(p);
}
/*
 * Hands out one item of p->dsize bytes. The memory is not initialised.
 *
 * Items previously given back with pool_free() are reused first, most
 * recently freed first. Otherwise the next unused slot of the newest pool
 * is taken, and a new pool is created when that one has no room left.
 *
 * Returns a pointer to the item
 *         NULL if a new pool was needed and could not be allocated
 */
void *pool_alloc(pool_alloc_t *p) {
    pool_t *cur;
    void *item = p->free;

    /* Pop the head of the free list; its first bytes hold the next link */
    if (item != NULL) {
        p->free = *(void **)item;
        return item;
    }

    /* Take the next slot of the last pool while it still has space */
    if (p->npools > 0) {
        cur = p->pools + (p->npools - 1);
        if (cur->used + p->dsize < PSIZE) {
            item = (char *)cur->pool + cur->used;
            cur->used += p->dsize;
            return item;
        }
    }

    /* Need a new pool; its first slot is the item returned */
    cur = new_pool(p);
    if (cur == NULL)
        return NULL;

    cur->used = p->dsize;
    return cur->pool;
}
/*
 * Gives the item 'ptr' back to the allocator 'p' for reuse by a later
 * pool_alloc(). No memory is returned to the system.
 *
 * The start of the item is overwritten with the free-list link, so its
 * contents must not be relied upon afterwards.
 */
void pool_free(pool_alloc_t *p, void *ptr) {
    void **link = (void **)ptr;

    /* Push onto the head of the free list */
    *link = p->free;
    p->free = link;
}
#ifdef TEST_MAIN
/* Item type used by the self-test below */
typedef struct {
    int x, y, z;
} xyz;

/* Number of items allocated by each pass of the self-test */
#define NP 10000

/*
 * Self-test, built only when TEST_MAIN is defined.
 *
 * Allocates NP items, frees two thirds of them, allocates NP more (which
 * reuses the freed slots first) and then prints the contents seen through
 * the original pointers, so reused slots show the 1000000+ values.
 *
 * Returns 0 on success
 *         1 if any allocation failed
 */
int main(void) {
    int i;
    int rc = 1;
    xyz *item;
    xyz **items = NULL;
    pool_alloc_t *p = pool_create(sizeof(xyz));

    if (NULL == p)
        return 1;

    items = (xyz **)malloc(NP * sizeof(*items));
    if (NULL == items)
        goto done;

    for (i = 0; i < NP; i++) {
        item = pool_alloc(p);
        if (NULL == item)
            goto done;
        item->x = i;
        item->y = i+1;
        item->z = i+2;
        items[i] = item;
    }

    for (i = 0; i < NP; i++) {
        item = items[i];
        if (i % 3)
            pool_free(p, item);
    }

    for (i = 0; i < NP; i++) {
        item = pool_alloc(p);
        if (NULL == item)
            goto done;
        item->x = 1000000+i;
        item->y = 1000000+i+1;
        item->z = 1000000+i+2;
    }

    for (i = 0; i < NP; i++) {
        item = items[i];
        printf("%d\t%d\t%d\t%d\n", i, item->x, item->y, item->z);
        pool_free(p, item);
    }

    rc = 0;

 done:
    /* Release everything, including items whose pointers were not kept */
    free(items);
    pool_destroy(p);
    return rc;
}
#endif
htslib-1.2.1/cram/pooled_alloc.h 0000664 0000000 0000000 00000004035 12464172677 0016551 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2009 Genome Research Ltd.
Author: Rob Davies
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _POOLED_ALLOC_H_
#define _POOLED_ALLOC_H_

#include <stddef.h> /* size_t; keeps this header self-contained */

/*
 * Implements a pooled block allocator where all items are the same size,
 * but we need many of them.
 */

/*
 * Book-keeping for a single pool.
 * NOTE(review): the field meanings below are inferred from the names only;
 * confirm against pooled_alloc.c.
 */
typedef struct {
    void   *pool;  /* presumably the memory backing this pool */
    size_t  used;  /* presumably how much of 'pool' has been handed out */
} pool_t;

/*
 * Allocator handle, obtained from pool_create() and released with
 * pool_destroy().
 */
typedef struct {
    size_t  dsize;   /* item size; presumably derived from pool_create()'s dsize */
    size_t  npools;  /* presumably the number of entries in 'pools' */
    pool_t *pools;   /* array of pools */
    void   *free;    /* presumably the head of a list of returned items */
} pool_alloc_t;

/* Creates an allocator for items of 'dsize' bytes. */
pool_alloc_t *pool_create(size_t dsize);

/* Destroys an allocator created by pool_create(). */
void pool_destroy(pool_alloc_t *p);

/* Obtains one item from allocator 'p'. */
void *pool_alloc(pool_alloc_t *p);

/* Hands an item obtained from pool_alloc() back to allocator 'p'. */
void pool_free(pool_alloc_t *p, void *ptr);

#endif /*_POOLED_ALLOC_H_*/
htslib-1.2.1/cram/rANS_byte.h 0000664 0000000 0000000 00000027064 12464172677 0015752 0 ustar 00root root 0000000 0000000 /* rans_byte.h originally from https://github.com/rygorous/ryg_rans
*
* This is a public-domain implementation of several rANS variants. rANS is an
* entropy coder from the ANS family, as described in Jarek Duda's paper
* "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540).
*/
/*-------------------------------------------------------------------------- */
// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014
//
// Not intended to be "industrial strength"; just meant to illustrate the general
// idea.
#ifndef RANS_BYTE_HEADER
#define RANS_BYTE_HEADER
#include
// RansAssert forwards to assert() only when the assert macro is already
// defined at the point this header is read (i.e. <assert.h> was included
// first); otherwise it expands to nothing and its argument is not evaluated.
#ifdef assert
#define RansAssert assert
#else
#define RansAssert(x)
#endif
// READ ME FIRST:
//
// This is designed like a typical arithmetic coder API, but there's three
// twists you absolutely should be aware of before you start hacking:
//
// 1. You need to encode data in *reverse* - last symbol first. rANS works
// like a stack: last in, first out.
// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give
// it a pointer to the *end* of your buffer (exclusive), and it will
// slowly move towards the beginning as more bytes are emitted.
// 3. Unlike basically any other entropy coder implementation you might
// have used, you can interleave data from multiple independent rANS
// encoders into the same bytestream without any extra signaling;
// you can also just write some bytes by yourself in the middle if
// you want to. This is in addition to the usual arithmetic encoder
// property of being able to switch models on the fly. Writing raw
// bytes can be useful when you have some data that you know is
// incompressible, and is cheaper than going through the rANS encode
// function. Using multiple rANS coders on the same byte stream wastes
// a few bytes compared to using just one, but execution of two
// independent encoders can happen in parallel on superscalar and
// Out-of-Order CPUs, so this can be *much* faster in tight decoding
// loops.
//
// This is why all the rANS functions take the write pointer as an
// argument instead of just storing it in some context struct.
// --------------------------------------------------------------------------
// L ('l' in the paper) is the lower bound of the normalization interval.
// Combined with byte-wise emission this means only 31 (not 32!) bits of the
// state are ever used. That is deliberate: exact reciprocals of 31-bit
// values fit in 32 bits, which the encoder exploits to avoid divisions.
#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval

// The complete state of one rANS encoder or decoder.
typedef uint32_t RansState;

// Puts an encoder state at the bottom of the normalization interval,
// ready for the first symbol.
static inline void RansEncInit(RansState* r)
{
    const RansState initial = RANS_BYTE_L;
    *r = initial;
}
// Internal helper: shrinks the encoder state until it is below the
// pre-normalization bound for a symbol of frequency "freq", emitting the
// low-order byte on every step. Bytes are stored at decreasing addresses and
// *pptr is left pointing at the last byte written. Returns the reduced state.
static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits)
{
    const uint32_t limit = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift.
    uint8_t* out = *pptr;

    while (x >= limit) {
        *--out = (uint8_t) (x & 0xff);
        x >>= 8;
    }

    *pptr = out;
    return x;
}
// Encodes one symbol whose range starts at "start" and has frequency "freq".
// All frequencies are assumed to sum to "1 << scale_bits". Any bytes emitted
// are written through *pptr, which is updated.
//
// NOTE: With rANS, symbols must be encoded in *reverse order*, i.e. from
// end to beginning! Likewise, the output bytestream is written *backwards*:
// the pointer starts at the end of the output buffer and keeps decrementing.
static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
{
    // Make room in the state first ...
    const RansState x = RansEncRenorm(*r, pptr, freq, scale_bits);

    // ... then apply x = C(s,x)
    const uint32_t quot = x / freq;
    const uint32_t rem  = x % freq;
    *r = (quot << scale_bits) + rem + start;
}
// Flushes the encoder: stores the 32-bit state as four little-endian bytes
// immediately below *pptr and moves *pptr down to the first of them.
static inline void RansEncFlush(RansState* r, uint8_t** pptr)
{
    const uint32_t state = *r;
    uint8_t* out = *pptr - 4;
    int k;

    for (k = 0; k < 4; k++)
        out[k] = (uint8_t) (state >> (8 * k));

    *pptr = out;
}
// Initializes a rANS decoder from the four little-endian bytes at *pptr
// (the layout RansEncFlush writes) and advances *pptr past them.
// Unlike the encoder, the decoder works forwards as you'd expect.
static inline void RansDecInit(RansState* r, uint8_t** pptr)
{
    uint32_t x;
    uint8_t* ptr = *pptr;

    // Widen each byte to uint32_t before shifting: a uint8_t operand is
    // promoted to (signed) int, and shifting a byte >= 0x80 left by 24 bits
    // would overflow that int, which is undefined behaviour.
    x  = (uint32_t) ptr[0] << 0;
    x |= (uint32_t) ptr[1] << 8;
    x |= (uint32_t) ptr[2] << 16;
    x |= (uint32_t) ptr[3] << 24;
    ptr += 4;

    *pptr = ptr;
    *r = x;
}
// Returns the low "scale_bits" bits of the state, i.e. the current
// cumulative frequency (map it to a symbol yourself!)
static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits)
{
    const uint32_t low_mask = (1u << scale_bits) - 1;
    return *r & low_mask;
}
// Advances in the bit stream by "popping" a single symbol with range start
// "start" and frequency "freq". All frequencies are assumed to sum to
// "1 << scale_bits". Bytes needed to renormalize the state are read from
// *pptr, which is advanced past them.
static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
{
    const uint32_t low_mask = (1u << scale_bits) - 1;
    const uint32_t prev = *r;
    uint8_t* in = *pptr;

    // s, x = D(x)
    uint32_t x = freq * (prev >> scale_bits) + (prev & low_mask) - start;

    // renormalize
    while (x < RANS_BYTE_L)
        x = (x << 8) | *in++;

    *pptr = in;
    *r = x;
}
// --------------------------------------------------------------------------
// That's all you need for a full encoder; below here are some utility
// functions with extra convenience or optimizations.
// Encoder symbol description
// This (admittedly odd) selection of parameters was chosen to make
// RansEncPutSymbol as cheap as possible.
// All fields are filled in by RansEncSymbolInit and read by RansEncPutSymbol.
typedef struct {
uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval
uint32_t rcp_freq; // Fixed-point reciprocal frequency
uint32_t bias; // Bias
uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq
uint16_t rcp_shift; // Reciprocal shift (RansEncSymbolInit stores it with 32 already added)
} RansEncSymbol;
// Decoder symbols are straightforward: just the two values RansDecAdvance
// needs. Both are stored truncated to 16 bits by RansDecSymbolInit.
typedef struct {
uint16_t start; // Start of range.
uint16_t freq; // Symbol frequency.
} RansDecSymbol;
// Initializes an encoder symbol to start "start" and frequency "freq"
//
// s          - symbol description to fill in
// start      - start of the symbol's range
// freq       - symbol frequency; 0 yields a symbol that must never be encoded
// scale_bits - frequencies are assumed to sum to 1 << scale_bits (at most 16)
//
// Precomputes everything RansEncPutSymbol needs so that encoding can use a
// multiply and shift instead of a division.
static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits)
{
RansAssert(scale_bits <= 16);
RansAssert(start <= (1u << scale_bits));
RansAssert(freq <= (1u << scale_bits) - start);
// Say M := 1 << scale_bits.
//
// The original encoder does:
// x_new = (x/freq)*M + start + (x%freq)
//
// The fast encoder does (schematically):
// q = mul_hi(x, rcp_freq) >> rcp_shift (division)
// r = x - q*freq (remainder)
// x_new = q*M + bias + r (new x)
// plugging in r into x_new yields:
// x_new = bias + x + q*(M - freq)
// =: bias + x + q*cmpl_freq (*)
//
// and we can just precompute cmpl_freq. Now we just need to
// set up our parameters such that the original encoder and
// the fast encoder agree.
// Same bound as computed on the fly in RansEncRenorm.
s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq;
s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq);
if (freq < 2) {
// freq=0 symbols are never valid to encode, so it doesn't matter what
// we set our values to.
//
// freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately,
// our fixed-point reciprocal approximation can only multiply by values
// smaller than 1.
//
// So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0.
// This gives:
// q = mul_hi(x, rcp_freq) >> rcp_shift
// = mul_hi(x, (1<<32) - 1)) >> 0
// = floor(x - x/(2^32))
// = x - 1 if 1 <= x < 2^32
// and we know that x>0 (x=0 is never in a valid normalization interval).
//
// So we now need to choose the other parameters such that
// x_new = x*M + start
// plug it in:
// x*M + start (desired result)
// = bias + x + q*cmpl_freq (*)
// = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq)
// = bias + 1 + (x - 1)*M
// = x*M + (bias + 1 - M)
//
// so we have start = bias + 1 - M, or equivalently
// bias = start + M - 1.
s->rcp_freq = ~0u;
s->rcp_shift = 0;
s->bias = start + (1 << scale_bits) - 1;
} else {
// Alverson, "Integer Division using reciprocals"
// shift=ceil(log2(freq))
uint32_t shift = 0;
while (freq > (1u << shift))
shift++;
// Reciprocal rounded up: ceil(2^(shift+31) / freq), computed in 64 bits.
s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq);
s->rcp_shift = shift - 1;
// With these values, 'q' is the correct quotient, so we
// have bias=start.
s->bias = start;
}
// Applies to both branches above.
s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol
}
// Fills in a decoder symbol with range start "start" and frequency "freq".
// Both values are stored truncated to 16 bits.
static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq)
{
    const uint16_t start16 = (uint16_t) start;
    const uint16_t freq16  = (uint16_t) freq;

    RansAssert(start <= (1 << 16));
    RansAssert(freq <= (1 << 16) - start);

    s->freq  = freq16;
    s->start = start16;
}
// Encodes a symbol described by a precomputed RansEncSymbol. Faster than
// RansEncPut because the division is replaced by a multiplication with the
// stored reciprocal.
//
// See RansEncSymbolInit for a description of how this works.
static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym)
{
    const uint32_t limit = sym->x_max;
    uint32_t x = *r;
    uint8_t* out = *pptr;
    uint32_t q;

    RansAssert(sym->x_max != 0); // can't encode symbol with freq=0

    // renormalize: push low bytes out (backwards) until x fits
    while (x >= limit) {
        *--out = (uint8_t) (x & 0xff);
        x >>= 8;
    }
    *pptr = out;

    // x = C(s,x)
    // The 64-bit product is shifted by rcp_shift, which already includes
    // the extra 32 (added in RansEncSymbolInit), so this is a
    // "multiply high" followed by the reciprocal shift in one step.
    q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift);
    *r = x + sym->bias + q * sym->cmpl_freq;
}
// Same as RansDecAdvance, with start and frequency taken from a
// RansDecSymbol.
static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits)
{
    const uint32_t start = sym->start;
    const uint32_t freq  = sym->freq;

    RansDecAdvance(r, pptr, start, freq, scale_bits);
}
// Advances in the bit stream by "popping" a single symbol with range start
// "start" and frequency "freq". All frequencies are assumed to sum to
// "1 << scale_bits". The state is updated in place; no renormalization is
// done and no input is read (pair with RansDecRenorm).
static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits)
{
    const uint32_t low_mask = (1u << scale_bits) - 1;
    const uint32_t prev = *r;
    const uint32_t high = prev >> scale_bits;
    const uint32_t low  = prev & low_mask;

    // s, x = D(x)
    *r = freq * high + low - start;
}
// Same as RansDecAdvanceStep, with start and frequency taken from a
// RansDecSymbol.
static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits)
{
    const uint32_t start = sym->start;
    const uint32_t freq  = sym->freq;

    RansDecAdvanceStep(r, start, freq, scale_bits);
}
// Renormalizes a decoder state: while it is below RANS_BYTE_L, shifts in
// one more byte read from *pptr. *pptr is advanced past the bytes consumed.
static inline void RansDecRenorm(RansState* r, uint8_t** pptr)
{
    uint32_t x = *r;
    uint8_t* in = *pptr;

    while (x < RANS_BYTE_L)
        x = (x << 8) | *in++;

    *pptr = in;
    *r = x;
}
#endif // RANS_BYTE_HEADER
htslib-1.2.1/cram/rANS_static.c 0000664 0000000 0000000 00000047425 12464172677 0016274 0 ustar 00root root 0000000 0000000 /*
* Copyright (c) 2014 Genome Research Ltd.
* Author(s): James Bonfield
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
* Institute nor the names of its contributors may be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Author: James Bonfield, Wellcome Trust Sanger Institute. 2014
*/
#include
#include
#include
#include
#include
#include
#include
#include "cram/rANS_static.h"
#include "cram/rANS_byte.h"
// Symbol frequencies are scaled against TOTFREQ, which is 1 << TF_SHIFT.
#define TF_SHIFT 12
#define TOTFREQ (1<<TF_SHIFT)

#define ABS(a) ((a)>0?(a):-(a))

// Block size used by the TEST_MAIN driver; may be overridden at compile time.
// Parenthesised so that it expands safely inside larger expressions.
#ifndef BLK_SIZE
#  define BLK_SIZE (1024*1024)
#endif
// Room to allow for expanded BLK_SIZE on worst case compression.
#define BLK_SIZE2 ((int)(1.05*BLK_SIZE))
/*-----------------------------------------------------------------------------
* Memory to memory compression functions.
*
* These are original versions without any manual loop unrolling. They
* are easier to understand, but can be up to 2x slower.
*/
/*
 * Order-0 rANS compression of in[0..in_size-1].
 *
 * Output layout, as written below:
 *   byte 0      order (0)
 *   bytes 1-4   compressed size excluding this 9 byte header, little endian
 *   bytes 5-8   in_size, little endian
 *   then        the frequency table, terminated by a 0 byte
 *   then        the payload of four interleaved rANS streams
 *
 * Returns a malloc()ed buffer which the caller must free(), with its length
 * stored in *out_size, or NULL if the buffer could not be allocated.
 */
unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
                                unsigned int *out_size) {
    unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
    unsigned char *cp, *out_end;
    RansEncSymbol syms[256];
    RansState rans0, rans1, rans2, rans3;
    uint8_t* ptr;
    int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0;
    int m = 0, M = 0;
    uint64_t tr;

    if (!out_buf)
        return NULL;

    // rANS emits bytes backwards, so start at the end of the buffer.
    ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;

    // Compute statistics
    for (i = 0; i < in_size; i++) {
        F[in[i]]++;
    }

    // Fixed point (<<31) factor that scales counts to TOTFREQ. An empty
    // input has no counts to scale; avoid dividing by zero in that case.
    tr = in_size
        ? ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size
        : 0;

    // Normalise so T[i] == TOTFREQ
    for (m = M = j = 0; j < 256; j++) {
        if (!F[j])
            continue;

        // Track the most frequent symbol; it absorbs the rounding error.
        if (m < F[j])
            m = F[j], M = j;

        // A symbol that occurs must keep a non-zero frequency.
        if ((F[j] = (F[j]*tr)>>31) == 0)
            F[j] = 1;
        fsum += F[j];
    }

    fsum++;
    if (fsum < TOTFREQ)
        F[M] += TOTFREQ-fsum;
    else
        F[M] -= fsum-TOTFREQ;

    assert(F[M]>0);

    // Encode statistics.
    cp = out_buf+9;

    for (x = rle = j = 0; j < 256; j++) {
        if (F[j]) {
            // Symbol value j. Runs of consecutive symbols are run-length
            // encoded: the second symbol of a run is followed by a count of
            // how many more follow, and those are then omitted.
            if (rle) {
                rle--;
            } else {
                *cp++ = j;
                if (!rle && j && F[j-1])  {
                    for(rle=j+1; rle<256 && F[rle]; rle++)
                        ;
                    rle -= j+1;
                    *cp++ = rle;
                }
            }

            // Frequency F[j]: one byte if < 128, otherwise two bytes with
            // the top bit of the first one set.
            if (F[j]<128) {
                *cp++ = F[j];
            } else {
                *cp++ = 128 | (F[j]>>8);
                *cp++ = F[j]&0xff;
            }
            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
            x += F[j];
        }
    }
    *cp++ = 0;

    tab_size = cp-out_buf;

    RansEncInit(&rans0);
    RansEncInit(&rans1);
    RansEncInit(&rans2);
    RansEncInit(&rans3);

    // Encoding runs from the last byte to the first. Handle the trailing
    // in_size%4 bytes first; the cases fall through on purpose.
    switch (i=(in_size&3)) {
    case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
        /* fallthrough */
    case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
        /* fallthrough */
    case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
        /* fallthrough */
    case 0:
        break;
    }

    // Then four bytes at a time, one per rANS state.
    for (i=(in_size &~3); i>0; i-=4) {
        RansEncSymbol *s3 = &syms[in[i-1]];
        RansEncSymbol *s2 = &syms[in[i-2]];
        RansEncSymbol *s1 = &syms[in[i-3]];
        RansEncSymbol *s0 = &syms[in[i-4]];

        RansEncPutSymbol(&rans3, &ptr, s3);
        RansEncPutSymbol(&rans2, &ptr, s2);
        RansEncPutSymbol(&rans1, &ptr, s1);
        RansEncPutSymbol(&rans0, &ptr, s0);
    }

    RansEncFlush(&rans3, &ptr);
    RansEncFlush(&rans2, &ptr);
    RansEncFlush(&rans1, &ptr);
    RansEncFlush(&rans0, &ptr);

    // Finalise block size and return it
    *out_size = (out_end - ptr) + tab_size;

    cp = out_buf;

    *cp++ = 0; // order
    *cp++ = ((*out_size-9)>> 0) & 0xff;
    *cp++ = ((*out_size-9)>> 8) & 0xff;
    *cp++ = ((*out_size-9)>>16) & 0xff;
    *cp++ = ((*out_size-9)>>24) & 0xff;

    *cp++ = (in_size>> 0) & 0xff;
    *cp++ = (in_size>> 8) & 0xff;
    *cp++ = (in_size>>16) & 0xff;
    *cp++ = (in_size>>24) & 0xff;

    // Close the gap between the table and the payload written at the end.
    memmove(out_buf + tab_size, ptr, out_end-ptr);

    return out_buf;
}
// Decoder-side model for one context, filled in while the frequency table
// is parsed by the rans_uncompress_O* functions.
typedef struct {
struct {
int F; // symbol frequency as read from the table
int C; // cumulative frequency at which the symbol's range starts
} fc[256];
unsigned char *R; // malloc()ed lookup of TOTFREQ bytes: cumulative frequency slot -> symbol
} ari_decoder;
/*
 * Decodes a buffer produced by rans_compress_O0().
 *
 * Returns a malloc()ed buffer which the caller must free(), with its length
 * stored in *out_size. Returns NULL if the header is not an order-0 header
 * of matching length, if the frequency table is malformed or truncated, or
 * if memory cannot be allocated.
 *
 * NOTE: reads made while renormalising the rANS states (RansDecRenorm) are
 * not bounds-checked against in_size.
 */
unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
                                  unsigned int *out_size) {
    unsigned char *cp, *cp_end;
    int i, j, x, out_sz, out_end, rle;
    uint32_t in_sz, out_sz32;
    char *out_buf = NULL;
    ari_decoder D;
    RansDecSymbol syms[256];
    RansState R[4];
    uint8_t *ptr;
    const uint32_t mask = (1u << TF_SHIFT)-1;

    memset(&D, 0, sizeof(D));

    // The fixed header alone is 9 bytes.
    if (in_size < 9)
        return NULL;

    cp     = in + 9;
    cp_end = in + in_size;

    if (*in++ != 0) // Order-0 check
        return NULL;

    // Assemble the little-endian sizes in unsigned arithmetic; shifting a
    // promoted byte >= 0x80 left by 24 would overflow a signed int.
    in_sz    = ((uint32_t)in[0]<<0) | ((uint32_t)in[1]<<8)
             | ((uint32_t)in[2]<<16) | ((uint32_t)in[3]<<24);
    out_sz32 = ((uint32_t)in[4]<<0) | ((uint32_t)in[5]<<8)
             | ((uint32_t)in[6]<<16) | ((uint32_t)in[7]<<24);
    if (in_sz != in_size-9)
        return NULL;

    // Sizes are handled as int below, so reject anything that does not fit.
    if (out_sz32 & 0x80000000u)
        return NULL;
    out_sz = (int)out_sz32;

    out_buf = malloc(out_sz);
    if (!out_buf)
        return NULL;

    /* Reverse lookup table: cumulative frequency slot -> symbol */
    D.R = malloc(TOTFREQ);
    if (!D.R)
        goto fail;

    /* Load in the static tables */
    if (cp_end - cp < 1)
        goto fail;
    rle = x = 0;
    j = *cp++;
    do {
        // One iteration consumes at most 4 bytes of table.
        if (cp_end - cp < 4)
            goto fail;

        if ((D.fc[j].F = *cp++) >= 128) {
            D.fc[j].F &= ~128;
            D.fc[j].F = ((D.fc[j].F & 127) << 8) | *cp++;
        }
        D.fc[j].C = x;

        // A table whose frequencies add up to more than TOTFREQ would make
        // the memset below write past the end of D.R.
        if (x + D.fc[j].F > TOTFREQ)
            goto fail;

        RansDecSymbolInit(&syms[j], D.fc[j].C, D.fc[j].F);

        /* Build reverse lookup table */
        memset(&D.R[x], j, D.fc[j].F);

        x += D.fc[j].F;

        // Work out the next symbol: start of a run, inside a run, or a
        // literal symbol value (0 terminates the table).
        if (!rle && j+1 == *cp) {
            j = *cp++;
            rle = *cp++;
        } else if (rle) {
            rle--;
            j++;
            if (j > 255) // run extends beyond the last symbol
                goto fail;
        } else {
            j = *cp++;
        }
    } while(j);

    assert(x < TOTFREQ);

    // The four initial rANS states follow the table.
    if (cp_end - cp < 16)
        goto fail;

    ptr = cp;
    RansDecInit(&R[0], &ptr);
    RansDecInit(&R[1], &ptr);
    RansDecInit(&R[2], &ptr);
    RansDecInit(&R[3], &ptr);

    out_end = (out_sz&~3);

    // Decode four bytes per iteration, one from each interleaved state.
    for (i=0; i < out_end; i+=4) {
        uint32_t m[4] = {R[0] & mask,
                         R[1] & mask,
                         R[2] & mask,
                         R[3] & mask};
        uint8_t c[4] = {D.R[m[0]],
                        D.R[m[1]],
                        D.R[m[2]],
                        D.R[m[3]]};
        out_buf[i+0] = c[0];
        out_buf[i+1] = c[1];
        out_buf[i+2] = c[2];
        out_buf[i+3] = c[3];

        // Inlined RansDecAdvanceSymbolStep for each state.
        R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT);
        R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT);
        R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT);
        R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT);

        R[0] += m[0] - syms[c[0]].start;
        R[1] += m[1] - syms[c[1]].start;
        R[2] += m[2] - syms[c[2]].start;
        R[3] += m[3] - syms[c[3]].start;

        RansDecRenorm(&R[0], &ptr);
        RansDecRenorm(&R[1], &ptr);
        RansDecRenorm(&R[2], &ptr);
        RansDecRenorm(&R[3], &ptr);
    }

    // The remaining out_sz%4 bytes come from states 0, 1, 2 in that order.
    for (i = 0; i < (out_sz&3); i++) {
        unsigned char c = D.R[RansDecGet(&R[i], TF_SHIFT)];
        RansDecAdvanceSymbol(&R[i], &ptr, &syms[c], TF_SHIFT);
        out_buf[out_end+i] = c;
    }

    *out_size = out_sz;

    free(D.R);

    return (unsigned char *)out_buf;

 fail:
    free(D.R);
    free(out_buf);
    return NULL;
}
/*
 * Order-1 rANS compression of in[0..in_size-1]: each byte is coded with a
 * frequency table selected by the byte before it.
 *
 * The input is split into four quarters of in_size/4 bytes, each coded by
 * its own rANS state; the first byte of every quarter is coded with context
 * 0. Bytes left over after the four quarters are coded by the fourth state.
 * Inputs shorter than 4 bytes are handed to rans_compress_O0().
 *
 * The output starts with the same 9 byte header as the order-0 format, with
 * the order byte set to 1.
 *
 * Returns a malloc()ed buffer which the caller must free(), with its length
 * stored in *out_size, or NULL if the buffer could not be allocated.
 */
unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
                                unsigned int *out_size) {
    unsigned char *out_buf, *out_end, *cp;
    unsigned int last_i, tab_size, rle_i, rle_j;
    RansEncSymbol syms[256][256];

    if (in_size < 4)
        return rans_compress_O0(in, in_size, out_size);

    out_buf = malloc(1.05*in_size + 257*257*3 + 9);
    if (!out_buf)
        return NULL;

    out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
    cp = out_buf+9;

    int F[256][256], T[256], i, j;
    unsigned char c;

    memset(F, 0, 256*256*sizeof(int));
    memset(T, 0, 256*sizeof(int));

    // Count how often each byte follows each context, F[context][byte],
    // with per-context totals in T[context]. The first byte has context 0.
    for (last_i=i=0; i<in_size; i++) {
        F[last_i][c = in[i]]++;
        T[last_i]++;
        last_i = c;
    }

    // The first byte of quarters 1..3 is also coded with context 0.
    F[0][in[1*(in_size>>2)]]++;
    F[0][in[2*(in_size>>2)]]++;
    F[0][in[3*(in_size>>2)]]++;
    T[0]+=3;

    // Normalise so T[i] == TOTFREQ
    for (rle_i = i = 0; i < 256; i++) {
        int t2, m, M;
        unsigned int x;

        if (T[i] == 0)
            continue;

        double p = ((double)TOTFREQ)/T[i];
        for (t2 = m = M = j = 0; j < 256; j++) {
            if (!F[i][j])
                continue;

            // Track the most frequent symbol; it absorbs the rounding error.
            if (m < F[i][j])
                m = F[i][j], M = j;

            // A symbol that occurs must keep a non-zero frequency.
            if ((F[i][j] *= p) == 0)
                F[i][j] = 1;
            t2 += F[i][j];
        }

        t2++;
        if (t2 < TOTFREQ)
            F[i][M] += TOTFREQ-t2;
        else
            F[i][M] -= t2-TOTFREQ;

        // Store frequency table
        // Context value i, run-length encoded in the same way as the
        // symbol values below.
        if (rle_i) {
            rle_i--;
        } else {
            *cp++ = i;
            // FIXME: could use order-0 statistics to observe which alphabet
            // symbols are present and base RLE on that ordering instead.
            if (i && T[i-1]) {
                for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
                    ;
                rle_i -= i+1;
                *cp++ = rle_i;
            }
        }

        int *F_i_ = F[i];
        x = 0;
        rle_j = 0;
        for (j = 0; j < 256; j++) {
            if (F_i_[j]) {
                // Symbol value j
                if (rle_j) {
                    rle_j--;
                } else {
                    *cp++ = j;
                    if (!rle_j && j && F_i_[j-1]) {
                        for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
                            ;
                        rle_j -= j+1;
                        *cp++ = rle_j;
                    }
                }

                // Frequency F_i_[j]: one byte if < 128, otherwise two bytes
                // with the top bit of the first one set.
                if (F_i_[j]<128) {
                    *cp++ = F_i_[j];
                } else {
                    *cp++ = 128 | (F_i_[j]>>8);
                    *cp++ = F_i_[j]&0xff;
                }

                RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
                x += F_i_[j];
            }
        }
        *cp++ = 0;
    }
    *cp++ = 0;

    tab_size = cp - out_buf;
    assert(tab_size < 257*257*3);

    RansState rans0, rans1, rans2, rans3;
    RansEncInit(&rans0);
    RansEncInit(&rans1);
    RansEncInit(&rans2);
    RansEncInit(&rans3);

    // rANS emits bytes backwards, so start at the end of the buffer.
    uint8_t* ptr = out_end;

    // iN indexes the byte used as context in quarter N; lN is the byte that
    // follows it, i.e. the symbol that gets encoded. Both walk backwards.
    int isz4 = in_size>>2;
    int i0 = 1*isz4-2;
    int i1 = 2*isz4-2;
    int i2 = 3*isz4-2;
    int i3 = 4*isz4-2;

    unsigned char l0 = in[i0+1];
    unsigned char l1 = in[i1+1];
    unsigned char l2 = in[i2+1];
    unsigned char l3 = in[i3+1];

    // Deal with the remainder
    l3 = in[in_size-1];
    for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
        unsigned char c3 = in[i3];
        RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
        l3 = c3;
    }

    for (; i0 >= 0; i0--, i1--, i2--, i3--) {
        unsigned char c0, c1, c2, c3;
        RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
        RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
        RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
        RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];

        RansEncPutSymbol(&rans3, &ptr, s3);
        RansEncPutSymbol(&rans2, &ptr, s2);
        RansEncPutSymbol(&rans1, &ptr, s1);
        RansEncPutSymbol(&rans0, &ptr, s0);

        l0 = c0;
        l1 = c1;
        l2 = c2;
        l3 = c3;
    }

    // Finally the first byte of each quarter, in context 0.
    RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
    RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
    RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
    RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);

    RansEncFlush(&rans3, &ptr);
    RansEncFlush(&rans2, &ptr);
    RansEncFlush(&rans1, &ptr);
    RansEncFlush(&rans0, &ptr);

    *out_size = (out_end - ptr) + tab_size;

    cp = out_buf;
    *cp++ = 1; // order

    *cp++ = ((*out_size-9)>> 0) & 0xff;
    *cp++ = ((*out_size-9)>> 8) & 0xff;
    *cp++ = ((*out_size-9)>>16) & 0xff;
    *cp++ = ((*out_size-9)>>24) & 0xff;

    *cp++ = (in_size>> 0) & 0xff;
    *cp++ = (in_size>> 8) & 0xff;
    *cp++ = (in_size>>16) & 0xff;
    *cp++ = (in_size>>24) & 0xff;

    // Close the gap between the tables and the payload written at the end.
    memmove(out_buf + tab_size, ptr, out_end-ptr);

    return out_buf;
}
/*
 * Decodes a buffer produced by rans_compress_O1().
 *
 * Returns a malloc()ed buffer which the caller must free(), with its length
 * stored in *out_size. Returns NULL if the header is not an order-1 header
 * of matching length, if the frequency tables are malformed or truncated,
 * or if memory cannot be allocated.
 *
 * NOTE: the decode loops are not validated against corrupt payloads: reads
 * made while renormalising (RansDecRenorm) are not bounds-checked, and a
 * context that has no table in the stream has a NULL lookup table (D[].R).
 */
unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
                                  unsigned int *out_size) {
    unsigned char *cp, *cp_end;
    int i, j, x, out_sz, rle_i, rle_j;
    uint32_t in_sz, out_sz32;
    char *out_buf = NULL;
    ari_decoder D[256];
    RansDecSymbol syms[256][256];
    RansState R[4];
    uint8_t *ptr;
    int isz4, i4[4];
    int l0 = 0, l1 = 0, l2 = 0, l3 = 0;

    memset(D, 0, 256*sizeof(*D));

    // The fixed header alone is 9 bytes.
    if (in_size < 9)
        return NULL;

    cp     = in + 9;
    cp_end = in + in_size;

    if (*in++ != 1) // Order-1 check
        return NULL;

    // Assemble the little-endian sizes in unsigned arithmetic; shifting a
    // promoted byte >= 0x80 left by 24 would overflow a signed int.
    in_sz    = ((uint32_t)in[0]<<0) | ((uint32_t)in[1]<<8)
             | ((uint32_t)in[2]<<16) | ((uint32_t)in[3]<<24);
    out_sz32 = ((uint32_t)in[4]<<0) | ((uint32_t)in[5]<<8)
             | ((uint32_t)in[6]<<16) | ((uint32_t)in[7]<<24);
    if (in_sz != in_size-9)
        return NULL;

    // Sizes are handled as int below, so reject anything that does not fit.
    if (out_sz32 & 0x80000000u)
        return NULL;
    out_sz = (int)out_sz32;

    out_buf = malloc(out_sz);
    if (!out_buf)
        return NULL;

    /* Load in the static tables: one table per context i */
    if (cp_end - cp < 2) // first context value and first symbol value
        goto fail;
    rle_i = 0;
    i = *cp++;
    do {
        rle_j = x = 0;
        j = *cp++;
        do {
            // One iteration consumes at most 4 bytes of table.
            if (cp_end - cp < 4)
                goto fail;

            if ((D[i].fc[j].F = *cp++) >= 128) {
                D[i].fc[j].F &= ~128;
                D[i].fc[j].F = ((D[i].fc[j].F & 127) << 8) | *cp++;
            }
            D[i].fc[j].C = x;

            // A stored frequency of 0 stands for TOTFREQ.
            if (!D[i].fc[j].F)
                D[i].fc[j].F = TOTFREQ;

            // A table whose frequencies add up to more than TOTFREQ would
            // make the memset below write past the end of D[i].R.
            if (x + D[i].fc[j].F > TOTFREQ)
                goto fail;

            RansDecSymbolInit(&syms[i][j], D[i].fc[j].C, D[i].fc[j].F);

            /* Build reverse lookup table */
            if (!D[i].R) {
                D[i].R = malloc(TOTFREQ);
                if (!D[i].R)
                    goto fail;
            }
            memset(&D[i].R[x], j, D[i].fc[j].F);

            x += D[i].fc[j].F;
            assert(x <= TOTFREQ);

            // Next symbol: start of a run, inside a run, or a literal
            // symbol value (0 terminates this context's table).
            if (!rle_j && j+1 == *cp) {
                j = *cp++;
                rle_j = *cp++;
            } else if (rle_j) {
                rle_j--;
                j++;
                if (j > 255) // run extends beyond the last symbol
                    goto fail;
            } else {
                j = *cp++;
            }
        } while(j);

        // Up to 2 bytes select the next context, plus 1 for its first
        // symbol value read at the top of the loop.
        if (cp_end - cp < 3)
            goto fail;

        // Next context, encoded the same way (0 terminates the tables).
        if (!rle_i && i+1 == *cp) {
            i = *cp++;
            rle_i = *cp++;
        } else if (rle_i) {
            rle_i--;
            i++;
            if (i > 255) // run extends beyond the last context
                goto fail;
        } else {
            i = *cp++;
        }
    } while (i);

    // The four initial rANS states follow the tables.
    if (cp_end - cp < 16)
        goto fail;

    ptr = cp;
    RansDecInit(&R[0], &ptr);
    RansDecInit(&R[1], &ptr);
    RansDecInit(&R[2], &ptr);
    RansDecInit(&R[3], &ptr);

    // Each state decodes one quarter of the output; i4[] holds the four
    // write positions and l0..l3 the previous byte (context) of each.
    isz4 = out_sz>>2;
    i4[0] = 0*isz4;
    i4[1] = 1*isz4;
    i4[2] = 2*isz4;
    i4[3] = 3*isz4;

    for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
        uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1),
                         R[1] & ((1u << TF_SHIFT)-1),
                         R[2] & ((1u << TF_SHIFT)-1),
                         R[3] & ((1u << TF_SHIFT)-1)};

        uint8_t c[4] = {D[l0].R[m[0]],
                        D[l1].R[m[1]],
                        D[l2].R[m[2]],
                        D[l3].R[m[3]]};

        out_buf[i4[0]] = c[0];
        out_buf[i4[1]] = c[1];
        out_buf[i4[2]] = c[2];
        out_buf[i4[3]] = c[3];

        // Inlined RansDecAdvanceSymbolStep for each state.
        R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT);
        R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT);
        R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT);
        R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT);

        R[0] += m[0] - syms[l0][c[0]].start;
        R[1] += m[1] - syms[l1][c[1]].start;
        R[2] += m[2] - syms[l2][c[2]].start;
        R[3] += m[3] - syms[l3][c[3]].start;

        RansDecRenorm(&R[0], &ptr);
        RansDecRenorm(&R[1], &ptr);
        RansDecRenorm(&R[2], &ptr);
        RansDecRenorm(&R[3], &ptr);

        l0 = c[0];
        l1 = c[1];
        l2 = c[2];
        l3 = c[3];
    }

    // Remainder: bytes beyond the four quarters all come from state 3.
    for (; i4[3] < out_sz; i4[3]++) {
        unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)];
        out_buf[i4[3]] = c3;
        RansDecAdvanceSymbol(&R[3], &ptr, &syms[l3][c3], TF_SHIFT);
        l3 = c3;
    }

    *out_size = out_sz;

    for (i = 0; i < 256; i++)
        free(D[i].R);

    return (unsigned char *)out_buf;

 fail:
    for (i = 0; i < 256; i++)
        free(D[i].R);
    free(out_buf);
    return NULL;
}
/*-----------------------------------------------------------------------------
* Simple interface to the order-0 vs order-1 encoders and decoders.
*/
/*
 * Compresses in[0..in_size-1] with the order-1 codec when 'order' is
 * non-zero and with the order-0 codec otherwise. The result of the chosen
 * codec is returned unchanged.
 */
unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
                             unsigned int *out_size, int order) {
    if (order)
        return rans_compress_O1(in, in_size, out_size);

    return rans_compress_O0(in, in_size, out_size);
}
/*
 * Decompresses a buffer produced by rans_compress(). The first byte of the
 * input selects the decoder: non-zero for order-1, zero for order-0.
 *
 * Returns the decoder's result, or NULL if the input is too short to hold
 * the 9 byte header (order byte plus two 32-bit sizes) both decoders read.
 */
unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
                               unsigned int *out_size) {
    // Do not look at in[0] (or let the decoders read the size fields)
    // unless the header is actually present.
    if (in_size < 9)
        return NULL;

    return in[0]
        ? rans_uncompress_O1(in, in_size, out_size)
        : rans_uncompress_O0(in, in_size, out_size);
}
#ifdef TEST_MAIN
/*-----------------------------------------------------------------------------
* Main.
*
* This is a simple command line tool for testing order-0 and order-1
* compression using the rANS codec. Simply compile with
*
* gcc -DTEST_MAIN -O3 -I. cram/rANS_static.c -o cram/rANS_static
*
* Usage: cram/rANS_static -o0 < file > file.o0
* cram/rANS_static -d < file.o0 > file2
*
* cram/rANS_static -o1 < file > file.o1
* cram/rANS_static -d < file.o1 > file2
*/
/*
 * Test driver. Compresses (default) or decompresses (-d) the input in
 * blocks, reading from the first file argument or stdin and writing to the
 * second file argument or stdout. -o selects the compression order.
 *
 * Each compressed block is stored as a 4 byte length, written as the raw
 * bytes of a uint32_t, followed by that many bytes of rans_compress()
 * output.
 */
int main(int argc, char **argv) {
    int opt, order = 0;
    unsigned char in_buf[BLK_SIZE2+257*257*3];
    int decode = 0;
    FILE *infp = stdin, *outfp = stdout;
    struct timeval tv1, tv2;
    size_t bytes = 0;

    extern char *optarg;
    extern int optind;

    while ((opt = getopt(argc, argv, "o:d")) != -1) {
        switch (opt) {
        case 'o':
            order = atoi(optarg);
            break;

        case 'd':
            decode = 1;
            break;
        }
    }

    order = order ? 1 : 0; // Only support O(0) and O(1)

    if (optind < argc) {
        if (!(infp = fopen(argv[optind], "rb"))) {
            perror(argv[optind]);
            return 1;
        }
        optind++;
    }

    if (optind < argc) {
        if (!(outfp = fopen(argv[optind], "wb"))) {
            perror(argv[optind]);
            return 1;
        }
        optind++;
    }

    gettimeofday(&tv1, NULL);

    if (decode) {
        for (;;) {
            uint32_t in_size, out_size;
            unsigned char *out;

            if (4 != fread(&in_size, 1, 4, infp))
                break;

            // The block length comes from the file; never read more than
            // in_buf can hold.
            if (in_size > sizeof(in_buf)) {
                fprintf(stderr, "Block too large\n");
                exit(1);
            }

            if (in_size != fread(in_buf, 1, in_size, infp)) {
                fprintf(stderr, "Truncated input\n");
                exit(1);
            }
            out = rans_uncompress(in_buf, in_size, &out_size);
            if (!out)
                abort();

            fwrite(out, 1, out_size, outfp);
            free(out);

            bytes += out_size;
        }
    } else {
        for (;;) {
            uint32_t in_size, out_size;
            unsigned char *out;

            in_size = fread(in_buf, 1, BLK_SIZE, infp);
            if (in_size <= 0)
                break;

            out = rans_compress(in_buf, in_size, &out_size, order);
            // rans_compress returns NULL when it cannot allocate memory.
            if (!out) {
                fprintf(stderr, "Compression failed\n");
                exit(1);
            }

            fwrite(&out_size, 1, 4, outfp);
            fwrite(out, 1, out_size, outfp);
            free(out);

            bytes += in_size;
        }
    }

    gettimeofday(&tv2, NULL);

    // bytes per microsecond is reported as MB/s.
    fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n",
            (long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
            tv2.tv_usec - tv1.tv_usec,
            (double)bytes / ((long)(tv2.tv_sec - tv1.tv_sec)*1000000 +
                             tv2.tv_usec - tv1.tv_usec));
    return 0;
}
#endif
htslib-1.2.1/cram/rANS_static.h 0000664 0000000 0000000 00000003655 12464172677 0016276 0 ustar 00root root 0000000 0000000 /*
* Copyright (c) 2014 Genome Research Ltd.
* Author(s): James Bonfield
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
* Institute nor the names of its contributors may be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef RANS_STATIC_H
#define RANS_STATIC_H
/*
 * Compresses in[0..in_size-1] with the rANS codec: order-1 when 'order' is
 * non-zero, order-0 otherwise. Returns a malloc()ed buffer (to be freed by
 * the caller) with its length stored in *out_size, or NULL if the buffer
 * could not be allocated.
 */
unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
unsigned int *out_size, int order);
/*
 * Decompresses a buffer produced by rans_compress(); the order is taken
 * from the first input byte. Returns a malloc()ed buffer (to be freed by
 * the caller) with its length stored in *out_size, or NULL on failure.
 */
unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
unsigned int *out_size);
#endif /* RANS_STATIC_H */
htslib-1.2.1/cram/sam_header.c 0000664 0000000 0000000 00000066636 12464172677 0016217 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include "cram/sam_header.h"
#include "cram/string_alloc.h"
/*
 * Reports a header parsing problem on stderr.
 *
 * msg  - description of the problem
 * line - start of the offending text (need not be NUL terminated)
 * len  - number of bytes available at 'line'
 * lno  - line number to quote in the message
 *
 * Only the text up to the first newline, or 'len' bytes if there is no
 * newline, is quoted in the message.
 */
static void sam_hdr_error(char *msg, char *line, int len, int lno) {
    int eol = 0;

    while (eol < len && line[eol] != '\n')
        eol++;

    fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, eol, line);
}
/*
 * Debugging aid: prints the parsed form of the header to stdout.
 *
 * Every header type present in the hash is listed together with each of
 * its lines and their tags. This is followed by the @PG chains, each one
 * printed from its pg_end[] entry back along the prev_id links.
 */
void sam_hdr_dump(SAM_hdr *hdr) {
    khint_t it;
    int chain;

    printf("===DUMP===\n");
    for (it = kh_begin(hdr->h); it != kh_end(hdr->h); it++) {
        SAM_hdr_type *first, *cur;
        char code[2];

        if (!kh_exist(hdr->h, it))
            continue;

        first = cur = kh_val(hdr->h, it);

        /* The hash key is the two type characters packed into an int */
        code[0] = kh_key(hdr->h, it)>>8;
        code[1] = kh_key(hdr->h, it)&0xff;

        /* first->prev is the last line added, so its order+1 is the count */
        printf("Type %.2s, count %d\n", code, first->prev->order+1);

        do {
            SAM_hdr_tag *tag;

            printf(">>>%d ", cur->order);
            for (tag = cur->tag; tag; tag = tag->next)
                printf("\"%.2s\":\"%.*s\"\t",
                       tag->str, tag->len-3, tag->str+3);
            putchar('\n');

            cur = cur->next;
        } while (cur != first);
    }

    /* Dump out PG chains */
    printf("\n@PG chains:\n");
    for (chain = 0; chain < hdr->npg_end; chain++) {
        int id = hdr->pg_end[chain];

        printf(" %d:", chain);
        while (id != -1) {
            printf("%s%d(%.*s)",
                   id == hdr->pg_end[chain] ? " " : "->",
                   id, hdr->pg[id].name_len, hdr->pg[id].name);
            id = hdr->pg[id].prev_id;
        }
        printf("\n");
    }

    puts("===END DUMP===");
}
/* Updates the hash tables in the SAM_hdr structure.
 *
 * Called with a newly created header line (h_type) whose two character
 * type is packed into 'type' (first character in the high byte).
 * @SQ, @RG and @PG lines are appended to sh->ref[], sh->rg[] and sh->pg[]
 * respectively and, when they carry a name (SN for @SQ, ID for @RG and
 * @PG), are indexed by that name in the matching hash table. All other
 * line types are left alone.
 *
 * For @PG lines the pg_end[] list of chain tails is maintained too: the
 * new line becomes a tail, and the entry named by its PP tag (when that
 * entry is already known) stops being one.
 *
 * Arrays are grown via a temporary pointer so that a failed realloc()
 * leaves the existing array, and therefore the header, intact and
 * freeable.
 *
 * Returns 0 on success;
 *        -1 on failure
 */
static int sam_hdr_update_hashes(SAM_hdr *sh,
                                 int type,
                                 SAM_hdr_type *h_type) {
    /* Add to reference hash? */
    if ((type>>8) == 'S' && (type&0xff) == 'Q') {
        SAM_hdr_tag *tag;
        int nref = sh->nref;
        void *grown;

        grown = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
        if (!grown)
            return -1;
        sh->ref = grown;

        tag = h_type->tag;
        sh->ref[nref].name = NULL;
        sh->ref[nref].len  = 0;
        sh->ref[nref].ty   = h_type;
        sh->ref[nref].tag  = tag;

        while (tag) {
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
                /* tag->str is "SN:<name>"; keep a private copy of <name> */
                if (!(sh->ref[nref].name = malloc(tag->len)))
                    return -1;
                strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
                sh->ref[nref].name[tag->len-3] = 0;
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
                sh->ref[nref].len = atoi(tag->str+3);
            }
            tag = tag->next;
        }

        if (sh->ref[nref].name) {
            khint_t k;
            int r;
            k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
            if (-1 == r) return -1;
            kh_val(sh->ref_hash, k) = nref;
        }

        sh->nref++;
    }

    /* Add to read-group hash? */
    if ((type>>8) == 'R' && (type&0xff) == 'G') {
        SAM_hdr_tag *tag;
        int nrg = sh->nrg;
        void *grown;

        grown = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
        if (!grown)
            return -1;
        sh->rg = grown;

        tag = h_type->tag;
        sh->rg[nrg].name     = NULL;
        sh->rg[nrg].name_len = 0;
        sh->rg[nrg].ty       = h_type;
        sh->rg[nrg].tag      = tag;
        sh->rg[nrg].id       = nrg;

        while (tag) {
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
                if (!(sh->rg[nrg].name = malloc(tag->len)))
                    return -1;
                strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
                sh->rg[nrg].name[tag->len-3] = 0;
                sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
            }
            tag = tag->next;
        }

        if (sh->rg[nrg].name) {
            khint_t k;
            int r;
            k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
            if (-1 == r) return -1;
            kh_val(sh->rg_hash, k) = nrg;
        }

        sh->nrg++;
    }

    /* Add to program hash? */
    if ((type>>8) == 'P' && (type&0xff) == 'G') {
        SAM_hdr_tag *tag;
        int npg = sh->npg;
        void *grown;

        grown = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
        if (!grown)
            return -1;
        sh->pg = grown;

        tag = h_type->tag;
        sh->pg[npg].name     = NULL;
        sh->pg[npg].name_len = 0;
        sh->pg[npg].ty       = h_type;
        sh->pg[npg].tag      = tag;
        sh->pg[npg].id       = npg;
        sh->pg[npg].prev_id  = -1;

        while (tag) {
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
                if (!(sh->pg[npg].name = malloc(tag->len)))
                    return -1;
                strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
                sh->pg[npg].name[tag->len-3] = 0;
                sh->pg[npg].name_len = strlen(sh->pg[npg].name);
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
                // Resolve later if needed
                khint_t k;

                /* Temporarily terminate the tag text at tag->len so the
                 * PP value can be used as a hash key, then restore it. */
                char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
                k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
                tag->str[tag->len] = tmp;

                if (k != kh_end(sh->pg_hash)) {
                    int p_id = kh_val(sh->pg_hash, k);
                    sh->pg[npg].prev_id = sh->pg[p_id].id;

                    /* Unmark previous entry as a PG termination */
                    if (sh->npg_end > 0 &&
                        sh->pg_end[sh->npg_end-1] == p_id) {
                        sh->npg_end--;
                    } else {
                        int i;
                        for (i = 0; i < sh->npg_end; i++) {
                            if (sh->pg_end[i] == p_id) {
                                memmove(&sh->pg_end[i], &sh->pg_end[i+1],
                                        (sh->npg_end-i-1)*sizeof(*sh->pg_end));
                                sh->npg_end--;
                                /* Each id is listed at most once */
                                break;
                            }
                        }
                    }
                } else {
                    sh->pg[npg].prev_id = -1;
                }
            }
            tag = tag->next;
        }

        if (sh->pg[npg].name) {
            khint_t k;
            int r;
            k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
            if (-1 == r) return -1;
            kh_val(sh->pg_hash, k) = npg;
        }

        /* Add to npg_end[] array. Remove later if we find a PP line */
        if (sh->npg_end >= sh->npg_end_alloc) {
            int new_alloc = sh->npg_end_alloc
                ? sh->npg_end_alloc*2
                : 4;
            void *ends = realloc(sh->pg_end, new_alloc * sizeof(int));
            if (!ends)
                return -1;
            sh->pg_end = ends;
            sh->npg_end_alloc = new_alloc;
        }
        sh->pg_end[sh->npg_end++] = npg;

        sh->npg++;
    }

    return 0;
}
/*
 * Appends a formatted line to an existing SAM header.
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
 * optional new-line. If it contains more than 1 line then multiple lines
 * will be added in order.
 *
 * Len is the length of the text data, or 0 if unknown (in which case
 * it should be null terminated).
 *
 * The text is first appended to sh->text and then parsed from there one
 * line at a time. Each line gets a SAM_hdr_type which is linked into the
 * ring for its type, plus one SAM_hdr_tag per tab separated field (or a
 * single tag holding the whole comment text for @CO lines). Finally the
 * SQ/RG/PG lookup tables are updated.
 *
 * Problems are reported on stderr, quoting the number of the offending
 * line counted from the start of 'lines'.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
    int i, lno, text_offset;
    char *hdr;

    if (!len)
        len = strlen(lines);

    text_offset = ks_len(&sh->text);
    if (EOF == kputsn(lines, len, &sh->text))
        return -1;
    hdr = ks_str(&sh->text) + text_offset;

    /* Each iteration consumes exactly one line, so lno tracks i */
    for (i = 0, lno = 1; i < len; i++, lno++) {
        khint32_t type;
        khint_t k;
        int l_start = i, new;
        SAM_hdr_type *h_type;
        SAM_hdr_tag *h_tag, *last;

        if (hdr[i] != '@') {
            sam_hdr_error("Header line does not start with '@'",
                          &hdr[l_start], len - l_start, lno);
            return -1;
        }

        /* Check the key is really there before reading hdr[i+1..i+2] */
        if (i + 2 >= len ||
            hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
            hdr[i+2] < 'A' || hdr[i+2] > 'z') {
            sam_hdr_error("Header line does not have a two character key",
                          &hdr[l_start], len - l_start, lno);
            return -1;
        }
        type = (hdr[i+1]<<8) | hdr[i+2];

        i += 3;
        if (hdr[i] == '\n')
            continue;

        // Add the header line type
        if (!(h_type = pool_alloc(sh->type_pool)))
            return -1;
        k = kh_put(sam_hdr, sh->h, type, &new);
        if (new < 0)   /* kh_put reports failure through its return code */
            return -1;

        // Form the ring, either with self or other lines of this type
        if (!new) {
            SAM_hdr_type *t = kh_val(sh->h, k), *p;
            p = t->prev;

            assert(p->next == t);
            p->next = h_type;
            h_type->prev = p;

            t->prev = h_type;
            h_type->next = t;
            h_type->order = p->order+1;
        } else {
            kh_val(sh->h, k) = h_type;
            h_type->prev = h_type->next = h_type;
            h_type->order = 0;
        }

        // Parse the tags on this line
        last = NULL;
        if ((type>>8) == 'C' && (type&0xff) == 'O') {
            /* @CO: everything after the tab is one free-text tag */
            int j;

            if (hdr[i] != '\t') {
                sam_hdr_error("Missing tab",
                              &hdr[l_start], len - l_start, lno);
                return -1;
            }

            for (j = ++i; j < len && hdr[j] != '\n'; j++)
                ;

            if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
                return -1;
            h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
            h_tag->len = j-i;
            h_tag->next = NULL;
            if (!h_tag->str)
                return -1;

            i = j;
        } else {
            do {
                int j;

                if (hdr[i] != '\t') {
                    sam_hdr_error("Missing tab",
                                  &hdr[l_start], len - l_start, lno);
                    return -1;
                }

                for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++)
                    ;

                if (!(h_tag = pool_alloc(sh->tag_pool)))
                    return -1;
                h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
                h_tag->len = j-i;
                h_tag->next = NULL;
                if (!h_tag->str)
                    return -1;

                if (h_tag->len < 3 || h_tag->str[2] != ':') {
                    sam_hdr_error("Malformed key:value pair",
                                  &hdr[l_start], len - l_start, lno);
                    return -1;
                }

                if (last)
                    last->next = h_tag;
                else
                    h_type->tag = h_tag;

                last = h_tag;
                i = j;
            } while (i < len && hdr[i] != '\n');
        }

        /* Update RG/SQ hashes */
        if (-1 == sam_hdr_update_hashes(sh, type, h_type))
            return -1;
    }

    return 0;
}
/*
 * Adds a single line to a SAM header.
 * Specify type and one or more key,value pairs, ending with the NULL key.
 * Eg. sam_hdr_add(h, "SQ", "SN", "foo", "LN", "100", NULL).
 *
 * This is a thin wrapper that hands its variable arguments to
 * sam_hdr_vadd() as a va_list.
 *
 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
 *        -1 on failure
 */
int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
    va_list args;
    int ret;

    va_start(args, type);
    ret = sam_hdr_vadd(sh, type, args, NULL);
    /* Every va_start needs a matching va_end in the same function */
    va_end(args);

    return ret;
}
int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
va_list args;
SAM_hdr_type *h_type;
SAM_hdr_tag *h_tag, *last;
int new;
khint32_t type_i = (type[0]<<8) | type[1], k;
#if defined(HAVE_VA_COPY)
va_list ap_local;
#endif
if (EOF == kputc_('@', &sh->text))
return -1;
if (EOF == kputsn(type, 2, &sh->text))
return -1;
if (!(h_type = pool_alloc(sh->type_pool)))
return -1;
if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
return -1;
kh_val(sh->h, k) = h_type;
// Form the ring, either with self or other lines of this type
if (!new) {
SAM_hdr_type *t = kh_val(sh->h, k), *p;
p = t->prev;
assert(p->next = t);
p->next = h_type;
h_type->prev = p;
t->prev = h_type;
h_type->next = t;
h_type->order = p->order + 1;
} else {
h_type->prev = h_type->next = h_type;
h_type->order = 0;
}
last = NULL;
// Any ... varargs
va_start(args, ap);
for (;;) {
char *k, *v;
int idx;
if (!(k = (char *)va_arg(args, char *)))
break;
v = va_arg(args, char *);
if (EOF == kputc_('\t', &sh->text))
return -1;
if (!(h_tag = pool_alloc(sh->tag_pool)))
return -1;
idx = ks_len(&sh->text);
if (EOF == kputs(k, &sh->text))
return -1;
if (EOF == kputc_(':', &sh->text))
return -1;
if (EOF == kputs(v, &sh->text))
return -1;
h_tag->len = ks_len(&sh->text) - idx;
h_tag->str = string_ndup(sh->str_pool,
ks_str(&sh->text) + idx,
h_tag->len);
h_tag->next = NULL;
if (!h_tag->str)
return -1;
if (last)
last->next = h_tag;
else
h_type->tag = h_tag;
last = h_tag;
}
va_end(args);
#if defined(HAVE_VA_COPY)
va_copy(ap_local, ap);
# define ap ap_local
#endif
// Plus the specified va_list params
for (;;) {
char *k, *v;
int idx;
if (!(k = (char *)va_arg(ap, char *)))
break;
v = va_arg(ap, char *);
if (EOF == kputc_('\t', &sh->text))
return -1;
if (!(h_tag = pool_alloc(sh->tag_pool)))
return -1;
idx = ks_len(&sh->text);
if (EOF == kputs(k, &sh->text))
return -1;
if (EOF == kputc_(':', &sh->text))
return -1;
if (EOF == kputs(v, &sh->text))
return -1;
h_tag->len = ks_len(&sh->text) - idx;
h_tag->str = string_ndup(sh->str_pool,
ks_str(&sh->text) + idx,
h_tag->len);
h_tag->next = NULL;
if (!h_tag->str)
return -1;
if (last)
last->next = h_tag;
else
h_type->tag = h_tag;
last = h_tag;
}
va_end(ap);
if (EOF == kputc('\n', &sh->text))
return -1;
int itype = (type[0]<<8) | type[1];
if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
return -1;
return h_type->order;
}
/*
 * Locates the first header line of the given two character 'type'.
 * When ID_key is non-NULL the line must also hold a tag with that two
 * character key whose value equals ID_value.
 *
 * SQ lines searched by SN, and RG or PG lines searched by ID, are
 * answered directly from the prebuilt name hashes. Anything else is
 * found by walking the ring of lines of that type.
 *
 * Returns NULL if no type/ID is found
 */
SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
                           char *ID_key, char *ID_value) {
    SAM_hdr_type *head, *cur;
    khint_t k;
    int itype = (type[0]<<8)|(type[1]);

    /* Special case for types we have prebuilt hashes on */
    if (ID_key) {
        if (type[0]   == 'S' && type[1]   == 'Q' &&
            ID_key[0] == 'S' && ID_key[1] == 'N') {
            k = kh_get(m_s2i, hdr->ref_hash, ID_value);
            if (k == kh_end(hdr->ref_hash))
                return NULL;
            return hdr->ref[kh_val(hdr->ref_hash, k)].ty;
        }

        if (type[0]   == 'R' && type[1]   == 'G' &&
            ID_key[0] == 'I' && ID_key[1] == 'D') {
            k = kh_get(m_s2i, hdr->rg_hash, ID_value);
            if (k == kh_end(hdr->rg_hash))
                return NULL;
            return hdr->rg[kh_val(hdr->rg_hash, k)].ty;
        }

        if (type[0]   == 'P' && type[1]   == 'G' &&
            ID_key[0] == 'I' && ID_key[1] == 'D') {
            k = kh_get(m_s2i, hdr->pg_hash, ID_value);
            if (k == kh_end(hdr->pg_hash))
                return NULL;
            return hdr->pg[kh_val(hdr->pg_hash, k)].ty;
        }
    }

    k = kh_get(sam_hdr, hdr->h, itype);
    if (k == kh_end(hdr->h))
        return NULL;

    head = kh_val(hdr->h, k);
    if (!ID_key)
        return head;

    /* Walk the ring of lines, checking every tag with a matching key */
    cur = head;
    do {
        SAM_hdr_tag *tag;

        for (tag = cur->tag; tag; tag = tag->next) {
            if (tag->str[0] != ID_key[0] || tag->str[1] != ID_key[1])
                continue;

            /* Skip the "XX:" prefix and compare the whole value */
            if (strcmp(tag->str+3, ID_value) == 0)
                return cur;
        }

        cur = cur->next;
    } while (cur != head);

    return NULL;
}
/*
 * As per sam_hdr_find(), but returns a complete line of formatted text
 * for a specific header type/ID combination. If ID_key is NULL then it
 * returns the first line of the specified type.
 *
 * The text is "@" followed by the type and then each tag preceded by a
 * tab; no newline is appended.
 *
 * The returned string is malloced and should be freed by the calling
 * function with free().
 *
 * Returns NULL if no type/ID is found, or if building the text fails.
 */
char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
                        char *ID_key, char *ID_value) {
    SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
    kstring_t ks = KS_INITIALIZER;
    SAM_hdr_tag *tag;
    int failed;

    if (!ty)
        return NULL;

    // Paste together the line from the hashed copy
    failed  = (kputc_('@', &ks) == EOF);
    failed |= (kputs(type, &ks) == EOF);
    for (tag = ty->tag; tag; tag = tag->next) {
        failed |= (kputc_('\t', &ks) == EOF);
        failed |= (kputsn(tag->str, tag->len, &ks) == EOF);
    }

    if (!failed)
        return ks_str(&ks);

    KS_FREE(&ks);
    return NULL;
}
/*
 * Looks for a specific two character key in a single sam header line.
 *
 * If prev is non-NULL it is also filled out with the tag preceding the
 * match, to permit use in key removal: *prev is NULL when the match is
 * the first tag in the list. When the key isn't found *prev is the last
 * tag in the existing list (NULL if the list is empty).
 *
 * The sh argument is not used.
 *
 * Returns the tag pointer on success
 *         NULL on failure
 */
SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
                              SAM_hdr_type *type,
                              char *key,
                              SAM_hdr_tag **prev) {
    SAM_hdr_tag *before = NULL;
    SAM_hdr_tag *cur = type->tag;

    while (cur) {
        if (cur->str[0] == key[0] && cur->str[1] == key[1])
            break;
        before = cur;
        cur = cur->next;
    }

    /* Either the match's predecessor, or the list tail if no match */
    if (prev)
        *prev = before;

    return cur;
}
/*
 * Adds or updates tag key,value pairs in a header line.
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
 * @HD line (although use the sam_hdr_sort_order() function for
 * HD manipulation, which is a wrapper around this function).
 *
 * Specify multiple key,value pairs ending in NULL.
 *
 * The "KEY:value" text of each pair is formatted on to the end of
 * hdr->text and copied into the string pool from there. An existing tag
 * with the same key is pointed at the new text; otherwise a new tag is
 * added to the end of the line's tag list. A tag is only linked in or
 * modified once its new text has been built successfully.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
    va_list ap;
    int ret = 0;

    va_start(ap, type);

    for (;;) {
        char *k, *v, *str;
        int idx, len;
        SAM_hdr_tag *tag, *prev;

        if (!(k = (char *)va_arg(ap, char *)))
            break;
        v = va_arg(ap, char *);

        /* Build the new tag text before touching the tag list */
        idx = ks_len(&hdr->text);
        if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) {
            ret = -1;
            break;
        }
        len = ks_len(&hdr->text) - idx;
        str = string_ndup(hdr->str_pool,
                          ks_str(&hdr->text) + idx,
                          len);
        if (!str) {
            ret = -1;
            break;
        }

        tag = sam_hdr_find_key(hdr, type, k, &prev);
        if (!tag) {
            if (!(tag = pool_alloc(hdr->tag_pool))) {
                ret = -1;
                break;
            }
            /* prev is the list tail here, or NULL for an empty list */
            if (prev)
                prev->next = tag;
            else
                type->tag = tag;

            tag->next = NULL;
        }

        tag->len = len;
        tag->str = str;
    }

    /* Reached on both the success and the failure paths */
    va_end(ap);

    return ret;
}
/* Packs a two character type code into the integer form used as hash key */
#define K(a) (((a)[0]<<8)|((a)[1]))

/*
 * Reconstructs the kstring from the header hash table.
 *
 * The @HD line, if there is one, is written first. The remaining types
 * follow in hash iteration order, with the lines of each type in ring
 * order. The text is built in a temporary string which only replaces
 * hdr->text on success; on failure hdr->text is left as it was.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int sam_hdr_rebuild(SAM_hdr *hdr) {
    /* Order: HD then others */
    kstring_t ks = KS_INITIALIZER;
    khint_t k;

    k = kh_get(sam_hdr, hdr->h, K("HD"));
    if (k != kh_end(hdr->h)) {
        SAM_hdr_type *ty = kh_val(hdr->h, k);
        SAM_hdr_tag *tag;

        if (EOF == kputs("@HD", &ks))
            goto fail;
        for (tag = ty->tag; tag; tag = tag->next) {
            if (EOF == kputc_('\t', &ks))
                goto fail;
            if (EOF == kputsn_(tag->str, tag->len, &ks))
                goto fail;
        }
        if (EOF == kputc('\n', &ks))
            goto fail;
    }

    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
        SAM_hdr_type *t1, *t2;

        if (!kh_exist(hdr->h, k))
            continue;

        /* Already written above */
        if (kh_key(hdr->h, k) == K("HD"))
            continue;

        t1 = t2 = kh_val(hdr->h, k);
        do {
            SAM_hdr_tag *tag;
            char c[2];

            if (EOF == kputc_('@', &ks))
                goto fail;
            c[0] = kh_key(hdr->h, k)>>8;
            c[1] = kh_key(hdr->h, k)&0xff;
            if (EOF == kputsn_(c, 2, &ks))
                goto fail;
            for (tag = t1->tag; tag; tag=tag->next) {
                if (EOF == kputc_('\t', &ks))
                    goto fail;
                if (EOF == kputsn_(tag->str, tag->len, &ks))
                    goto fail;
            }
            if (EOF == kputc('\n', &ks))
                goto fail;
            t1 = t1->next;
        } while (t1 != t2);
    }

    if (ks_str(&hdr->text))
        KS_FREE(&hdr->text);

    hdr->text = ks;

    return 0;

 fail:
    /* Don't leak the partially built text */
    KS_FREE(&ks);
    return -1;
}
/*
 * Creates an empty SAM header, ready to be populated.
 *
 * The new header has a reference count of 1, empty type/SQ/RG/PG hash
 * tables and empty pools for types, tags and strings. If any of these
 * cannot be created, everything created up to that point is released
 * again.
 *
 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
 *         NULL on failure
 */
SAM_hdr *sam_hdr_new(void) {
    /* calloc: all pointers start NULL, which the err path relies on */
    SAM_hdr *sh = calloc(1, sizeof(*sh));

    if (!sh)
        return NULL;

    sh->h = kh_init(sam_hdr);
    if (!sh->h)
        goto err;

    sh->ID_cnt = 1;
    sh->ref_count = 1;

    sh->nref = 0;
    sh->ref  = NULL;
    if (!(sh->ref_hash = kh_init(m_s2i)))
        goto err;

    sh->nrg = 0;
    sh->rg  = NULL;
    if (!(sh->rg_hash = kh_init(m_s2i)))
        goto err;

    sh->npg         = 0;
    sh->pg          = NULL;
    sh->npg_end     = sh->npg_end_alloc = 0;
    sh->pg_end      = NULL;
    if (!(sh->pg_hash = kh_init(m_s2i)))
        goto err;

    KS_INIT(&sh->text);

    if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
        goto err;

    if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
        goto err;

    if (!(sh->str_pool = string_pool_create(8192)))
        goto err;

    return sh;

 err:
    if (sh->h)
        kh_destroy(sam_hdr, sh->h);

    if (sh->ref_hash)
        kh_destroy(m_s2i, sh->ref_hash);

    if (sh->rg_hash)
        kh_destroy(m_s2i, sh->rg_hash);

    if (sh->pg_hash)
        kh_destroy(m_s2i, sh->pg_hash);

    if (sh->tag_pool)
        pool_destroy(sh->tag_pool);

    if (sh->type_pool)
        pool_destroy(sh->type_pool);

    if (sh->str_pool)
        string_pool_destroy(sh->str_pool);

    free(sh);

    return NULL;
}
/*
 * Tokenises a SAM header into a hash table.
 * Also extracts a few bits on specific data types, such as @RG lines.
 *
 * hdr is the header text and len its length, passed on unchanged to
 * sam_hdr_add_lines(). A NULL hdr is permitted and produces an empty
 * header. After a successful parse the @PG chains are linked up with
 * sam_hdr_link_pg(); its return value is not checked.
 *
 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
 *         NULL on failure
 */
SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
    SAM_hdr *sh = sam_hdr_new();

    if (!sh)
        return NULL;

    if (hdr) {
        /* Parse the header, line by line */
        if (sam_hdr_add_lines(sh, hdr, len) == -1) {
            sam_hdr_free(sh);
            return NULL;
        }

        sam_hdr_link_pg(sh);
    }

    return sh;
}
/*
 * Produces a duplicate copy of hdr and returns it.
 *
 * The text of hdr is first regenerated with sam_hdr_rebuild() and the
 * copy is then made by parsing that text into a new SAM_hdr.
 *
 * Returns NULL on failure
 */
SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
    char *text;
    int text_len;

    if (sam_hdr_rebuild(hdr) == -1)
        return NULL;

    text     = sam_hdr_str(hdr);
    text_len = sam_hdr_length(hdr);

    return sam_hdr_parse_(text, text_len);
}
/*! Increments the reference count on hdr.
 *
 * This permits multiple files to share the same header, all calling
 * sam_hdr_free when done, without causing errors for other open files:
 * sam_hdr_free() only releases the header once the count has dropped
 * back to zero.
 */
void sam_hdr_incr_ref(SAM_hdr *hdr) {
    hdr->ref_count += 1;
}
/*! Decrements a reference count on hdr.
 *
 * This permits multiple files to share the same header, all calling
 * sam_hdr_free when done, without causing errors for other open files.
 *
 * If the reference count hits zero then the header is automatically
 * freed. This makes it a synonym for sam_hdr_free(), which it calls.
 */
void sam_hdr_decr_ref(SAM_hdr *hdr) {
sam_hdr_free(hdr);
}
/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * A NULL hdr is accepted and ignored.
 *
 * This is a synonym for sam_hdr_decr_ref().
 */
void sam_hdr_free(SAM_hdr *hdr) {
    int i;

    if (!hdr)
        return;

    hdr->ref_count--;
    if (hdr->ref_count > 0)
        return;

    if (ks_str(&hdr->text))
        KS_FREE(&hdr->text);

    /* Hash tables */
    if (hdr->h)
        kh_destroy(sam_hdr, hdr->h);
    if (hdr->ref_hash)
        kh_destroy(m_s2i, hdr->ref_hash);
    if (hdr->rg_hash)
        kh_destroy(m_s2i, hdr->rg_hash);
    if (hdr->pg_hash)
        kh_destroy(m_s2i, hdr->pg_hash);

    /* Parsed @SQ, @RG and @PG arrays along with the names they own */
    if (hdr->ref) {
        for (i = 0; i < hdr->nref; i++)
            free(hdr->ref[i].name);
        free(hdr->ref);
    }

    if (hdr->rg) {
        for (i = 0; i < hdr->nrg; i++)
            free(hdr->rg[i].name);
        free(hdr->rg);
    }

    if (hdr->pg) {
        for (i = 0; i < hdr->npg; i++)
            free(hdr->pg[i].name);
        free(hdr->pg);
    }

    free(hdr->pg_end);

    /* Pools backing the type, tag and string data */
    if (hdr->type_pool)
        pool_destroy(hdr->type_pool);
    if (hdr->tag_pool)
        pool_destroy(hdr->tag_pool);
    if (hdr->str_pool)
        string_pool_destroy(hdr->str_pool);

    free(hdr);
}
/*
 * Returns the current length of the header text held in hdr->text.
 *
 * The text is not regenerated here; sam_hdr_rebuild() is the function
 * that rewrites it from the parsed form.
 */
int sam_hdr_length(SAM_hdr *hdr) {
return ks_len(&hdr->text);
}
/*
 * Returns the header text held in hdr->text.
 *
 * The pointer refers to the header's own buffer rather than to a copy,
 * and the text is not regenerated here; sam_hdr_rebuild() is the
 * function that rewrites it from the parsed form.
 */
char *sam_hdr_str(SAM_hdr *hdr) {
return ks_str(&hdr->text);
}
/*
 * Looks up a reference sequence by name and returns the numerical ID,
 * this being its index into the hdr->ref[] array.
 *
 * Returns -1 if unknown reference.
 */
int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
    khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);

    if (k == kh_end(hdr->ref_hash))
        return -1;

    return kh_val(hdr->ref_hash, k);
}
/*
 * Looks up a read-group by name (its ID field) and returns a pointer to
 * the matching entry of the hdr->rg[] array, through which the
 * associated tag list can be reached.
 *
 * Returns NULL on failure
 */
SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
    khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);

    if (k == kh_end(hdr->rg_hash))
        return NULL;

    return &hdr->rg[kh_val(hdr->rg_hash, k)];
}
/*
 * Fixes any PP links in @PG headers.
 * If the entries are in order then this doesn't need doing, but in case
 * our header is out of order this goes through the sh->pg[] array
 * setting the prev_id field.
 *
 * Note we can have multiple complete chains. This code should identify the
 * tails of these chains as these are the entries we have to link to in
 * subsequent PP records.
 *
 * The pg_end[] array is rebuilt from scratch: every @PG entry starts out
 * as a candidate tail and is struck off when another entry names it in a
 * PP tag. A header with no @PG lines simply ends up with no chains.
 *
 * Returns 0 on success
 *        -1 on failure (indicating broken PG/PP records)
 */
int sam_hdr_link_pg(SAM_hdr *hdr) {
    int i, j, ret = 0;
    int *ends;

    if (hdr->npg <= 0) {
        /* Nothing to link, and avoid a zero sized realloc */
        hdr->npg_end = 0;
        return 0;
    }

    /* Only commit the new array and its size once the realloc worked */
    ends = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
    if (!ends)
        return -1;
    hdr->pg_end = ends;
    hdr->npg_end_alloc = hdr->npg;

    for (i = 0; i < hdr->npg; i++)
        hdr->pg_end[i] = i;

    for (i = 0; i < hdr->npg; i++) {
        khint_t k;
        SAM_hdr_tag *tag;
        char tmp;
        int parent;

        for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
                break;
        }
        if (!tag) {
            /* Chain start points */
            continue;
        }

        /* Temporarily terminate the tag text at tag->len so the PP value
         * can be used as a hash key, then restore it. */
        tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
        k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
        tag->str[tag->len] = tmp;

        if (k == kh_end(hdr->pg_hash)) {
            /* PP names an ID we do not have; note it and carry on */
            ret = -1;
            continue;
        }

        parent = kh_val(hdr->pg_hash, k);
        hdr->pg[i].prev_id = hdr->pg[parent].id;
        hdr->pg_end[parent] = -1;   /* has a successor, so not a tail */
    }

    /* Compact the surviving tails to the front of the array */
    for (i = j = 0; i < hdr->npg; i++) {
        if (hdr->pg_end[i] != -1)
            hdr->pg_end[j++] = hdr->pg_end[i];
    }
    hdr->npg_end = j;

    return ret;
}
/*
 * Returns a unique ID from a base name.
 *
 * If no @PG line uses 'name' as its ID then name itself is returned.
 * Otherwise candidates of the form "<name>.<counter>" are generated in
 * sh->ID_buf, using and advancing sh->ID_cnt, until one is found that is
 * not already in use.
 *
 * The value returned is valid until the next call to
 * this function.
 */
const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
    khint_t k = kh_get(m_s2i, sh->pg_hash, name);
    if (k == kh_end(sh->pg_hash))
        return name;

    do {
        /* At most 1000 chars of name, so the result fits ID_buf[1024] */
        sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
        k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
    } while (k != kh_end(sh->pg_hash));   /* retry while the ID is taken */

    return sh->ID_buf;
}
/*
 * Add an @PG line.
 *
 * If we wish complete control over this use sam_hdr_add() directly. This
 * function uses that, but attempts to do a lot of tedious house work for
 * you too.
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add() with a series of key,value pairs ending
 * in NULL.
 *
 * Each record gets ID and PN tags derived from 'name', a PP tag naming
 * the chain tail it extends (when there are existing chains), and then
 * the caller's own key,value pairs.
 *
 * NOTE(review): with several chains the same va_list is handed to
 * sam_hdr_vadd() once per chain. That relies on sam_hdr_vadd() iterating
 * over its own copy of the list - confirm for builds without va_copy.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
    va_list args;
    int ret = 0;

    va_start(args, name);

    if (sh->npg_end) {
        /* Copy ends array to avoid us looping while modifying it */
        int *end = malloc(sh->npg_end * sizeof(int));
        int i, nends = sh->npg_end;

        if (!end) {
            va_end(args);
            return -1;
        }

        memcpy(end, sh->pg_end, nends * sizeof(*end));

        for (i = 0; i < nends; i++) {
            if (-1 == sam_hdr_vadd(sh, "PG", args,
                                   "ID", sam_hdr_PG_ID(sh, name),
                                   "PN", name,
                                   "PP", sh->pg[end[i]].name,
                                   NULL)) {
                ret = -1;
                break;
            }
        }

        free(end);
    } else {
        if (-1 == sam_hdr_vadd(sh, "PG", args,
                               "ID", sam_hdr_PG_ID(sh, name),
                               "PN", name,
                               NULL))
            ret = -1;
    }

    /* Every va_start needs a matching va_end, on all paths */
    va_end(args);

    return ret;
}
/*
 * A function to help with construction of CL tags in @PG records.
 * Takes an argc, argv pair and returns a single space-separated string.
 * This string should be deallocated by the calling function.
 *
 * Tabs inside an argument are replaced by spaces, and every argument,
 * the last one included, is followed by a single space.
 *
 * Returns malloced char * on success
 *         NULL on failure
 */
char *stringify_argv(int argc, char *argv[]) {
    size_t total = 1;   /* room for the terminating NUL */
    char *out, *dst;
    int a;

    /* Each argument needs its own length plus one for the separator */
    for (a = 0; a < argc; a++)
        total += strlen(argv[a]) + 1;

    out = malloc(total);
    if (!out)
        return NULL;

    dst = out;
    for (a = 0; a < argc; a++) {
        const char *src;

        for (src = argv[a]; *src; src++)
            *dst++ = (*src == '\t') ? ' ' : *src;
        *dst++ = ' ';
    }
    *dst = 0;

    return out;
}
htslib-1.2.1/cram/sam_header.h 0000664 0000000 0000000 00000032230 12464172677 0016203 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013-2014 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* SAM header parsing.
*
* These functions can be shared between SAM, BAM and CRAM file
* formats as all three internally use the same string encoding for
* header fields.
*/
/*
* TODO.
*
* - Sort order (parse to struct, enum type, updating funcs)
* - Removal of lines.
* - Updating of lines
*/
#ifndef _SAM_HDR_H_
#define _SAM_HDR_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include "cram/string_alloc.h"
#include "cram/pooled_alloc.h"
#include "htslib/khash.h"
#include "htslib/kstring.h"
// Convenience macros for kstring_t, which has the fields l, m and s.
//
// For structure assignment. Eg kstring_t s = KS_INITIALIZER;
#define KS_INITIALIZER {0,0,0}
// For initialisation elsewhere. Eg KS_INIT(&x->str);
// Takes a pointer to the kstring_t and resets l, m and s to 0, 0 and NULL.
#define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL)
// Frees the string subfield only. Assumes 's' itself is static.
// Takes a pointer to the kstring_t. Note that s, l and m are not reset.
#define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0)
/*
* Proposed new SAM header parsing
1 @SQ ID:foo LN:100
2 @SQ ID:bar LN:200
3 @SQ ID:ram LN:300 UR:xyz
4 @RG ID:r ...
5 @RG ID:s ...
Hash table for 2-char @keys without dup entries.
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
HASH("SQ")--\
|
(3) <-> 1 <-> 2 <-> 3 <-> (1)
HASH("RG")--\
|
(5) <-> 4 <-> 5 <-> (4)
Items stored in the hash values also form their own linked lists:
Ie SQ->ID(foo)->LN(100)
SQ->ID(bar)->LN(200)
SQ->ID(ram)->LN(300)->UR(xyz)
RG->ID(r)
*/
/*! A single key:value pair on a header line
 *
 * These form a linked list and hold strings. The strings are
 * allocated from a string_alloc_t pool referenced in the master
 * SAM_hdr structure. Do not attempt to free, malloc or manipulate
 * these strings directly.
 *
 * For most lines str is the text "KK:value", so the value starts at
 * str+3 and is len-3 characters long. For \@CO lines the single tag holds
 * the whole comment text instead.
 */
typedef struct SAM_hdr_tag_s {
struct SAM_hdr_tag_s *next; // next tag on the same line; NULL at the end
char *str; // tag text, held in the string pool
int len; // length of str
} SAM_hdr_tag;
/*! The parsed version of the SAM header string.
 *
 * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type
 * struct via the main hash table h in the SAM_hdr struct.
 *
 * These in turn consist of circular bi-directional linked lists (ie
 * rings) to hold the multiple instances of the same header type
 * code. For example if we have 5 \@SQ lines the primary hash table
 * will key on \@SQ pointing to the first SAM_hdr_type and that in turn
 * will be part of a ring of 5 elements.
 *
 * For each SAM_hdr_type structure we also point to a SAM_hdr_tag
 * structure which holds the tokenised attributes; the tab separated
 * key:value pairs per line.
 */
typedef struct SAM_hdr_item_s {
struct SAM_hdr_item_s *next; // circular
struct SAM_hdr_item_s *prev; // circular; first->prev is the last line added
SAM_hdr_tag *tag; // first tag
int order; // 0 upwards; position of this line within its type
} SAM_hdr_type;
/*! Parsed \@SQ lines */
typedef struct {
char *name; // malloced copy of the SN value; NULL if the line has no SN
uint32_t len; // LN value converted to a number; 0 if the line has no LN
SAM_hdr_type *ty; // the header line this entry was built from
SAM_hdr_tag *tag; // first tag of that line
} SAM_SQ;
/*! Parsed \@RG lines */
typedef struct {
char *name; // malloced copy of the ID value; NULL if the line has no ID
SAM_hdr_type *ty; // the header line this entry was built from
SAM_hdr_tag *tag; // first tag of that line
int name_len; // strlen(name); 0 when name is NULL
int id; // numerical ID; the index of this entry in the rg[] array
} SAM_RG;
/*! Parsed \@PG lines */
typedef struct {
char *name; // malloced copy of the ID value; NULL if the line has no ID
SAM_hdr_type *ty; // the header line this entry was built from
SAM_hdr_tag *tag; // first tag of that line
int name_len; // strlen(name); 0 when name is NULL
int id; // numerical ID; the index of this entry in the pg[] array
int prev_id; // id of the entry named by the PP tag; -1 if none
} SAM_PG;
// sam_hdr: maps a header type code, its two characters packed into an
// int with the first in the high byte, to the first line of that type.
KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*)
// m_s2i: maps a name string to an int; used to go from SN/ID names to
// indices in the ref[], rg[] and pg[] arrays.
KHASH_MAP_INIT_STR(m_s2i, int)
/*! Primary structure for header manipulation
 *
 * The initial header text is held in the text kstring_t, but is also
 * parsed out into SQ, RG and PG arrays. These have a hash table
 * associated with each to allow lookup by ID or SN fields instead of
 * their numeric array indices. Additionally PG has an array to hold
 * the linked list start points (the last in a PP chain).
 *
 * Use the appropriate sam_hdr_* functions to edit the header, and
 * call sam_hdr_rebuild() any time the textual form needs to be
 * updated again.
 */
typedef struct {
kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag
khash_t(sam_hdr) *h; //!< Maps header type code to first line of that type
string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings
pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs
pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs
// @SQ lines / references
int nref; //!< Number of \@SQ lines
SAM_SQ *ref; //!< Array of parsed \@SQ lines
khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
// @RG lines / read-groups
int nrg; //!< Number of \@RG lines
SAM_RG *rg; //!< Array of parsed \@RG lines
khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
// @PG lines / programs
int npg; //!< Number of \@PG lines
int npg_end; //!< Number of terminating \@PG lines
int npg_end_alloc; //!< Size of pg_end field
SAM_PG *pg; //!< Array of parsed \@PG lines
khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
int *pg_end; //!< \@PG chain termination IDs
// @cond internal
char ID_buf[1024]; // temporary buffer; holds IDs made by sam_hdr_PG_ID()
int ID_cnt; // next numeric suffix sam_hdr_PG_ID() will try
int ref_count; // number of uses of this SAM_hdr
// @endcond
} SAM_hdr;
/*! Creates an empty SAM header, ready to be populated.
 *
 * The new header starts with a reference count of 1.
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
 * NULL on failure
 */
SAM_hdr *sam_hdr_new(void);
/*! Tokenises a SAM header into a hash table.
 *
 * Also extracts a few bits on specific data types, such as \@RG lines.
 *
 * hdr is the header text and len its length, or 0 if hdr is null
 * terminated and the length is unknown. A NULL hdr is permitted and
 * gives an empty header.
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 * NULL on failure
 */
SAM_hdr *sam_hdr_parse_(const char *hdr, int len);
/*! Produces a duplicate copy of hdr and returns it.
 *
 * The text form of hdr is rebuilt as a side effect, as the copy is made
 * by re-parsing that text.
 *
 * @return
 * Returns NULL on failure
 */
SAM_hdr *sam_hdr_dup(SAM_hdr *hdr);
/*! Increments a reference count on hdr.
 *
 * This permits multiple files to share the same header, all calling
 * sam_hdr_free when done, without causing errors for other open files.
 * The header is only really freed once the count drops back to zero.
 */
void sam_hdr_incr_ref(SAM_hdr *hdr);
/*! Decrements a reference count on hdr.
*
* This permits multiple files to share the same header, all calling
* sam_hdr_free when done, without causing errors for other open files.
*
* If the reference count hits zero then the header is automatically
* freed. This makes it a synonym for sam_hdr_free().
*/
void sam_hdr_decr_ref(SAM_hdr *hdr);
/*! Deallocates all storage used by a SAM_hdr struct.
*
* This also decrements the header reference count. If after decrementing
* it is still non-zero then the header is assumed to be in use by another
* caller and the free is not done.
*
* This is a synonym for sam_hdr_decr_ref().
*/
void sam_hdr_free(SAM_hdr *hdr);
/*! Returns the current length of the SAM_hdr in text form.
*
* Call sam_hdr_rebuild() first if editing has taken place.
*/
int sam_hdr_length(SAM_hdr *hdr);
/*! Returns the string form of the SAM_hdr.
*
* Call sam_hdr_rebuild() first if editing has taken place.
*/
char *sam_hdr_str(SAM_hdr *hdr);
/*! Appends a formatted line to an existing SAM header.
*
* Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
* optional new-line. If it contains more than 1 line then multiple lines
* will be added in order.
*
* Len is the length of the text data, or 0 if unknown (in which case
* it should be null terminated).
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len);
/*! Adds a single line to a SAM header.
*
* Specify type and one or more key,value pairs, ending with the NULL key.
* Eg. sam_hdr_add(h, "SQ", "SN", "foo", "LN", "100", NULL).
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_add(SAM_hdr *sh, const char *type, ...);
/*! Adds a single line to a SAM header.
*
* This is much like sam_hdr_add() but with the additional va_list
* argument. This is followed by specifying type and one or more
* key,value pairs, ending with the NULL key.
*
* Eg. sam_hdr_vadd(h, "SQ", args, "SN", "foo", "LN", "100", NULL).
*
* The purpose of the additional va_list parameter is to permit other
* varargs functions to call this while including their own additional
* parameters; an example is in sam_hdr_add_PG().
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...);
/*!
* @return
* Returns the first header item matching 'type'. If ID is non-NULL it checks
* for the tag ID: and compares against the specified ID.
*
* Returns NULL if no type/ID is found
*/
SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
char *ID_key, char *ID_value);
/*!
*
* As per SAM_hdr_type, but returns a complete line of formatted text
* for a specific head type/ID combination. If ID is NULL then it returns
* the first line of the specified type.
*
* The returned string is malloced and should be freed by the calling
* function with free().
*
* @return
* Returns NULL if no type/ID is found.
*/
char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
char *ID_key, char *ID_value);
/*! Looks for a specific key in a single sam header line.
*
* If prev is non-NULL it also fills this out with the previous tag, to
* permit use in key removal. *prev is set to NULL when the tag is the first
* key in the list. When a tag isn't found, prev (if non NULL) will be the last
* tag in the existing list.
*
* @return
* Returns the tag pointer on success;
* NULL on failure
*/
SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
SAM_hdr_type *type,
char *key,
SAM_hdr_tag **prev);
/*! Adds or updates tag key,value pairs in a header line.
*
* Eg for adding M5 tags to @SQ lines or updating sort order for the
* @HD line (although use the sam_hdr_sort_order() function for
* HD manipulation, which is a wrapper around this function).
*
* Specify multiple key,value pairs ending in NULL.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...);
/*! Reconstructs the kstring from the header hash table.
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_rebuild(SAM_hdr *hdr);
/*! Looks up a reference sequence by name and returns the numerical ID.
* @return
* Returns -1 if unknown reference.
*/
int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref);
/*! Looks up a read-group by name and returns a pointer to the start of the
* associated tag list.
*
* @return
* Returns NULL on failure
*/
SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg);
/*! Fixes any PP links in @PG headers.
*
* If the entries are in order then this doesn't need doing, but in case
* our header is out of order this goes through the sh->pg[] array
* setting the prev_id field.
*
* @return
* Returns 0 on success;
* -1 on failure (indicating broken PG/PP records)
*/
int sam_hdr_link_pg(SAM_hdr *hdr);
/*! Add an @PG line.
*
* If we wish complete control over this use sam_hdr_add() directly. This
* function uses that, but attempts to do a lot of tedious house work for
* you too.
*
* - It will generate a suitable ID if the supplied one clashes.
* - It will generate multiple @PG records if we have multiple PG chains.
*
* Call it as per sam_hdr_add() with a series of key,value pairs ending
* in NULL.
*
* @return
* Returns 0 on success;
* -1 on failure
*/
int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...);
/*!
* A function to help with construction of CL tags in @PG records.
* Takes an argc, argv pair and returns a single space-separated string.
* This string should be deallocated by the calling function.
*
* @return
* Returns malloced char * on success;
* NULL on failure
*/
char *stringify_argv(int argc, char *argv[]);
#ifdef __cplusplus
}
#endif
#endif /* _SAM_HDR_H_ */
htslib-1.2.1/cram/string_alloc.c 0000664 0000000 0000000 00000010070 12464172677 0016564 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2010 Genome Research Ltd.
Author: Andrew Whitwham
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
A pooled string allocator intended to cut down on the
memory overhead of many small string allocations.
Andrew Whitwham, September 2010.
*/
#include
#include
#include
#include "string_alloc.h"
#define MIN_STR_SIZE 1024
/*
 * Creates an empty string pool.
 *
 * max_length is the initial size in bytes of each backing buffer, and hence
 * the largest single string that fits without growing.  Values smaller than
 * MIN_STR_SIZE are raised to MIN_STR_SIZE.  string_alloc() enlarges the
 * limit on demand.
 *
 * Returns the new pool on success (release it with string_pool_destroy());
 *         NULL if memory could not be allocated.
 */
string_alloc_t *string_pool_create(size_t max_length) {
    string_alloc_t *pool = (string_alloc_t *)malloc(sizeof(*pool));

    if (pool == NULL)
        return NULL;

    pool->max_length = (max_length < MIN_STR_SIZE) ? MIN_STR_SIZE : max_length;
    pool->nstrings   = 0;
    pool->strings    = NULL;

    return pool;
}
/*
 * Internal helper that does the actual memory allocation: grows the
 * strings[] array by one entry and gives that entry a fresh buffer of
 * a_str->max_length bytes.
 *
 * Returns the new entry on success;
 *         NULL on allocation failure (nstrings is left unchanged).
 */
static string_t *new_string_pool(string_alloc_t *a_str) {
    string_t *grown;
    string_t *slot;

    grown = realloc(a_str->strings,
                    (a_str->nstrings + 1) * sizeof(*a_str->strings));
    if (grown == NULL)
        return NULL;
    a_str->strings = grown;

    slot = &grown[a_str->nstrings];
    slot->str = malloc(a_str->max_length);
    if (slot->str == NULL)
        return NULL;

    slot->used = 0;
    a_str->nstrings += 1;

    return slot;
}
/*
 * Releases every backing buffer, the strings[] array and the pool itself.
 * All pointers previously returned by string_alloc() become invalid.
 */
void string_pool_destroy(string_alloc_t *a_str) {
    size_t idx = 0;

    while (idx < a_str->nstrings) {
        free(a_str->strings[idx].str);
        idx++;
    }

    free(a_str->strings);
    free(a_str);
}
/*
 * Reserves 'length' bytes from the pool.
 *
 * The request is carved out of the most recently added buffer when it fits;
 * otherwise a new buffer is added, after first raising max_length if the
 * request is bigger than the current buffer size.
 *
 * Returns a pointer to the reserved bytes on success;
 *         NULL if length is zero or memory could not be allocated.
 */
char *string_alloc(string_alloc_t *a_str, size_t length) {
    string_t *chunk;

    /* length is unsigned, so "<= 0" can only ever mean zero */
    if (length == 0)
        return NULL;

    /* Try the tail of the newest buffer first */
    if (a_str->nstrings > 0) {
        chunk = a_str->strings + (a_str->nstrings - 1);

        if (chunk->used + length < a_str->max_length) {
            char *start = chunk->str + chunk->used;

            chunk->used += length;
            return start;
        }
    }

    /* Oversized request: make future buffers big enough to hold it */
    if (a_str->max_length < length)
        a_str->max_length = length;

    /* Nothing suitable, so start a new buffer */
    chunk = new_string_pool(a_str);
    if (chunk == NULL)
        return NULL;

    chunk->used = length;
    return chunk->str;
}
/*
 * Pool-backed equivalent of strdup(): copies the NUL-terminated string
 * 'instr' into the pool.
 *
 * Returns the copy on success; NULL on allocation failure.
 */
char *string_dup(string_alloc_t *a_str, char *instr) {
    size_t n = strlen(instr);

    return string_ndup(a_str, instr, n);
}
/*
 * Pool-backed equivalent of strndup(): copies at most 'len' characters of
 * 'instr' into the pool and NUL-terminates the result.
 *
 * Returns the copy on success; NULL on allocation failure.
 */
char *string_ndup(string_alloc_t *a_str, char *instr, size_t len) {
    char *copy = string_alloc(a_str, len + 1);

    if (copy != NULL) {
        strncpy(copy, instr, len);
        copy[len] = '\0';
    }

    return copy;
}
htslib-1.2.1/cram/string_alloc.h 0000664 0000000 0000000 00000004421 12464172677 0016574 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2010 Genome Research Ltd.
Author: Andrew Whitwham
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _STRING_ALLOC_H_
#define _STRING_ALLOC_H_

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * A pooled string allocator intended to cut down on the
 * memory overhead of many small string allocations.
 *
 * Andrew Whitwham, September 2010.
 */

/* One backing buffer of the pool. */
typedef struct {
    char *str;     /* start of the buffer */
    size_t used;   /* bytes already handed out from this buffer */
} string_t;

/* The pool itself: a growable array of backing buffers. */
typedef struct {
    size_t max_length;  /* size in bytes of each newly added buffer */
    size_t nstrings;    /* number of entries in strings[] */
    string_t *strings;  /* the backing buffers */
} string_alloc_t;

/* Creates a pool; returns NULL on allocation failure. */
string_alloc_t *string_pool_create(size_t max_length);

/* Frees the pool and every string allocated from it. */
void string_pool_destroy(string_alloc_t *a_str);

/* Reserves 'length' bytes; returns NULL if length is 0 or on failure. */
char *string_alloc(string_alloc_t *a_str, size_t length);

/* Pool-backed strdup(); returns NULL on failure. */
char *string_dup(string_alloc_t *a_str, char *instr);

/* Pool-backed strndup(); returns NULL on failure. */
char *string_ndup(string_alloc_t *a_str, char *instr, size_t len);

/*
 * The extern "C" block must be closed inside the include guard; otherwise
 * a second inclusion from C++ would emit an unmatched '}'.
 */
#ifdef __cplusplus
}
#endif

#endif /* _STRING_ALLOC_H_ */
htslib-1.2.1/cram/thread_pool.c 0000664 0000000 0000000 00000045116 12464172677 0016415 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include
#include
#include
#include
#include
#include
#include
#include "cram/thread_pool.h"
//#define DEBUG
//#define DEBUG_TIME
#define IN_ORDER
#ifdef DEBUG
/*
 * DEBUG helper: returns the index in p->t[] of the calling thread,
 * or -1 if the caller is not one of the pool's workers.
 */
static int worker_id(t_pool *p) {
    pthread_t self = pthread_self();
    int idx = 0;

    while (idx < p->tsize) {
        if (pthread_equal(self, p->t[idx].tid))
            return idx;
        idx++;
    }

    return -1;
}
#endif
/* ----------------------------------------------------------------------------
* A queue to hold results from the thread pool.
*
* Each thread pool may have jobs of multiple types being queued up and
* interleaved, so we allow several results queue per pool.
*
* The jobs themselves are expected to push their results onto their
* appropriate results queue.
*/
/*
 * Appends the outcome 'data' of job 'j' to the tail of the job's results
 * queue, tagged with the job's serial number, and signals result_avail_c.
 *
 * A job without a results queue is accepted and its data is not stored.
 *
 * Returns 0 on success;
 *        -1 on failure
 */
static int t_pool_add_result(t_pool_job *j, void *data) {
    t_results_queue *queue = j->q;
    t_pool_result *node;

#ifdef DEBUG
    fprintf(stderr, "%d: Adding resulting to queue %p, serial %d\n",
            worker_id(j->p), queue, j->serial);
#endif

    /* No results queue is fine if we don't want any results back */
    if (queue == NULL)
        return 0;

    node = malloc(sizeof(*node));
    if (node == NULL)
        return -1;

    node->serial = j->serial;
    node->data   = data;
    node->next   = NULL;

    pthread_mutex_lock(&queue->result_m);

    if (queue->result_tail == NULL)
        queue->result_head = node;
    else
        queue->result_tail->next = node;
    queue->result_tail = node;

    queue->queue_len++;
    queue->pending--;

#ifdef DEBUG
    fprintf(stderr, "%d: Broadcasting result_avail (id %d)\n",
            worker_id(j->p), node->serial);
#endif
    pthread_cond_signal(&queue->result_avail_c);
#ifdef DEBUG
    fprintf(stderr, "%d: Broadcast complete\n", worker_id(j->p));
#endif

    pthread_mutex_unlock(&queue->result_m);

    return 0;
}
/*
 * Core of t_pool_next_result(); the caller must hold q->result_m.
 *
 * Searches the list for the result whose serial equals q->next_serial.
 * If found it is unlinked, next_serial advances and queue_len drops by one.
 *
 * Returns the unlinked result, or NULL if it is not on the list yet.
 */
static t_pool_result *t_pool_next_result_locked(t_results_queue *q) {
    t_pool_result *prev = NULL;
    t_pool_result *cur = q->result_head;

    while (cur && cur->serial != q->next_serial) {
        prev = cur;
        cur = cur->next;
    }

    if (!cur)
        return NULL;

    /* Unlink 'cur'; prev is NULL exactly when cur is the head */
    if (prev)
        prev->next = cur->next;
    else
        q->result_head = cur->next;

    if (q->result_tail == cur)
        q->result_tail = prev;
    if (q->result_head == NULL)
        q->result_tail = NULL;

    q->next_serial++;
    q->queue_len--;

    return cur;
}
/*
 * Non-blocking fetch of the next in-order result from queue 'q'.
 *
 * Results are handed out strictly by serial number, so a later result that
 * finished early is held back until its predecessors have been returned.
 * The caller owns the returned item and should free it (and any internals
 * as appropriate) after use, e.g. with t_pool_delete_result().
 *
 * Returns t_pool_result pointer if a result is ready.
 *         NULL if not.
 */
t_pool_result *t_pool_next_result(t_results_queue *q) {
    t_pool_result *found;

#ifdef DEBUG
    fprintf(stderr, "Requesting next result on queue %p\n", q);
#endif

    pthread_mutex_lock(&q->result_m);
    found = t_pool_next_result_locked(q);
    pthread_mutex_unlock(&q->result_m);

#ifdef DEBUG
    fprintf(stderr, "(q=%p) Found %p\n", q, found);
#endif

    return found;
}
/*
 * Blocking variant of t_pool_next_result(): waits on result_avail_c until
 * the next in-order result is on the queue and returns it.
 *
 * Each wait is bounded to 10 seconds, after which the queue is re-checked.
 * The function itself never times out and never returns NULL.
 */
t_pool_result *t_pool_next_result_wait(t_results_queue *q) {
    t_pool_result *found;

#ifdef DEBUG
    fprintf(stderr, "Waiting for result %d...\n", q->next_serial);
#endif

    pthread_mutex_lock(&q->result_m);
    for (;;) {
        struct timeval now;
        struct timespec deadline;

        found = t_pool_next_result_locked(q);
        if (found != NULL)
            break;

        /* Possible race here now avoided via _locked() call, but in case... */
        gettimeofday(&now, NULL);
        deadline.tv_sec  = now.tv_sec + 10;
        deadline.tv_nsec = now.tv_usec * 1000;

        pthread_cond_timedwait(&q->result_avail_c, &q->result_m, &deadline);
    }
    pthread_mutex_unlock(&q->result_m);

    return found;
}
/*
 * Returns true (1) if the queue holds no finished results and no jobs are
 * still pending for it; false (0) otherwise.
 */
int t_pool_results_queue_empty(t_results_queue *q) {
    int busy;

    pthread_mutex_lock(&q->result_m);
    busy = (q->queue_len != 0 || q->pending != 0);
    pthread_mutex_unlock(&q->result_m);

    return !busy;
}
/*
 * Returns the number of completed results currently held on the queue.
 */
int t_pool_results_queue_len(t_results_queue *q) {
    int completed;

    pthread_mutex_lock(&q->result_m);
    completed = q->queue_len;
    pthread_mutex_unlock(&q->result_m);

    return completed;
}
/*
 * Returns the total number of jobs associated with the queue: completed
 * results waiting to be collected plus jobs still pending.
 */
int t_pool_results_queue_sz(t_results_queue *q) {
    int total;

    pthread_mutex_lock(&q->result_m);
    total = q->pending;
    total += q->queue_len;
    pthread_mutex_unlock(&q->result_m);

    return total;
}
/*
 * Frees result 'r'.  When free_data is true the payload r->data is
 * released with free() as well.  A NULL 'r' is ignored.
 */
void t_pool_delete_result(t_pool_result *r, int free_data) {
    if (r != NULL) {
        /* free(NULL) is a no-op, so an absent payload needs no extra test */
        if (free_data)
            free(r->data);

        free(r);
    }
}
/*
 * Initialises a results queue.
 *
 * The queue starts empty, with serial numbering at zero and no pending
 * jobs.  Release it with t_results_queue_destroy().
 *
 * Returns results queue pointer on success;
 *         NULL on failure (out of memory, or the mutex / condition
 *         variable could not be initialised)
 */
t_results_queue *t_results_queue_init(void) {
    t_results_queue *q = malloc(sizeof(*q));

    if (!q)
        return NULL;

    if (pthread_mutex_init(&q->result_m, NULL) != 0) {
        free(q);
        return NULL;
    }

    if (pthread_cond_init(&q->result_avail_c, NULL) != 0) {
        pthread_mutex_destroy(&q->result_m);
        free(q);
        return NULL;
    }

    q->result_head = NULL;
    q->result_tail = NULL;
    q->next_serial = 0;
    q->curr_serial = 0;
    q->queue_len   = 0;
    q->pending     = 0;

    return q;
}
/*
 * Deallocates a results queue: destroys its mutex and condition variable,
 * overwrites the struct with 0xbb and frees it.  NULL is ignored.
 *
 * Results still linked on the queue are not freed here.
 */
void t_results_queue_destroy(t_results_queue *q) {
#ifdef DEBUG
    fprintf(stderr, "Destroying results queue %p\n", q);
#endif

    if (q != NULL) {
        pthread_mutex_destroy(&q->result_m);
        pthread_cond_destroy(&q->result_avail_c);

        /* Poison the memory so stale use of the queue is easier to spot */
        memset(q, 0xbb, sizeof(*q));
        free(q);

#ifdef DEBUG
        fprintf(stderr, "Destroyed results queue %p\n", q);
#endif
    }
}
/* ----------------------------------------------------------------------------
* The thread pool.
*/
/*
 * Elapsed microseconds from struct timeval t1 to t2.
 *
 * Both arguments are parenthesised so any expression (e.g. *ptr) can be
 * passed.  The arithmetic is done in long long to match the wait_time /
 * total_time accumulators and avoid overflow where time_t is 32 bits wide.
 */
#define TDIFF(t2,t1) (((long long)(t2).tv_sec - (long long)(t1).tv_sec) * 1000000LL + ((long long)(t2).tv_usec - (long long)(t1).tv_usec))
/*
 * A worker thread; 'arg' is this thread's t_pool_worker_t.
 *
 * Each thread waits (holding p->pool_m) for the pool to be non-empty or for
 * p->shutdown to be set.  As soon as a job is available, one of them
 * succeeds in getting the lock, pops the head job, releases the lock and
 * then executes the job.  The job's return value is passed to
 * t_pool_add_result() and the job struct is freed.
 *
 * The shutdown test comes before the pop, so jobs still queued when
 * shutdown is seen are not run by this thread.  The thread leaves via
 * pthread_exit(); the final "return NULL" is never reached.
 */
static void *t_pool_worker(void *arg) {
t_pool_worker_t *w = (t_pool_worker_t *)arg;
t_pool *p = w->p;
t_pool_job *j;
#ifdef DEBUG_TIME
struct timeval t1, t2, t3;
#endif
for (;;) {
// Pop an item off the pool queue
#ifdef DEBUG_TIME
gettimeofday(&t1, NULL);
#endif
pthread_mutex_lock(&p->pool_m);
#ifdef DEBUG_TIME
// Time spent acquiring the pool mutex counts as waiting
gettimeofday(&t2, NULL);
p->wait_time += TDIFF(t2,t1);
w->wait_time += TDIFF(t2,t1);
#endif
// If there is something on the job list and a higher priority
// thread waiting, let it handle this instead.
// while (p->head && p->t_stack_top != -1 && p->t_stack_top < w->idx) {
// pthread_mutex_unlock(&p->pool_m);
// pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
// pthread_mutex_lock(&p->pool_m);
// }
// Sleep until there is a job to run or the pool is shutting down
while (!p->head && !p->shutdown) {
p->nwaiting++;
// Queue drained: wake anything blocked on empty_c (see t_pool_flush)
if (p->njobs == 0)
pthread_cond_signal(&p->empty_c);
#ifdef DEBUG_TIME
gettimeofday(&t2, NULL);
#endif
#ifdef IN_ORDER
// Push this thread to the top of the waiting stack
// (t_stack_top tracks the lowest index among waiting workers)
if (p->t_stack_top == -1 || p->t_stack_top > w->idx)
p->t_stack_top = w->idx;
p->t_stack[w->idx] = 1;
// Releases pool_m while asleep on this worker's own condition variable
pthread_cond_wait(&w->pending_c, &p->pool_m);
p->t_stack[w->idx] = 0;
/* Find new t_stack_top */
{
int i;
p->t_stack_top = -1;
for (i = 0; i < p->tsize; i++) {
if (p->t_stack[i]) {
p->t_stack_top = i;
break;
}
}
}
#else
pthread_cond_wait(&p->pending_c, &p->pool_m);
#endif
#ifdef DEBUG_TIME
gettimeofday(&t3, NULL);
p->wait_time += TDIFF(t3,t2);
w->wait_time += TDIFF(t3,t2);
#endif
p->nwaiting--;
}
if (p->shutdown) {
#ifdef DEBUG_TIME
// NOTE(review): t3 is only assigned inside the wait loop above or after a
// job has run, so it looks uninitialised here when shutdown is seen on the
// first pass without waiting -- confirm.
p->total_time += TDIFF(t3,t1);
#endif
#ifdef DEBUG
fprintf(stderr, "%d: Shutting down\n", worker_id(p));
#endif
pthread_mutex_unlock(&p->pool_m);
pthread_exit(NULL);
}
// Unlink the head job; the list is empty if it had no successor
j = p->head;
if (!(p->head = j->next))
p->tail = NULL;
// The queue was full before this pop, so a blocked dispatcher may now proceed
if (p->njobs-- >= p->qsize)
pthread_cond_signal(&p->full_c);
if (p->njobs == 0)
pthread_cond_signal(&p->empty_c);
pthread_mutex_unlock(&p->pool_m);
// We have job 'j' - now execute it.
// The return value of t_pool_add_result() is not checked here.
t_pool_add_result(j, j->func(j->arg));
#ifdef DEBUG_TIME
pthread_mutex_lock(&p->pool_m);
gettimeofday(&t3, NULL);
p->total_time += TDIFF(t3,t1);
pthread_mutex_unlock(&p->pool_m);
#endif
// Poison the job struct before freeing it
memset(j, 0xbb, sizeof(*j));
free(j);
}
return NULL;
}
/*
* Creates a worker pool of length qsize with tsize worker threads.
*
* Returns pool pointer on success;
* NULL on failure
*/
t_pool *t_pool_init(int qsize, int tsize) {
int i;
t_pool *p = malloc(sizeof(*p));
p->qsize = qsize;
p->tsize = tsize;
p->njobs = 0;
p->nwaiting = 0;
p->shutdown = 0;
p->head = p->tail = NULL;
p->t_stack = NULL;
#ifdef DEBUG_TIME
p->total_time = p->wait_time = 0;
#endif
p->t = malloc(tsize * sizeof(p->t[0]));
pthread_mutex_init(&p->pool_m, NULL);
pthread_cond_init(&p->empty_c, NULL);
pthread_cond_init(&p->full_c, NULL);
pthread_mutex_lock(&p->pool_m);
#ifdef IN_ORDER
if (!(p->t_stack = malloc(tsize * sizeof(*p->t_stack))))
return NULL;
p->t_stack_top = -1;
for (i = 0; i < tsize; i++) {
t_pool_worker_t *w = &p->t[i];
p->t_stack[i] = 0;
w->p = p;
w->idx = i;
w->wait_time = 0;
pthread_cond_init(&w->pending_c, NULL);
if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w))
return NULL;
}
#else
pthread_cond_init(&p->pending_c, NULL);
for (i = 0; i < tsize; i++) {
t_pool_worker_t *w = &p->t[i];
w->p = p;
w->idx = i;
pthread_cond_init(&w->pending_c, NULL);
if (0 != pthread_create(&w->tid, NULL, t_pool_worker, w))
return NULL;
}
#endif
pthread_mutex_unlock(&p->pool_m);
return p;
}
/*
 * Adds an item to the work pool, blocking while the input queue is full.
 *
 * 'func' is called as func(arg) by a worker thread.  If 'q' is non-NULL the
 * job takes the next serial number of that results queue and the value
 * returned by func is appended to it; with a NULL 'q' the return value is
 * not stored.
 *
 * FIXME: Maybe return 1,0,-1 and distinguish between job dispatched vs
 * result returned.  Ie rather than blocking on full queue we're permitted
 * to return early on "result available" event too.
 * Caller would then have a while loop around t_pool_dispatch.
 * Or, return -1 and set errno to EAGAIN to indicate job not yet submitted.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int t_pool_dispatch(t_pool *p, t_results_queue *q,
                    void *(*func)(void *arg), void *arg) {
    t_pool_job *j = malloc(sizeof(*j));

    if (!j)
        return -1;

    j->func = func;
    j->arg = arg;
    j->next = NULL;
    j->p = p;
    j->q = q;

    if (q) {
        pthread_mutex_lock(&q->result_m);
        j->serial = q->curr_serial++;
        q->pending++;
        pthread_mutex_unlock(&q->result_m);
    } else {
        j->serial = 0;
    }

#ifdef DEBUG
    fprintf(stderr, "Dispatching job %p for queue %p, serial %d\n", j, q, j->serial);
#endif

    pthread_mutex_lock(&p->pool_m);

    // Check if queue is full
    while (p->njobs >= p->qsize)
        pthread_cond_wait(&p->full_c, &p->pool_m);

    p->njobs++;

    if (p->tail) {
        p->tail->next = j;
        p->tail = j;
    } else {
        p->head = p->tail = j;
    }

#ifdef DEBUG
    // Report while pool_m is still held: once it is released a worker may
    // run and free 'j', so j->serial must not be read after the unlock.
    fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
#endif

    // Let a worker know we have data.
#ifdef IN_ORDER
    if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting)
        pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
#else
    pthread_cond_signal(&p->pending_c);
#endif

    pthread_mutex_unlock(&p->pool_m);

    return 0;
}
/*
 * As t_pool_dispatch() but with an optional non-block flag.
 *
 * nonblock  0 => block if input queue is full
 * nonblock +1 => don't block if input queue is full, but do not add task
 * nonblock -1 => add task regardless of whether queue is full (over-size)
 *
 * Returns 0 on success;
 *        -1 on failure.  errno is set to EAGAIN when nonblock is +1 and the
 *           queue was full; the other failure is running out of memory.
 *           On every failure the pool mutex has been released and no job
 *           was queued.
 */
int t_pool_dispatch2(t_pool *p, t_results_queue *q,
                     void *(*func)(void *arg), void *arg, int nonblock) {
    t_pool_job *j;

#ifdef DEBUG
    // q may legitimately be NULL, so do not dereference it blindly
    fprintf(stderr, "Dispatching job for queue %p, serial %d\n",
            (void *)q, q ? q->curr_serial : 0);
#endif

    pthread_mutex_lock(&p->pool_m);

    if (p->njobs >= p->qsize && nonblock == 1) {
        pthread_mutex_unlock(&p->pool_m);
        errno = EAGAIN;
        return -1;
    }

    if (!(j = malloc(sizeof(*j)))) {
        // Must not return with the pool mutex still held
        pthread_mutex_unlock(&p->pool_m);
        return -1;
    }

    j->func = func;
    j->arg = arg;
    j->next = NULL;
    j->p = p;
    j->q = q;

    if (q) {
        // Take and advance the serial in one critical section so that a
        // concurrent t_pool_dispatch() cannot be given the same number.
        pthread_mutex_lock(&q->result_m);
        j->serial = q->curr_serial++;
        q->pending++;
        pthread_mutex_unlock(&q->result_m);
    } else {
        j->serial = 0;
    }

    // Check if queue is full
    if (nonblock == 0)
        while (p->njobs >= p->qsize)
            pthread_cond_wait(&p->full_c, &p->pool_m);

    p->njobs++;

    if (p->tail) {
        p->tail->next = j;
        p->tail = j;
    } else {
        p->head = p->tail = j;
    }

#ifdef DEBUG
    fprintf(stderr, "Dispatched (serial %d)\n", j->serial);
#endif

    // Let a worker know we have data.
#ifdef IN_ORDER
    // Keep incoming queue at 1 per running thread, so there is always
    // something waiting when they end their current task.  If we go above
    // this signal to start more threads (if available).  This has the effect
    // of concentrating jobs to fewer cores when we are I/O bound, which in
    // turn benefits systems with auto CPU frequency scaling.
    if (p->t_stack_top >= 0 && p->njobs > p->tsize - p->nwaiting)
        pthread_cond_signal(&p->t[p->t_stack_top].pending_c);
#else
    pthread_cond_signal(&p->pending_c);
#endif

    pthread_mutex_unlock(&p->pool_m);

    return 0;
}
/*
 * Flushes the pool, but doesn't exit.  This simply drains the queue and
 * ensures all worker threads have finished their current task.
 *
 * Waiting workers are woken first; the call then blocks on empty_c until no
 * jobs remain queued and every worker is waiting again.
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int t_pool_flush(t_pool *p) {
#ifdef DEBUG
    fprintf(stderr, "Flushing pool %p\n", p);
#endif

    // Drains the queue
    pthread_mutex_lock(&p->pool_m);

    // Wake up everything for the final sprint!
#ifdef IN_ORDER
    {
        int i;
        for (i = 0; i < p->tsize; i++)
            if (p->t_stack[i])
                pthread_cond_signal(&p->t[i].pending_c);
    }
#else
    // Without IN_ORDER there is no t_stack (it stays NULL) and the workers
    // all sleep on the shared pending_c instead.
    pthread_cond_broadcast(&p->pending_c);
#endif

    while (p->njobs || p->nwaiting != p->tsize)
        pthread_cond_wait(&p->empty_c, &p->pool_m);

    pthread_mutex_unlock(&p->pool_m);

#ifdef DEBUG
    fprintf(stderr, "Flushed complete for pool %p, njobs=%d, nwaiting=%d\n",
            p, p->njobs, p->nwaiting);
#endif

    return 0;
}
/*
 * Destroys a thread pool and frees it.
 *
 * With 'kill' false the workers are sent a shutdown request and joined, so
 * each finishes the job it is currently running.  With 'kill' true each
 * worker is sent SIGINT via pthread_kill() instead and is not joined.
 *
 * Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or
 * t_pool_destroy(p,1) to quickly exit after a fatal error.
 */
void t_pool_destroy(t_pool *p, int kill) {
    int idx;

#ifdef DEBUG
    fprintf(stderr, "Destroying pool %p, kill=%d\n", p, kill);
#endif

    if (kill) {
        for (idx = 0; idx < p->tsize; idx++)
            pthread_kill(p->t[idx].tid, SIGINT);
    } else {
        /* Send shutdown message to worker threads */
        pthread_mutex_lock(&p->pool_m);
        p->shutdown = 1;

#ifdef DEBUG
        fprintf(stderr, "Sending shutdown request\n");
#endif

#ifdef IN_ORDER
        for (idx = 0; idx < p->tsize; idx++)
            pthread_cond_signal(&p->t[idx].pending_c);
#else
        pthread_cond_broadcast(&p->pending_c);
#endif
        pthread_mutex_unlock(&p->pool_m);

#ifdef DEBUG
        fprintf(stderr, "Shutdown complete\n");
#endif

        for (idx = 0; idx < p->tsize; idx++)
            pthread_join(p->t[idx].tid, NULL);
    }

    /* Tear down the synchronisation objects */
    pthread_mutex_destroy(&p->pool_m);
    pthread_cond_destroy(&p->empty_c);
    pthread_cond_destroy(&p->full_c);
#ifdef IN_ORDER
    for (idx = 0; idx < p->tsize; idx++)
        pthread_cond_destroy(&p->t[idx].pending_c);
#else
    pthread_cond_destroy(&p->pending_c);
#endif

#ifdef DEBUG_TIME
    fprintf(stderr, "Total time=%f\n", p->total_time / 1000000.0);
    fprintf(stderr, "Wait time=%f\n", p->wait_time / 1000000.0);
    fprintf(stderr, "%d%% utilisation\n",
            (int)(100 - ((100.0 * p->wait_time) / p->total_time + 0.5)));
    for (idx = 0; idx < p->tsize; idx++)
        fprintf(stderr, "%d: Wait time=%f\n", idx,
                p->t[idx].wait_time / 1000000.0);
#endif

    /* free(NULL) is a no-op, so t_stack needs no guard */
    free(p->t_stack);
    free(p->t);
    free(p);

#ifdef DEBUG
    fprintf(stderr, "Destroyed pool %p\n", p);
#endif
}
/*-----------------------------------------------------------------------------
* Test app.
*/
#ifdef TEST_MAIN
#include
#include
/*
 * Test job: 'arg' points to a malloc'd int holding the job number.
 *
 * Sleeps for a random time of under one second (to coerce job completion
 * out of order), frees 'arg' and returns a malloc'd int holding the square
 * of the job number.  The CPU-bound alternative workload is compiled in
 * but disabled.
 */
void *doit(void *arg) {
    int job = *(int *)arg;
    int x = 0;
    int *res;

    printf("Worker: execute job %d\n", job);

    usleep(random() % 1000000); // to coerce job completion out of order

    if (0) {
        int i, k;

        for (k = 0; k < 100; k++)
            for (i = 0; i < 100000; i++) {
                x++;
                x += x * sin(i);
                x += x * cos(x);
            }
        x *= 100;
        x += job;
    } else {
        x = job * job;
    }

    printf("Worker: job %d terminating, x=%d\n", job, x);

    free(arg);

    res = malloc(sizeof(*res));
    *res = x;

    return res;
}
#define NTHREADS 8
/*
 * Test driver: submits 20 doit() jobs to a pool of NTHREADS workers,
 * printing results as they become available in submission order, then
 * flushes the pool and prints the remaining results.
 */
int main(int argc, char **argv) {
    t_pool *p = t_pool_init(NTHREADS*2, NTHREADS);
    t_results_queue *q = t_results_queue_init();
    t_pool_result *r;
    int job;

    // Dispatch jobs
    for (job = 0; job < 20; job++) {
        int *ip = malloc(sizeof(*ip));

        *ip = job;
        printf("Submitting %d\n", job);
        t_pool_dispatch(p, q, doit, ip);

        // Check for results
        r = t_pool_next_result(q);
        if (r != NULL) {
            printf("RESULT: %d\n", *(int *)r->data);
            t_pool_delete_result(r, 1);
        }
    }

    t_pool_flush(p);

    // Collect whatever is left once every job has finished
    for (r = t_pool_next_result(q); r != NULL; r = t_pool_next_result(q)) {
        printf("RESULT: %d\n", *(int *)r->data);
        t_pool_delete_result(r, 1);
    }

    t_pool_destroy(p, 0);
    t_results_queue_destroy(q);

    return 0;
}
#endif
htslib-1.2.1/cram/thread_pool.h 0000664 0000000 0000000 00000014675 12464172677 0016430 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file implements a thread pool for multi-threading applications.
* It consists of two distinct interfaces: thread pools and results queues.
*
* The pool of threads is given a function pointer and void* data to pass in.
* This means the pool can run jobs of multiple types, albeit first come
* first served with no job scheduling.
*
* Upon completion, the return value from the function pointer is added to
* a results queue. We may have multiple queues in use for the one pool.
*
* An example: reading from BAM and writing to CRAM with 10 threads. We'll
* have a pool of 10 threads and two results queues holding decoded BAM blocks
* and encoded CRAM blocks respectively.
*/
#ifndef _THREAD_POOL_H_
#define _THREAD_POOL_H_
#include
struct t_pool;
struct t_results_queue;
/* One unit of work queued on a thread pool; jobs form a singly linked list. */
typedef struct t_pool_job {
void *(*func)(void *arg); // function a worker thread calls to run the job
void *arg; // argument passed to func
struct t_pool_job *next; // next job in the pool's pending list, or NULL
struct t_pool *p; // pool the job was dispatched to
struct t_results_queue *q; // queue that receives func's return value; may be NULL
int serial; // taken from q->curr_serial at dispatch (0 when q is NULL)
} t_pool_job;
/* One completed job's outcome, held as a node on a results queue. */
typedef struct t_res {
struct t_res *next; // next result on the queue, or NULL
int serial; // sequential number for ordering
void *data; // result itself
} t_pool_result;
struct t_pool;
/*
 * Per-thread bookkeeping for a single worker thread.
 */
typedef struct {
struct t_pool *p; // pool this worker refers to
int idx; // NOTE(review): presumably this worker's index in t_pool.t[] -- confirm
pthread_t tid; // pthread identifier of the worker
pthread_cond_t pending_c; // per-worker condition variable; NOTE(review): presumably signalled when a job is pending -- confirm
long long wait_time; // accumulated wait time (units are not visible in this header)
} t_pool_worker_t;
/*
 * The thread pool itself: a linked list of pending jobs, the array of
 * workers, and the mutex / condition variables declared alongside them.
 */
typedef struct t_pool {
int qsize; // size of queue
int njobs; // pending job count
int nwaiting; // how many workers waiting for new jobs
int shutdown; // true if pool is being destroyed
// queue of pending jobs
t_pool_job *head, *tail;
// threads
int tsize; // number of worker threads (the tsize passed to t_pool_init)
t_pool_worker_t *t;
// Mutexes
pthread_mutex_t pool_m; // used when updating head/tail
pthread_cond_t empty_c;
pthread_cond_t pending_c; // not empty
pthread_cond_t full_c;
// array of worker IDs free
int *t_stack, t_stack_top;
// Debugging to check wait time
long long total_time, wait_time;
} t_pool;
/*
 * A queue of completed job results.  Several results queues may be in use
 * with the same pool.
 */
typedef struct t_results_queue {
t_pool_result *result_head; // first result in the list
t_pool_result *result_tail; // last result in the list
int next_serial; // NOTE(review): presumably the serial number of the next result to hand out -- confirm
int curr_serial; // NOTE(review): presumably the serial number given to the next dispatched job -- confirm
int queue_len; // number of items in queue
int pending; // number of pending items (in progress or in pool list)
pthread_mutex_t result_m; // mutex declared for this queue
pthread_cond_t result_avail_c; // condition variable; by its name, signalled when a result becomes available
} t_results_queue;
/*
* Creates a worker pool of length qsize with tsize worker threads.
*
* Returns pool pointer on success;
* NULL on failure
*/
t_pool *t_pool_init(int qsize, int tsize);
/*
* Adds an item to the work pool.
*
* FIXME: Maybe return 1,0,-1 and distinguish between job dispatched vs
* result returned. Ie rather than blocking on full queue we're permitted
* to return early on "result available" event too.
* Caller would then have a while loop around t_pool_dispatch.
* Or, return -1 and set errno to E_AGAIN to indicate job not yet submitted.
*
* Returns 0 on success
* -1 on failure
*/
int t_pool_dispatch(t_pool *p, t_results_queue *q,
void *(*func)(void *arg), void *arg);
/*
* Same arguments as t_pool_dispatch plus a 'nonblock' flag.
* NOTE(review): presumably a true 'nonblock' makes the call return rather
* than block when the queue is full -- confirm against thread_pool.c.
*/
int t_pool_dispatch2(t_pool *p, t_results_queue *q,
void *(*func)(void *arg), void *arg, int nonblock);
/*
* Flushes the pool, but doesn't exit. This simply drains the queue and
* ensures all worker threads have finished their current task.
*
* Returns 0 on success;
* -1 on failure
*/
int t_pool_flush(t_pool *p);
/*
* Destroys a thread pool. If 'kill' is true the threads are terminated now,
* otherwise they are joined into the main thread so they will finish their
* current work load.
*
* Use t_pool_destroy(p,0) after a t_pool_flush(p) on a normal shutdown or
* t_pool_destroy(p,1) to quickly exit after a fatal error.
*/
void t_pool_destroy(t_pool *p, int kill);
/*
* Pulls a result off the head of the result queue. Caller should
* free it (and any internals as appropriate) after use. This doesn't
* wait for a result to be present.
*
* Results will be returned in strict order.
*
* Returns t_pool_result pointer if a result is ready.
* NULL if not.
*/
t_pool_result *t_pool_next_result(t_results_queue *q);
/*
* NOTE(review): presumably the variant of t_pool_next_result that waits
* for a result to become available -- confirm against thread_pool.c.
*/
t_pool_result *t_pool_next_result_wait(t_results_queue *q);
/*
* Frees a result 'r' and if free_data is true also frees
* the internal r->data result too.
*/
void t_pool_delete_result(t_pool_result *r, int free_data);
/*
* Initialises a results queue.
*
* Returns results queue pointer on success;
* NULL on failure
*/
t_results_queue *t_results_queue_init(void);
/* Deallocates memory for a results queue */
void t_results_queue_destroy(t_results_queue *q);
/*
* Returns true if there are no items on the finished results queue and
* also none still pending.
*/
int t_pool_results_queue_empty(t_results_queue *q);
/*
* Returns the number of completed jobs on the results queue.
*/
int t_pool_results_queue_len(t_results_queue *q);
/*
* Returns the number of completed jobs plus the number queued up to run.
*/
int t_pool_results_queue_sz(t_results_queue *q);
#endif /* _THREAD_POOL_H_ */
htslib-1.2.1/cram/vlen.c 0000664 0000000 0000000 00000027725 12464172677 0015067 0 ustar 00root root 0000000 0000000 /*
Author: James Bonfield (jkb@sanger.ac.uk)
Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1 Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2 Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Copyright (c) 2004, 2009, 2011-2012 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include
#include
#include
#include "cram/vlen.h"
#include "cram/os.h"
#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif
#ifndef ABS
#define ABS(a) ((a)>0?(a):-(a))
#endif
/* #define DEBUG_printf(a,n) printf(a,n) */
#define DEBUG_printf(a,n)
/*
 * vlen: 27/10/95 written by James Bonfield, jkb@mrc-lmb.cam.ac.uk
 *
 * Given sprintf style arguments these routines return an upper bound on
 * the size of buffer (including the terminating NUL) needed to use with
 * sprintf.  They err on the side of caution by being simplistic: all
 * numbers are assumed to be of maximum length.
 *
 * Handles the usual conversions (%[%diuoxXcfFeEgGaApns]) with the 'hh',
 * 'h', 'l', 'll' and 'z' size modifiers, but not the 'j', 't' or 'L'
 * modifiers nor the 'wide' character conversions (%C, %S, %lc, %ls).
 * Width and precision are honoured, including the %*.* notations; a
 * negative '*' width means left-justify and a negative '*' precision
 * means "no precision", exactly as in printf.
 * Additionally, some of the more dubious (but probably illegal) cases
 * are supported (eg "%10%" will expand to "         %" on many
 * systems).
 *
 * We also assume that the largest integer and largest pointer are 64
 * bits, which at least covers the machines we'll need it for.
 */

/*
 * No-op fallback so that this block also builds in isolation; the file
 * normally defines DEBUG_printf before this point.
 */
#ifndef DEBUG_printf
#define DEBUG_printf(a,n)
#endif

/*
 * va_copy is required to be a macro from C99 onwards, so it can be
 * detected directly as well as through the configure-time HAVE_VA_COPY.
 */
#if defined(HAVE_VA_COPY) || defined(va_copy)
#  define VFLEN_USE_VA_COPY 1
#endif

/* Returns the larger of two lengths (each argument evaluated once). */
static long vflen_max(long a, long b)
{
    return a > b ? a : b;
}

/*
 * Computes the buffer size needed to sprintf 'fmt' with the arguments
 * in 'ap'.
 *
 * When va_copy is available the caller's 'ap' is left untouched, so it
 * can be passed straight on to vsprintf afterwards.  The caller always
 * remains responsible for calling va_end on its own list.
 *
 * Returns the estimated size in bytes, including the trailing NUL.
 */
int vflen(char *fmt, va_list ap)
{
    int len = 0;
    char *cp, c;
    long long l;
    int i;
    double d;

    /*
     * Walking the arguments modifies the va_list.  Depending on the ABI
     * va_list may be an array type (so the caller's copy would be
     * advanced too); work on a private copy whenever we can.
     */
#ifdef VFLEN_USE_VA_COPY
    va_list ap_local;
    va_copy(ap_local, ap);
#  define VFLEN_AP ap_local
#else
#  define VFLEN_AP ap
#endif

    for (cp = fmt; *cp; cp++) {
        switch (*cp) {

        /* A format specifier */
        case '%': {
            char *endp;
            long conv_len1 = 0, conv_len2 = 0, conv_len;
            int arg_size;

            /*
             * Firstly, strip the modifier flags (+-#0 and [space]).
             * '0' must be consumed here so that "%0*d" still sees the
             * '*' as its width.
             */
            for (; (c = *++cp);) {
                if ('#' == c)
                    len += 2; /* Worst case of "0x" */
                else if ('-' == c || '+' == c || ' ' == c)
                    len++;
                else if ('0' != c)
                    break;
            }

            /* Width specifier */
            l = strtol(cp, &endp, 10);
            if (endp != cp) {
                cp = endp;
                conv_len1 = (long)l;
            } else if (*cp == '*') {
                conv_len1 = va_arg(VFLEN_AP, int);
                cp++;
            }
            if (conv_len1 < 0)
                conv_len1 = -conv_len1; /* negative width == '-' flag */

            /* Precision specifier */
            if ('.' == *cp) {
                cp++;
                conv_len2 = strtol(cp, &endp, 10);
                if (endp != cp) {
                    cp = endp;
                } else if (*cp == '*') {
                    conv_len2 = va_arg(VFLEN_AP, int);
                    cp++;
                }
                if (conv_len2 < 0)
                    conv_len2 = 0; /* negative precision == omitted */
            }
            conv_len = vflen_max(conv_len1, conv_len2);

            /* Size modifier */
            if ('h' == *cp) {
                arg_size = -1; /* short */
                cp++;
                if ('h' == *cp) {
                    arg_size = -2; /* char */
                    cp++;
                }
            } else if ('l' == *cp) {
                arg_size = 1; /* long */
                cp++;
                if ('l' == *cp) {
                    arg_size = 2; /* long long */
                    cp++;
                }
            } else if ('z' == *cp) {
                arg_size = 3; /* size_t */
                cp++;
            } else {
                arg_size = 0; /* int */
            }

            /* The actual type */
            switch (*cp) {
            case '\0':
                /*
                 * The format ended in the middle of a specifier (eg a
                 * trailing '%').  Step back so that the outer loop's
                 * increment lands on the NUL rather than beyond it.
                 */
                cp--;
                break;

            case '%':
                /*
                 * Not real ANSI I suspect, but we'll allow for the
                 * completely daft "%10%" example.
                 */
                len += (int)vflen_max(conv_len1, 1);
                break;

            case 'd':
            case 'i':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                /* Remember: char and short are sent as int on the stack */
                if (arg_size == 1)
                    l = va_arg(VFLEN_AP, long);
                else if (arg_size == 2)
                    l = va_arg(VFLEN_AP, long long);
                else if (arg_size == 3)
                    l = (long long)va_arg(VFLEN_AP, size_t);
                else
                    l = va_arg(VFLEN_AP, int);
                DEBUG_printf("%d", l);

                /*
                 * No 64-bit number needs more than 23 characters (22
                 * octal digits plus a sign).  A precision gives the
                 * minimum digit count, to which a '-' may be added.
                 */
                len += (int)vflen_max(vflen_max(conv_len1, conv_len2 + 1),
                                      23);
                break;

            case 'c':
                i = va_arg(VFLEN_AP, int);
                DEBUG_printf("%c", i);
                /*
                 * Note that %10c and %.10c act differently.
                 * Besides, I think precision is not really allowed for %c.
                 */
                len += (int)vflen_max(conv_len1,
                                      i >= 0x80 ? (long)MB_CUR_MAX : 1);
                break;

            case 'f':
            case 'F':
                d = va_arg(VFLEN_AP, double);
                DEBUG_printf("%f", d);
                /*
                 * Maybe "Inf" or "NaN", but we'll not worry about that.
                 * The worst case is 317 characters
                 * (-1[308 zeros].000000) without a precision.  That's
                 * horrid, so use 317 or 15 depending on how large the
                 * number is, as most floats aren't that long.  The
                 * precision replaces the default 6 decimals, and the
                 * result is never allowed to drop below the width.
                 */
                l = (d > 1000000 || d < -1000000) ? 317 : 15;
                if (conv_len2)
                    l += conv_len2 - 6;
                len += (int)vflen_max((long)l, conv_len1 + 2);
                break;

            case 'e':
            case 'E':
            case 'g':
            case 'G':
                d = va_arg(VFLEN_AP, double);
                DEBUG_printf("%g", d);
                /*
                 * Maybe "Inf" or "NaN", but we'll not worry about that.
                 * Worst case is '-' + digit + '.' + precision digits +
                 * "e+xxx", ie precision + 8 (14 with the default
                 * precision of 6).
                 */
                len += (int)vflen_max(conv_len1,
                                      vflen_max(conv_len2, 6) + 8);
                break;

            case 'a':
            case 'A':
                d = va_arg(VFLEN_AP, double);
                DEBUG_printf("%a", d);
                /*
                 * Hexadecimal floating point: "-0x1." + hex digits +
                 * "p-1022", ie digits + 11.  A double needs 13 hex
                 * digits when no precision is given.
                 */
                len += (int)vflen_max(conv_len1,
                                      vflen_max(conv_len2, 13) + 11);
                break;

            case 'p': {
                void *pv = va_arg(VFLEN_AP, void *);
                /*
                 * Max pointer is 64bits == 16 chars (on alpha),
                 * == 20 with + "0x".
                 */
                DEBUG_printf("%p", pv);
                (void)pv;
                len += (int)vflen_max(conv_len, 20);
                break;
            }

            case 'n':
                /*
                 * Produces no output, but its pointer argument must
                 * still be consumed to keep later arguments aligned.
                 */
                if (arg_size == -2)
                    (void)va_arg(VFLEN_AP, signed char *);
                else if (arg_size == -1)
                    (void)va_arg(VFLEN_AP, short *);
                else if (arg_size == 1)
                    (void)va_arg(VFLEN_AP, long *);
                else if (arg_size == 2)
                    (void)va_arg(VFLEN_AP, long long *);
                else if (arg_size == 3)
                    (void)va_arg(VFLEN_AP, size_t *);
                else
                    (void)va_arg(VFLEN_AP, int *);
                break;

            case 's': {
                char *s = va_arg(VFLEN_AP, char *);
                /* Many C libraries print "(null)" for a NULL string */
                long slen = s ? (long)strlen(s) : 6;
                DEBUG_printf("%s", s);
                if (!conv_len2) {
                    len += (int)vflen_max(conv_len, slen);
                } else {
                    len += (int)conv_len;
                }
                break;
            }

            default:
                /* wchar_t types of 'C' and 'S' aren't supported */
                DEBUG_printf("Arg is %c\n", *cp);
            }
            break;
        }

        default:
            DEBUG_printf("%c", *cp);
            len++;
        }
    }

    /* Only the list we created ourselves is ours to end */
#ifdef VFLEN_USE_VA_COPY
    va_end(ap_local);
#endif
#undef VFLEN_AP

    return len+1; /* one for the null character */
}

/*
 * Variadic front end to vflen(): returns the buffer size (including the
 * trailing NUL) needed to sprintf 'fmt' with the given arguments.
 */
int flen(char *fmt, ...)
{
    va_list args;
    int len;

    va_start(args, fmt);
    len = vflen(fmt, args);
    va_end(args); /* every va_start needs a va_end in the same function */

    return len;
}
/*
 * Disabled self-test.  For each sample format it prints strlen() of the
 * real sprintf() output next to the flen() estimate so that the two can
 * be compared by eye.  Change "#if 0" to "#if 1" to build it.
 */
#if 0
int main() {
int l;
char buf[10000];
sprintf(buf, "d: %d\n", 500);
l = flen("d: %d\n", 500);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "");
l = flen("");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%s\n","test");
l = flen("%s\n", "test");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%c\n", 'a');
l = flen("%c\n", 'a');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%31.30f\n", -9999.99);
l = flen("%31.30f\n", -9999.99);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%f\n", -1e308);
l = flen("%f\n", -1e308);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.9f\n", -1e308);
l = flen("%.9f\n", -1e308);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%10.20f\n", -1.999222333);
l = flen("%10.20f\n", -1.999222333);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%#g\n", -3.14159265358e-222);
l = flen("%#g\n", -3.1415927e-222);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%e\n", -123456789123456789.1);
l = flen("%e\n", -123456789123456789.1);
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two");
l = flen("%c %f %d %s %c %g %ld %s\n", 'a', 3.1, 9, "one", 'b', 4.2, 9, "two");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%*.*e %*c\n", 10, 5, 9.0, 20, 'x');
l = flen("%*.*e %*c\n", 10, 5, 9.0, 20, 'x');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%10c\n", 'z');
l = flen("%10c\n", 'z');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.10c\n", 'z');
l = flen("%.10c\n", 'z');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%10d\n", 'z');
l = flen("%10d\n", 'z');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.10d\n", 'z');
l = flen("%.10d\n", 'z');
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%10%\n");
l = flen("%10%\n");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.10%\n");
l = flen("%.10%\n");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%s\n", "0123456789");
l = flen("%s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%5s\n", "0123456789");
l = flen("%5s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%50s\n", "0123456789");
l = flen("%50s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.5s\n", "0123456789");
l = flen("%.5s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%.50s\n", "0123456789");
l = flen("%.50s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%5.50s\n", "0123456789");
l = flen("%5.50s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
sprintf(buf, "%50.5s\n", "0123456789");
l = flen("%50.5s\n", "0123456789");
printf("%d %d\n\n", strlen(buf), l);
return 0;
}
#endif
htslib-1.2.1/cram/vlen.h 0000664 0000000 0000000 00000003427 12464172677 0015065 0 ustar 00root root 0000000 0000000 /*
Author: James Bonfield (jkb@sanger.ac.uk)
Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL
All rights reserved
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1 Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2 Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _VLEN_H_
#define _VLEN_H_

#include <stdarg.h> /* va_list, needed by the vflen() prototype */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Both routines take sprintf-style arguments and return the size of
 * buffer that should be allocated before calling sprintf/vsprintf with
 * the same format and arguments.
 *
 * vflen() takes the arguments as a va_list; flen() takes them directly.
 */
extern int vflen(char *fmt, va_list ap);
extern int flen(char *fmt, ...);

#ifdef __cplusplus
}
#endif

#endif /* _VLEN_H_ */
htslib-1.2.1/cram/zfio.c 0000664 0000000 0000000 00000011523 12464172677 0015057 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2009-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif
#include
#include
#include "cram/os.h"
#include "cram/zfio.h"
/* ------------------------------------------------------------------------ */
/* Some wrappers around FILE * vs gzFile *, allowing for either */
/*
* gzopen() works on both compressed and uncompressed data, but it has
* a significant performance hit even for uncompressed data (tested as
* 25s using FILE* to 46s via gzOpen and 66s via gzOpen when gzipped).
*
* Hence we use our own wrapper 'zfp' which is a FILE* when uncompressed
* and gzFile* when compressed. This also means we could hide bzopen in
* there too if desired.
*/
/*
 * Reports the current offset of a zfp.  Only the plain FILE* case is
 * supported; gz-backed handles yield -1.
 */
off_t zftello(zfp *zf) {
    if (!zf->fp)
        return -1;

    return ftello(zf->fp);
}
/*
 * Repositions a zfp.  Only the plain FILE* case is supported; gz-backed
 * handles yield -1.
 */
int zfseeko(zfp *zf, off_t offset, int whence) {
    if (!zf->fp)
        return -1;

    return fseeko(zf->fp, offset, whence);
}
/*
 * Reads one line into 'line' (at most 'size' bytes), dispatching to
 * fgets for a plain file or gzgets for a gz-backed one.
 */
char *zfgets(char *line, int size, zfp *zf) {
    return zf->fp ? fgets(line, size, zf->fp)
                  : gzgets(zf->gz, line, size);
}
/*
 * A wrapper for either fputs or gzputs depending on what has been
 * opened.
 *
 * Returns a non-negative value on success;
 *         EOF on failure
 */
int zfputs(char *line, zfp *zf) {
    if (zf->fp)
        return fputs(line, zf->fp);

    /*
     * gzputs returns the number of characters written, or -1 on error;
     * writing an empty string legitimately returns 0.
     */
    return gzputs(zf->gz, line) < 0 ? EOF : 0;
}
/*
 * Returns the next input character while leaving it unread, ie a getc
 * immediately followed by the matching ungetc.  EOF is returned as-is
 * and nothing is pushed back in that case.
 */
int zfpeek(zfp *zf) {
    int next;

    if (!zf->fp) {
        next = gzgetc(zf->gz);
        if (next != EOF)
            gzungetc(next, zf->gz);
        return next;
    }

    next = getc(zf->fp);
    if (next != EOF)
        ungetc(next, zf->fp);
    return next;
}
/* A replacement for either feof or gzeof */
int zfeof(zfp *zf) {
    if (zf->fp)
        return feof(zf->fp);

    return gzeof(zf->gz);
}
/*
 * A replacement for either fopen or gzopen.
 *
 * Unless 'mode' asks for compression with a 'z' in its first or second
 * character, the file is first opened with fopen.  Streams opened for
 * reading are sniffed for the gzip magic number (0x1f 0x8b); those
 * without it are rewound and returned as a plain FILE*, the others are
 * reopened through zlib (or "gzip" via popen when HAVE_POPEN is set).
 * Streams opened for writing or appending cannot be sniffed and are
 * returned as plain files.
 *
 * Returns a zfp pointer on success, to be released with zfclose();
 *         NULL on failure
 */
zfp *zfopen(const char *path, const char *mode) {
    char path2[1024];
    zfp *zf;
    int force_gz;

    if (!(zf = (zfp *)malloc(sizeof(*zf))))
        return NULL;
    zf->fp = NULL;
    zf->gz = NULL;

    /* Do not look at mode[1] when mode is the empty string */
    force_gz = (mode[0] == 'z' || (mode[0] != '\0' && mode[1] == 'z'));

    /* Try normal fopen */
    if (!force_gz && NULL != (zf->fp = fopen(path, mode))) {
        unsigned char magic[2];

        /* The magic number can only be read back from a readable stream */
        if (mode[0] != 'r')
            return zf;

        if (2 != fread(magic, 1, 2, zf->fp)) {
            fclose(zf->fp);
            free(zf);
            return NULL;
        }
        if (!(magic[0] == 0x1f &&
              magic[1] == 0x8b)) {
            /* Uncompressed: undo the two byte read */
            if (0 != fseeko(zf->fp, 0, SEEK_SET)) {
                perror(path);
                fclose(zf->fp);
                free(zf);
                return NULL;
            }
            return zf;
        }

        fclose(zf->fp);
        zf->fp = NULL;
    }

#ifdef HAVE_POPEN
    /*
     * I've no idea why, but gzgets is VERY slow, maybe because it handles
     * arbitrary seeks.
     * popen to gzip -cd is 3 times faster though.
     *
     * NOTE(review): 'path' is pasted into a shell command unescaped, so
     * this branch must not be enabled for untrusted file names.  Streams
     * obtained from popen should also be closed with pclose, which
     * zfclose does not do.
     */
    if (*mode == 'w') {
        snprintf(path2, sizeof(path2), "gzip > %.*s", 1000, path);
        if (NULL != (zf->fp = popen(path2, "w")))
            return zf;
    } else {
        if (access(path, R_OK) == 0) {
            snprintf(path2, sizeof(path2), "gzip -cd < %.*s", 1000, path);
            if (NULL != (zf->fp = popen(path2, "r")))
                return zf;
        }

        snprintf(path2, sizeof(path2), "gzip -cd < %.*s.gz", 1000, path);
        if (NULL != (zf->fp = popen(path2, "r")))
            return zf;
    }

    printf("Failed on %s\n", path);
#else
    /* Gzopen instead */
    if ((zf->gz = gzopen(path, mode)))
        return zf;

    snprintf(path2, sizeof(path2), "%.*s.gz", 1020, path);
    if ((zf->gz = gzopen(path2, mode)))
        return zf;
#endif

    perror(path);

    free(zf);
    return NULL;
}
/*
 * Closes whichever underlying handle is in use, then frees the zfp
 * wrapper itself.  Returns the result of fclose or gzclose.
 */
int zfclose(zfp *zf) {
    int status;

    if (zf->fp)
        status = fclose(zf->fp);
    else
        status = gzclose(zf->gz);

    free(zf);
    return status;
}
htslib-1.2.1/cram/zfio.h 0000664 0000000 0000000 00000003745 12464172677 0015073 0 ustar 00root root 0000000 0000000 /*
Copyright (c) 2009-2013 Genome Research Ltd.
Author: James Bonfield
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _ZFIO_H_
#define _ZFIO_H_
#include
#include
/*
* Either a gzFile or a FILE.
*
* The zf* functions test 'fp' first: when it is non-NULL the stdio call is
* used, otherwise the zlib call is made on 'gz'.
*/
typedef struct {
FILE *fp; /* stdio stream, or NULL when the gz handle is in use */
gzFile gz; /* zlib handle, used only when fp is NULL */
} zfp;
/* Current offset via ftello; -1 when the zfp is not a plain FILE. */
off_t zftello(zfp *zf);
/* Seek via fseeko; -1 when the zfp is not a plain FILE. */
int zfseeko(zfp *zf, off_t offset, int whence);
/* Line read via fgets or gzgets, whichever handle is open. */
char *zfgets(char *line, int size, zfp *zf);
/* String write via fputs or gzputs, whichever handle is open. */
int zfputs(char *line, zfp *zf);
/* Opens 'path' as a plain or gzip file; NULL on failure. */
zfp *zfopen(const char *path, const char *mode);
/* Closes the underlying handle and frees 'zf'. */
int zfclose(zfp *zf);
/* Returns the next character without consuming it. */
int zfpeek(zfp *zf);
/* feof or gzeof, whichever handle is open. */
int zfeof(zfp *zf);
#endif /* _ZFIO_H_ */
htslib-1.2.1/faidx.5 0000664 0000000 0000000 00000011274 12464172677 0014206 0 ustar 00root root 0000000 0000000 '\" t
.TH faidx 5 "August 2013" "htslib" "Bioinformatics formats"
.SH NAME
faidx \- an index enabling random access to FASTA files
.\"
.\" Copyright (C) 2013 Genome Research Ltd.
.\"
.\" Author: John Marshall
.\"
.\" Permission is hereby granted, free of charge, to any person obtaining a
.\" copy of this software and associated documentation files (the "Software"),
.\" to deal in the Software without restriction, including without limitation
.\" the rights to use, copy, modify, merge, publish, distribute, sublicense,
.\" and/or sell copies of the Software, and to permit persons to whom the
.\" Software is furnished to do so, subject to the following conditions:
.\"
.\" The above copyright notice and this permission notice shall be included in
.\" all copies or substantial portions of the Software.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
.\" DEALINGS IN THE SOFTWARE.
.\"
.SH SYNOPSIS
.IR file.fa .fai,
.IR file.fasta .fai
.SH DESCRIPTION
Using an \fBfai index\fP file in conjunction with a FASTA file containing
reference sequences enables efficient access to arbitrary regions within
those reference sequences.
The index file typically has the same filename as the corresponding FASTA
file, with \fB.fai\fP appended.
.P
An \fBfai index\fP file is a text file consisting of lines each with
five TAB-delimited columns:
.TS
lbl.
NAME Name of this reference sequence
LENGTH Total length of this reference sequence, in bases
OFFSET Offset within the FASTA file of this sequence's first base
LINEBASES The number of bases on each line
LINEWIDTH The number of bytes in each line, including the newline
.TE
.P
The \fBNAME\fP and \fBLENGTH\fP columns contain the same
data as would appear in the \fBSN\fP and \fBLN\fP fields of a
SAM \fB@SQ\fP header for the same reference sequence.
.P
The \fBOFFSET\fP column contains the offset within the FASTA file, in bytes
starting from zero, of the first base of this reference sequence, i.e., of
the character following the newline at the end of the "\fB>\fP" header line.
Typically the lines of a \fBfai index\fP file appear in the order in which the
reference sequences appear in the FASTA file, so \fB.fai\fP files are typically
sorted according to this column.
.P
The \fBLINEBASES\fP column contains the number of bases in each of the sequence
lines that form the body of this reference sequence, apart from the final line
which may be shorter.
The \fBLINEWIDTH\fP column contains the number of \fIbytes\fP in each of
the sequence lines (except perhaps the final line), thus differing from
\fBLINEBASES\fP in that it also counts the bytes forming the line terminator.
.SS FASTA Files
In order to be indexed with \fBsamtools faidx\fP, a FASTA file must be a text
file of the form
.LP
.RS
.RI > name
.RI [ description ...]
.br
ATGCATGCATGCATGCATGCATGCATGCAT
.br
GCATGCATGCATGCATGCATGCATGCATGC
.br
ATGCAT
.br
.RI > name
.RI [ description ...]
.br
ATGCATGCATGCAT
.br
GCATGCATGCATGC
.br
[...]
.RE
.LP
In particular, each reference sequence must be "well-formatted", i.e., all
of its sequence lines must be the same length, apart from the final sequence
line which may be shorter.
(While this sequence line length must be the same within each sequence,
it may vary between different reference sequences in the same FASTA file.)
.P
This also means that although the FASTA file may have Unix- or Windows-style
or other line termination, the newline characters present must be consistent,
at least within each reference sequence.
.P
The \fBsamtools\fP implementation uses the first word of the "\fB>\fP" header
line text (i.e., up to the first whitespace character) as the \fBNAME\fP column.
At present, there may be no whitespace between the
">" character and the \fIname\fP.
.SH EXAMPLE
For example, given this FASTA file
.LP
.RS
>one
.br
ATGCATGCATGCATGCATGCATGCATGCAT
.br
GCATGCATGCATGCATGCATGCATGCATGC
.br
ATGCAT
.br
>two another chromosome
.br
ATGCATGCATGCAT
.br
GCATGCATGCATGC
.br
.RE
.LP
formatted with Unix-style (LF) line termination, the corresponding fai index
would be
.RS
.TS
lnnnn.
one 66 5 30 31
two 28 98 14 15
.TE
.RE
.LP
If the FASTA file were formatted with Windows-style (CR-LF) line termination,
the fai index would be
.RS
.TS
lnnnn.
one 66 6 30 32
two 28 103 14 16
.TE
.RE
.SH SEE ALSO
.IR samtools (1)
.TP
http://en.wikipedia.org/wiki/FASTA_format
Further description of the FASTA format
htslib-1.2.1/faidx.c 0000664 0000000 0000000 00000033211 12464172677 0014257 0 ustar 00root root 0000000 0000000 /* faidx.c -- FASTA random access.
Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include
#include
#include
#include
#include
#include "htslib/bgzf.h"
#include "htslib/faidx.h"
#include "htslib/hfile.h"
#include "htslib/khash.h"
/* Index record for one reference sequence (one line of a .fai file). */
typedef struct {
int32_t line_len, line_blen; // bytes per sequence line including the newline; bases per sequence line
int64_t len; // total length of the sequence, in bases
uint64_t offset; // offset in the FASTA file of the sequence's first base
} faidx1_t;
// Declares the hash table type 's' mapping a sequence name to its faidx1_t.
KHASH_MAP_INIT_STR(s, faidx1_t)
/* An in-memory FASTA index together with the FASTA file handle. */
struct __faidx_t {
BGZF *bgzf; // FASTA file handle; closed by fai_destroy when non-NULL
int n, m; // number of names stored; allocated capacity of name[]
char **name; // sequence names in insertion order, each allocated by strdup
khash_t(s) *hash; // name -> faidx1_t lookup table
};
// Rounds x up, in place, to the next power of two (a power of two is left
// unchanged).  x is evaluated several times, so it must be a plain lvalue.
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/*
 * Appends one sequence record to the index.
 *
 * idx        index to extend
 * name       sequence name; a private copy is stored
 * len        sequence length in bases
 * line_len   bytes per sequence line, including the line terminator
 * line_blen  bases per sequence line
 * offset     file offset of the first base
 *
 * When 'name' is already present its record is overwritten, but the name
 * is still appended to the ordered name list.
 *
 * Returns 0 on success;
 *        -1 if memory could not be allocated, leaving idx unchanged
 */
static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
{
    khint_t k;
    int ret;
    faidx1_t t;
    char *name_copy;

    if (idx->n == idx->m) {
        /* Grow through a temporary so that the old array survives failure */
        int new_m = idx->m ? idx->m << 1 : 16;
        char **new_name = (char**)realloc(idx->name, sizeof(char*) * new_m);
        if (!new_name) return -1;
        idx->name = new_name;
        idx->m = new_m;
    }

    name_copy = strdup(name);
    if (!name_copy) return -1;

    k = kh_put(s, idx->hash, name_copy, &ret);
    if (ret < 0) { /* a negative code is treated as insertion failure */
        free(name_copy);
        return -1;
    }

    idx->name[idx->n] = name_copy;
    t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
    kh_value(idx->hash, k) = t;
    ++idx->n;
    return 0;
}
/*
 * Scans a FASTA stream and builds its index in memory.
 *
 * 'state' tracks the position within the current sequence:
 *   0  inside a sequence whose lines so far all share one length
 *   1  just after a '>' header line, before the first sequence line
 *   2  a line of different length has been seen (allowed only as the
 *      final line of a sequence)
 *   3  a further line followed state 2; any base on it is an error
 *
 * Returns the new index on success (release it with fai_destroy);
 *         NULL if the input holds no sequence, is malformed, or memory
 *         runs out.
 */
faidx_t *fai_build_core(BGZF *bgzf)
{
    char *name = NULL;
    int c;
    int l_name = 0, m_name = 0;
    int line_len = -1, line_blen = -1, state = 0;
    int l1 = -1, l2 = -1;
    faidx_t *idx;
    uint64_t offset = 0;
    int64_t len = -1;

    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
    if (!idx) return NULL;
    idx->hash = kh_init(s);
    if (!idx->hash) {
        free(idx);
        return NULL;
    }

    while ( (c=bgzf_getc(bgzf))>=0 ) {
        if (c == '\n') { // an empty line
            if (state == 1) {
                offset = bgzf_utell(bgzf);
                continue;
            } else if ((state == 0 && len < 0) || state == 2) continue;
        }
        if (c == '>') { // fasta header
            if (len >= 0)
                fai_insert_index(idx, name, len, line_len, line_blen, offset);
            l_name = 0;
            while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
                if (m_name < l_name + 2) {
                    char *tmp;
                    m_name = l_name + 2;
                    kroundup32(m_name);
                    tmp = (char*)realloc(name, m_name);
                    if (!tmp) goto fail;
                    name = tmp;
                }
                name[l_name++] = c;
            }
            if (!name) {
                // The very first header has an empty name ("> text"), so
                // no buffer exists yet for the terminator written below.
                m_name = 16;
                name = (char*)malloc(m_name);
                if (!name) goto fail;
            }
            name[l_name] = '\0';
            if ( c<0 ) {
                fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
                goto fail;
            }
            // skip the remainder of the header line
            if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
            state = 1; len = 0;
            offset = bgzf_utell(bgzf);
        } else {
            if (state == 3) {
                fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
                goto fail;
            }
            if (state == 2) state = 3;
            l1 = l2 = 0; // l1: characters on the line, l2: bases on the line
            do {
                ++l1;
                if (isgraph(c)) ++l2;
            } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
            if (state == 3 && l2) {
                fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
                goto fail;
            }
            ++l1; len += l2; // the extra one counts the newline
            if (state == 1) line_len = l1, line_blen = l2, state = 0;
            else if (state == 0) {
                if (l1 != line_len || l2 != line_blen) state = 2;
            }
        }
    }

    if ( name )
        fai_insert_index(idx, name, len, line_len, line_blen, offset);
    else
    {
        // No header was seen at all; fai_destroy also releases the hash
        fai_destroy(idx);
        return NULL;
    }

    free(name);
    return idx;

 fail:
    free(name);
    fai_destroy(idx);
    return NULL;
}
/* Writes the index to fp as text, one line per sequence in insertion order:
 * name, length, offset, bases per line, bytes per line (tab separated). */
void fai_save(const faidx_t *fai, FILE *fp)
{
    int idx = 0;
    while (idx < fai->n) {
        khint_t slot = kh_get(s, fai->hash, fai->name[idx]);
        faidx1_t rec = kh_value(fai->hash, slot);
#ifdef _WIN32
        fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[idx], (int)rec.len, (long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
#else
        fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[idx], (int)rec.len, (long long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
#endif
        ++idx;
    }
}
/* Parses a text index from fp: one record per line holding a name followed
 * by length, offset, bases per line and bytes per line.
 *
 * Lines that do not carry all four numeric fields (including blank lines)
 * are reported on stderr and skipped instead of being inserted with
 * undefined values.
 *
 * Returns a newly allocated index whose bgzf member is left unset, or NULL
 * if memory could not be allocated.  Release it with fai_destroy().
 */
faidx_t *fai_read(FILE *fp)
{
    faidx_t *fai;
    char *buf, *p;
    int len, line_len, line_blen;
#ifdef _WIN32
    long offset;
#else
    long long offset;
#endif
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
    if (fai == NULL) return NULL;
    fai->hash = kh_init(s);
    buf = (char*)calloc(0x10000, 1);
    if (fai->hash == NULL || buf == NULL) {
        free(buf);
        if (fai->hash) kh_destroy(s, fai->hash);
        free(fai);
        return NULL;
    }
    // fgets() already returns NULL at end of file, so no feof() test is needed
    while (fgets(buf, 0x10000, fp)) {
        int n_fields;
        // the name is the leading run of printable characters
        for (p = buf; *p && isgraph((unsigned char)*p); ++p);
        if (*p == '\0') {
            // Nothing follows the name; stepping past the terminator would
            // parse stale bytes left in the buffer by a previous line.
            fprintf(stderr, "[fai_read] skipping malformed index line '%s'\n", buf);
            continue;
        }
        *p = 0; ++p;
#ifdef _WIN32
        n_fields = sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
#else
        n_fields = sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
#endif
        if (n_fields != 4) {
            fprintf(stderr, "[fai_read] skipping malformed index line '%s'\n", buf);
            continue;
        }
        fai_insert_index(fai, buf, len, line_len, line_blen, offset);
    }
    free(buf);
    return fai;
}
/* Releases an index: every stored name, the name array, the hash table, the
 * attached BGZF handle (if one is set) and the structure itself.
 * Passing NULL is allowed and does nothing, mirroring free(). */
void fai_destroy(faidx_t *fai)
{
    int i;
    if (fai == NULL) return;
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
    free(fai->name);
    kh_destroy(s, fai->hash);
    if (fai->bgzf) bgzf_close(fai->bgzf);
    free(fai);
}
/* Builds "<fn>.fai" for the FASTA file fn.  When the input is compressed a
 * "<fn>.gzi" block index is dumped as well.
 *
 * Returns 0 on success and -1 on any failure (unreadable input, malformed
 * FASTA, gzip instead of bgzip compression, or an index that cannot be
 * written).  Diagnostics are printed to stderr.
 */
int fai_build(const char *fn)
{
    char *str;
    BGZF *bgzf;
    FILE *fp;
    faidx_t *fai;
    int write_failed;
    // room for the ".fai" suffix plus the terminating NUL
    str = (char*)calloc(strlen(fn) + 5, 1);
    if (str == NULL) {
        fprintf(stderr, "[fai_build] out of memory\n");
        return -1;
    }
    sprintf(str, "%s.fai", fn);
    bgzf = bgzf_open(fn, "r");
    if ( !bgzf ) {
        fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
        free(str);
        return -1;
    }
    if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
    fai = fai_build_core(bgzf);
    if ( !fai )
    {
        if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
        bgzf_close(bgzf);   // do not leak the input handle on failure
        free(str);
        return -1;
    }
    if ( bgzf->is_compressed && bgzf_index_dump(bgzf, fn, ".gzi") < 0 ) {
        fprintf(stderr, "[fai_build] fail to write .gzi index for %s\n", fn);
        bgzf_close(bgzf);
        fai_destroy(fai); free(str);
        return -1;
    }
    bgzf_close(bgzf);
    fp = fopen(str, "wb");
    if ( !fp ) {
        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
        fai_destroy(fai); free(str);
        return -1;
    }
    fai_save(fai, fp);
    // fai_save() cannot report failures itself, so inspect the stream state
    write_failed = ferror(fp) != 0;
    if (fclose(fp) != 0) write_failed = 1;
    if (write_failed)
        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
    free(str);
    fai_destroy(fai);
    return write_failed? -1 : 0;
}
/* Opens the file named by the last path component of fn in the current
 * working directory, downloading it from fn first if no local copy exists.
 *
 * Returns a stream opened for reading, or NULL on failure.  A download that
 * fails part-way is removed again so that a later call does not mistake the
 * truncated file for a valid local copy.
 */
static FILE *download_and_open(const char *fn)
{
    const int buf_size = 1 * 1024 * 1024;
    uint8_t *buf;
    FILE *fp;
    hFILE *fp_remote;
    const char *url = fn;
    const char *p;
    ssize_t l;
    int failed = 0;
    // keep only the part after the last '/', if there is one
    p = strrchr(fn, '/');
    if (p) fn = p + 1;
    // First try to open a local copy
    fp = fopen(fn, "r");
    if (fp)
        return fp;
    // If failed, download from remote and open
    fp_remote = hopen(url, "rb");
    if (fp_remote == 0) {
        fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
        return NULL;
    }
    if ((fp = fopen(fn, "wb")) == 0) {
        fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
        hclose_abruptly(fp_remote);
        return NULL;
    }
    buf = (uint8_t*)calloc(buf_size, 1);
    if (buf == NULL) {
        fprintf(stderr, "[download_from_remote] out of memory\n");
        failed = 1;
    } else {
        while ((l = hread(fp_remote, buf, buf_size)) > 0) {
            if (fwrite(buf, 1, l, fp) != (size_t) l) { failed = 1; break; }
        }
        if (l < 0) failed = 1;   // read error on the remote stream
        free(buf);
    }
    if (fclose(fp) != 0) failed = 1;
    if (failed) {
        fprintf(stderr, "[download_from_remote] fail to download remote file %s\n", url);
        hclose_abruptly(fp_remote);
        remove(fn);
        return NULL;
    }
    if (hclose(fp_remote) != 0)
        fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url);
    return fopen(fn, "r");
}
/* Loads the index "<fn>.fai" and opens the FASTA file fn for random access.
 *
 * A remote index is fetched with download_and_open().  A missing local
 * index is built with fai_build().  For compressed input the "<fn>.gzi"
 * block index is loaded as well.
 *
 * Returns the ready-to-use index, or NULL on failure; release it with
 * fai_destroy().
 */
faidx_t *fai_load(const char *fn)
{
    char *str;
    FILE *fp;
    faidx_t *fai;
    // room for the ".fai" suffix plus the terminating NUL
    str = (char*)calloc(strlen(fn) + 5, 1);
    if (str == NULL) {
        fprintf(stderr, "[fai_load] out of memory\n");
        return 0;
    }
    sprintf(str, "%s.fai", fn);
    if (hisremote(str))
    {
        fp = download_and_open(str);
        if ( !fp )
        {
            fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
            free(str);
            return 0;
        }
    }
    else
        fp = fopen(str, "rb");
    if (fp == 0) {
        fprintf(stderr, "[fai_load] build FASTA index.\n");
        fai_build(fn);
        // a failed build is detected by the index still being unreadable
        fp = fopen(str, "rb");
        if (fp == 0) {
            fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
            free(str);
            return 0;
        }
    }
    fai = fai_read(fp);
    fclose(fp);
    free(str);
    if (fai == 0) {
        fprintf(stderr, "[fai_load] fail to read FASTA index.\n");
        return 0;
    }
    fai->bgzf = bgzf_open(fn, "rb");
    if (fai->bgzf == 0) {
        fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
        fai_destroy(fai);   // do not leak the index that was just read
        return 0;
    }
    if ( fai->bgzf->is_compressed==1 )
    {
        if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
        {
            fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
            fai_destroy(fai);
            return NULL;
        }
    }
    return fai;
}
/* Fetches the bases of a region given as "name", "name:beg" or
 * "name:beg-end" (1-based, commas and whitespace ignored).
 *
 * If the text after the last colon is not a valid interval, or the part
 * before the colon is not a known sequence, the whole string is tried as a
 * sequence name.  Coordinates are clamped to the sequence.
 *
 * On success returns a malloc'ed NUL-terminated string and stores its
 * length in *len; the caller frees it.  On failure returns NULL with *len
 * set to:
 *    0  region string contained a colon but no sequence matched
 *   -2  plain name not found
 *   -1  seek failure or out of memory
 */
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
    char *s;
    int c, i, l, k, name_end;
    khiter_t iter;
    faidx1_t val;
    khash_t(s) *h;
    int beg, end;
    int ret;
    uint64_t seek_pos;

    beg = end = -1;
    h = fai->hash;
    name_end = l = strlen(str);
    s = (char*)malloc(l+1);
    if (s == NULL) {
        *len = -1;
        fprintf(stderr, "[fai_fetch] Error: out of memory\n");
        return NULL;
    }
    // remove space
    for (i = k = 0; i < l; ++i)
        if (!isspace((unsigned char)str[i])) s[k++] = str[i];
    s[k] = 0; l = k;
    // determine the sequence name
    for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
    if (i >= 0) name_end = i;
    if (name_end < l) { // check if this is really the end
        int n_hyphen = 0;
        for (i = name_end + 1; i < l; ++i) {
            if (s[i] == '-') ++n_hyphen;
            else if (!isdigit((unsigned char)s[i]) && s[i] != ',') break;
        }
        if (i < l || n_hyphen > 1) name_end = l; // malformed region string; then take str as the name
        s[name_end] = 0;
        iter = kh_get(s, h, s);
        if (iter == kh_end(h)) { // cannot find the sequence name
            iter = kh_get(s, h, str); // try str as the name
            if (iter == kh_end(h)) {
                *len = 0;
                free(s); return 0;
            } else s[name_end] = ':', name_end = l;
        }
    } else iter = kh_get(s, h, str);
    if(iter == kh_end(h)) {
        fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
        free(s);
        *len = -2;
        return 0;
    }
    val = kh_value(h, iter);
    // parse the interval
    if (name_end < l) {
        // squeeze the thousands separators out of the interval text
        for (i = k = name_end + 1; i < l; ++i)
            if (s[i] != ',') s[k++] = s[i];
        s[k] = 0;
        beg = atoi(s + name_end + 1);
        for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
        end = i < k? atoi(s + i + 1) : val.len;
        if (beg > 0) --beg; // convert to 0-based
    } else beg = 0, end = val.len;
    // A negative coordinate would become a huge unsigned file offset below
    if (beg < 0) beg = 0;
    if (end < 0) end = 0;
    if (beg >= val.len) beg = val.len;
    if (end >= val.len) end = val.len;
    if (beg > end) beg = end;
    free(s);
    // now retrieve the sequence
    seek_pos = val.offset;
    if (val.line_blen > 0)  // also guards against division by zero
        seek_pos += (uint64_t)(beg / val.line_blen) * val.line_len + beg % val.line_blen;
    ret = bgzf_useek(fai->bgzf, seek_pos, SEEK_SET);
    if ( ret<0 )
    {
        *len = -1;
        fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
        return NULL;
    }
    l = 0;
    s = (char*)malloc(end - beg + 2);
    if (s == NULL) {
        *len = -1;
        fprintf(stderr, "[fai_fetch] Error: out of memory\n");
        return NULL;
    }
    // copy printable characters only, which drops the line terminators
    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
        if (isgraph(c)) s[l++] = c;
    s[l] = '\0';
    *len = l;
    return s;
}
/* Returns the number of sequences stored in the index. */
int faidx_fetch_nseq(const faidx_t *fai)
{
    const int n_sequences = fai->n;
    return n_sequences;
}
/* Returns the number of sequences stored in the index. */
int faidx_nseq(const faidx_t *fai)
{
    const int n_sequences = fai->n;
    return n_sequences;
}
/* Returns the name of the i-th sequence.  No range check is performed on i;
 * the returned pointer refers to storage held by the index. */
const char *faidx_iseq(const faidx_t *fai, int i)
{
    const char *seq_name = fai->name[i];
    return seq_name;
}
/* Returns the length of the sequence named seq, or -1 if the index has no
 * sequence of that name. */
int faidx_seq_len(const faidx_t *fai, const char *seq)
{
    khint_t slot = kh_get(s, fai->hash, seq);
    if (slot != kh_end(fai->hash))
        return kh_val(fai->hash, slot).len;
    return -1;
}
/* Fetches bases p_beg_i..p_end_i (0-based, both inclusive) of sequence
 * c_name.  Coordinates outside the sequence are clamped to it.
 *
 * On success returns a malloc'ed NUL-terminated string and stores its
 * length in *len; the caller frees it.  A sequence of length zero yields an
 * empty string.  On failure returns NULL with *len set to -2 (unknown
 * sequence) or -1 (seek failure or out of memory).
 */
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
{
    int l, c;
    int ret;
    khiter_t iter;
    faidx1_t val;
    char *seq=NULL;
    uint64_t seek_pos;

    // Adjust position
    iter = kh_get(s, fai->hash, c_name);
    if (iter == kh_end(fai->hash))
    {
        *len = -2;
        fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
        return NULL;
    }
    val = kh_value(fai->hash, iter);
    if (val.len <= 0) {
        // Nothing to fetch.  The clamping below would otherwise produce
        // begin == end == -1 and read a byte from before the sequence.
        seq = (char*)malloc(1);
        if (seq == NULL) {
            *len = -1;
            fprintf(stderr, "[fai_fetch_seq] Error: out of memory\n");
            return NULL;
        }
        seq[0] = '\0';
        *len = 0;
        return seq;
    }
    if(p_end_i < p_beg_i) p_beg_i = p_end_i;
    if(p_beg_i < 0) p_beg_i = 0;
    else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
    if(p_end_i < 0) p_end_i = 0;
    else if(val.len <= p_end_i) p_end_i = val.len - 1;
    // Now retrieve the sequence
    seek_pos = val.offset;
    if (val.line_blen > 0)  // also guards against division by zero
        seek_pos += (uint64_t)(p_beg_i / val.line_blen) * val.line_len + p_beg_i % val.line_blen;
    ret = bgzf_useek(fai->bgzf, seek_pos, SEEK_SET);
    if ( ret<0 )
    {
        *len = -1;
        fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
        return NULL;
    }
    l = 0;
    seq = (char*)malloc(p_end_i - p_beg_i + 2);
    if (seq == NULL) {
        *len = -1;
        fprintf(stderr, "[fai_fetch_seq] Error: out of memory\n");
        return NULL;
    }
    // copy printable characters only, which drops the line terminators
    while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
        if (isgraph(c)) seq[l++] = c;
    seq[l] = '\0';
    *len = l;
    return seq;
}
/* Returns 1 if the index contains a sequence named seq, otherwise 0. */
int faidx_has_seq(const faidx_t *fai, const char *seq)
{
    khiter_t slot = kh_get(s, fai->hash, seq);
    return (slot != kh_end(fai->hash))? 1 : 0;
}
htslib-1.2.1/hfile.c 0000664 0000000 0000000 00000035523 12464172677 0014263 0 ustar 00root root 0000000 0000000 /* hfile.c -- buffered low-level input/output streams.
Copyright (C) 2013-2015 Genome Research Ltd.
Author: John Marshall
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#include
#include
#include
#include
#include "htslib/hfile.h"
#include "hfile_internal.h"
/* hFILE fields are used as follows:
char *buffer; // Pointer to the start of the I/O buffer
char *begin; // First not-yet-read character / unused position
char *end; // First unfilled/unfillable position
char *limit; // Pointer to the first position past the buffer
const hFILE_backend *backend; // Methods to refill/flush I/O buffer
off_t offset; // Offset within the stream of buffer position 0
int at_eof:1; // For reading, whether EOF has been seen
int has_errno; // Error number from the last failure on this stream
For reading, begin is the first unread character in the buffer and end is the
first unfilled position:
-----------ABCDEFGHIJKLMNO---------------
^buffer ^begin ^end ^limit
For writing, begin is the first unused position and end is unused so remains
equal to buffer:
ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
^buffer ^begin ^limit
^end
Thus if begin > end then there is a non-empty write buffer, if begin < end
then there is a non-empty read buffer, and if begin == end then both buffers
are empty. In all cases, the stream's file position indicator corresponds
to the position pointed to by begin. */
/* Allocates a stream structure of struct_size bytes together with an I/O
 * buffer and resets the common hFILE fields.  A capacity of 0 selects 32768
 * bytes; buffers of streams whose mode contains 'r' are capped at 32768.
 * Returns NULL if either allocation fails. */
hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
{
    hFILE *stream = (hFILE *) malloc(struct_size);

    if (stream != NULL) {
        if (capacity == 0) capacity = 32768;
        // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
        if (capacity > 32768 && strchr(mode, 'r')) capacity = 32768;

        stream->buffer = (char *) malloc(capacity);
        if (stream->buffer != NULL) {
            stream->begin = stream->buffer;
            stream->end = stream->buffer;
            stream->limit = stream->buffer + capacity;
            stream->offset = 0;
            stream->at_eof = 0;
            stream->has_errno = 0;
            return stream;
        }
    }

    hfile_destroy(stream);
    return NULL;
}
/* Frees the stream's buffer and the stream itself; NULL is accepted.
 * errno is left exactly as it was on entry. */
void hfile_destroy(hFILE *fp)
{
    const int saved_errno = errno;
    if (fp != NULL) {
        free(fp->buffer);
        free(fp);
    }
    errno = saved_errno;
}
/* True when unflushed written data is pending, i.e. begin lies past end. */
static inline int writebuffer_is_nonempty(hFILE *fp)
{
    return fp->end < fp->begin;
}
/* Performs a single backend read into the free tail of the read buffer,
   after first shifting any unread bytes down to the start of the buffer.
   Returns how many new bytes arrived (0 at EOF or when the buffer is
   already full) or a negative value on error. */
static ssize_t refill_buffer(hFILE *fp)
{
    ssize_t nread = 0;

    // Compact: slide the unread region to the front of the buffer
    if (fp->begin > fp->buffer) {
        size_t unread = fp->end - fp->begin;
        fp->offset += fp->begin - fp->buffer;
        memmove(fp->buffer, fp->begin, unread);
        fp->begin = fp->buffer;
        fp->end = fp->buffer + unread;
    }

    // Top up fp->[end,limit) unless EOF was seen or there is no room left
    if (!fp->at_eof && fp->end != fp->limit) {
        nread = fp->backend->read(fp, fp->end, fp->limit - fp->end);
        if (nread < 0) { fp->has_errno = errno; return nread; }
        if (nread == 0) fp->at_eof = 1;
    }

    fp->end += nread;
    return nread;
}
/* Slow path of hgetc(), entered with an empty buffer: refills it and
   returns the next character, or EOF if nothing could be read. */
int hgetc2(hFILE *fp)
{
    if (refill_buffer(fp) <= 0) return EOF;
    return (unsigned char) *(fp->begin++);
}
/* Copies up to nbytes of upcoming input into buffer without consuming it.
 * Returns the number of bytes copied (fewer than nbytes if refilling stops
 * yielding data) or a negative value on a read error. */
ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
{
    size_t avail = fp->end - fp->begin;

    while (avail < nbytes) {
        ssize_t got = refill_buffer(fp);
        if (got < 0) return got;
        if (got == 0) break;
        avail += got;
    }

    if (nbytes < avail) avail = nbytes;
    memcpy(buffer, fp->begin, avail);
    return avail;
}
/* Slow path of hread(), entered with an empty buffer after nread bytes have
   already been delivered to destv.  Returns the total number of bytes
   delivered, or a negative value on error. */
ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
{
    const size_t capacity = fp->limit - fp->buffer;
    char *out = (char *) destv + nread;
    size_t wanted = nbytes - nread;
    size_t total = nread;

    // Requests of at least half the buffer size bypass the buffer
    while (!fp->at_eof && wanted * 2 >= capacity) {
        ssize_t got = fp->backend->read(fp, out, wanted);
        if (got < 0) { fp->has_errno = errno; return got; }
        if (got == 0) fp->at_eof = 1;
        fp->offset += got;
        out += got;
        wanted -= got;
        total += got;
    }

    // Whatever is left is served through the buffer
    while (wanted > 0 && !fp->at_eof) {
        size_t chunk;
        ssize_t status = refill_buffer(fp);
        if (status < 0) return status;

        chunk = fp->end - fp->begin;
        if (chunk > wanted) chunk = wanted;
        memcpy(out, fp->begin, chunk);
        fp->begin += chunk;
        out += chunk;
        wanted -= chunk;
        total += chunk;
    }

    return total;
}
/* Pushes the pending write data fp->[buffer,begin) through the backend and
   leaves the buffer empty.  Returns 0 on success, negative on error. */
static ssize_t flush_buffer(hFILE *fp)
{
    const char *cursor;

    for (cursor = fp->buffer; cursor < fp->begin; ) {
        ssize_t written = fp->backend->write(fp, cursor, fp->begin - cursor);
        if (written < 0) { fp->has_errno = errno; return written; }
        cursor += written;
        fp->offset += written;
    }

    fp->begin = fp->buffer; // Leave the buffer empty
    return 0;
}
/* Writes out buffered data and then invokes the backend's flush method.
 * Returns 0 on success or EOF on failure. */
int hflush(hFILE *fp)
{
    if (flush_buffer(fp) >= 0) {
        if (fp->backend->flush(fp) >= 0) return 0;
        fp->has_errno = errno;
    }
    return EOF;
}
/* Slow path of hputc(), entered with a full buffer: flushes it and then
   stores c.  Returns c, or EOF if the flush failed. */
int hputc2(int c, hFILE *fp)
{
    if (flush_buffer(fp) >= 0) {
        *fp->begin = c;
        fp->begin++;
        return c;
    }
    return EOF;
}
/* Slow path of hwrite() and hputs2(), entered with a full buffer after
   ncopied source bytes have already been buffered.  Returns totalbytes on
   success or a negative value on error. */
ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
{
    const size_t capacity = fp->limit - fp->buffer;
    const char *cursor = (const char *) srcv + ncopied;
    size_t left = totalbytes - ncopied;
    ssize_t status;

    status = flush_buffer(fp);
    if (status < 0) return status;

    // Blocks of at least half the buffer size go straight to the backend
    while (left * 2 >= capacity) {
        ssize_t written = fp->backend->write(fp, cursor, left);
        if (written < 0) { fp->has_errno = errno; return written; }
        fp->offset += written;
        cursor += written;
        left -= written;
    }

    // The small remainder is simply buffered
    memcpy(fp->begin, cursor, left);
    fp->begin += left;

    return totalbytes;
}
/* Slow path of hputs(), entered with a full buffer; delegates to hwrite2()
   and maps its result to 0 (success) or EOF (failure). */
int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
{
    if (hwrite2(fp, text, totalbytes, ncopied) < 0) return EOF;
    return 0;
}
/* Repositions the stream.  Pending written data is flushed first; buffered
 * read data is discarded once the backend seek has succeeded.  Returns the
 * new position or a negative value on error. */
off_t hseek(hFILE *fp, off_t offset, int whence)
{
    off_t newpos;

    if (writebuffer_is_nonempty(fp)) {
        int ret = flush_buffer(fp);
        if (ret < 0) return ret;
    }
    else if (whence == SEEK_CUR) {
        // The backend's physical position is at end (because of read-ahead)
        // while the stream position is at begin, so rebase the relative
        // offset onto the backend's position.
        offset -= fp->end - fp->begin;
    }

    newpos = fp->backend->seek(fp, offset, whence);
    if (newpos < 0) { fp->has_errno = errno; return newpos; }

    // Seeking succeeded, so discard any non-empty read buffer
    fp->begin = fp->buffer;
    fp->end = fp->buffer;
    fp->at_eof = 0;
    fp->offset = newpos;
    return newpos;
}
/* Flushes pending output, closes the backend and frees the stream.
 * Returns 0 on success.  If the stream had a recorded error, or flushing or
 * closing failed, returns EOF with errno set to the latest such error. */
int hclose(hFILE *fp)
{
    int err = fp->has_errno;

    if (writebuffer_is_nonempty(fp)) {
        if (hflush(fp) < 0) err = fp->has_errno;
    }
    if (fp->backend->close(fp) < 0) err = errno;
    hfile_destroy(fp);

    if (err == 0) return 0;
    errno = err;
    return EOF;
}
/* Closes the backend without flushing and frees the stream.  Any close
 * error is ignored and errno is restored to its value on entry. */
void hclose_abruptly(hFILE *fp)
{
    const int saved_errno = errno;
    (void) fp->backend->close(fp); /* Ignore subsequent errors */
    hfile_destroy(fp);
    errno = saved_errno;
}
/***************************
* File descriptor backend *
***************************/
#include
#include
#include
#include
#ifdef _WIN32
#define HAVE_CLOSESOCKET
#endif
/* For Unix, it doesn't matter whether a file descriptor is a socket.
   However Windows insists on send()/recv() and its own closesocket()
   being used when fd happens to be a socket. */
typedef struct {
    hFILE base;             // common stream state; must remain the first member
    int fd;                 // underlying file descriptor
    // Declared unsigned: a signed one-bit field cannot portably hold the
    // value 1 that hdopen() assigns to it.
    unsigned is_socket:1;   // non-zero when fd must be treated as a socket
} hFILE_fd;
/* Backend read: recv() for sockets, read() otherwise, retried for as long
 * as the call is interrupted (EINTR). */
static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t n;

    for (;;) {
        if (fp->is_socket) n = recv(fp->fd, buffer, nbytes, 0);
        else n = read(fp->fd, buffer, nbytes);
        if (n >= 0 || errno != EINTR) break;
    }
    return n;
}
/* Backend write: send() for sockets, write() otherwise, retried for as long
 * as the call is interrupted (EINTR). */
static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    ssize_t n;

    for (;;) {
        if (fp->is_socket) n = send(fp->fd, buffer, nbytes, 0);
        else n = write(fp->fd, buffer, nbytes);
        if (n >= 0 || errno != EINTR) break;
    }
    return n;
}
/* Backend seek: forwards directly to lseek() on the descriptor. */
static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
{
    hFILE_fd *fdfile = (hFILE_fd *) fpv;
    off_t result = lseek(fdfile->fd, offset, whence);
    return result;
}
/* Backend flush: synchronises the descriptor with fdatasync() when
 * HAVE_FDATASYNC is defined, fsync() otherwise.  Interrupted calls are
 * retried; EINVAL and ENOTSUP are treated as success. */
static int fd_flush(hFILE *fpv)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    int ret;

    for (;;) {
#ifdef HAVE_FDATASYNC
        ret = fdatasync(fp->fd);
#else
        ret = fsync(fp->fd);
#endif
        if (ret >= 0) break;
        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
        // and operation-not-supported errors (Mac OS X)
        if (errno == EINVAL || errno == ENOTSUP) { ret = 0; break; }
        if (errno != EINTR) break;
    }
    return ret;
}
/* Backend close: closes the descriptor (with closesocket() for sockets when
 * HAVE_CLOSESOCKET is defined), retrying while interrupted by EINTR. */
static int fd_close(hFILE *fpv)
{
    hFILE_fd *fp = (hFILE_fd *) fpv;
    int ret;

    for (;;) {
#ifdef HAVE_CLOSESOCKET
        if (fp->is_socket) ret = closesocket(fp->fd);
        else ret = close(fp->fd);
#else
        ret = close(fp->fd);
#endif
        if (ret >= 0 || errno != EINTR) break;
    }
    return ret;
}
/* Method table that hopen_fd() and hdopen() install on file-descriptor
   streams.  The entries are given positionally, so their order must match
   the member order of struct hFILE_backend (declared outside this file). */
static const struct hFILE_backend fd_backend =
{
fd_read, fd_write, fd_seek, fd_flush, fd_close
};
/* Returns the st_blksize value that fstat() reports for fd, or 0 when
 * fstat() fails. */
static size_t blksize(int fd)
{
    struct stat info;
    if (fstat(fd, &info) == 0)
        return info.st_blksize;
    return 0;
}
/* Opens filename with flags derived from mode and wraps the descriptor in a
 * stream.  Returns NULL on failure; errno from the failing call is
 * preserved and the descriptor is not leaked. */
static hFILE *hopen_fd(const char *filename, const char *mode)
{
    hFILE_fd *fp;
    int fd = open(filename, hfile_oflags(mode), 0666);
    if (fd < 0) return NULL;

    fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
    if (fp == NULL) {
        // keep the allocation failure's errno across close()
        int saved_errno = errno;
        (void) close(fd);
        errno = saved_errno;
        return NULL;
    }

    fp->fd = fd;
    fp->is_socket = 0;
    fp->base.backend = &fd_backend;
    return &fp->base;
}
/* Wraps an already open descriptor in a stream.  An 's' in mode marks the
 * descriptor as a socket.  Returns NULL if the stream cannot be allocated. */
hFILE *hdopen(int fd, const char *mode)
{
    hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
    if (fp != NULL) {
        fp->fd = fd;
        fp->is_socket = strchr(mode, 's')? 1 : 0;
        fp->base.backend = &fd_backend;
        return &fp->base;
    }
    return NULL;
}
/* Wraps standard input (when mode contains 'r') or standard output
 * (otherwise) in a stream. */
static hFILE *hopen_fd_stdinout(const char *mode)
{
    int fd;
    if (strchr(mode, 'r') != NULL) fd = STDIN_FILENO;
    else fd = STDOUT_FILENO;
    // TODO Set binary mode (for Windows)
    return hdopen(fd, mode);
}
/* Translates an fopen-style mode string into open(2) flags.  'r', 'w', 'a'
 * and '+' are recognised; every other character is ignored.  O_BINARY is
 * added on platforms that define it. */
int hfile_oflags(const char *mode)
{
    int acc_mode = 0, extra = 0;
    const char *p = mode;

    while (*p != '\0') {
        const char ch = *p++;
        if (ch == 'r') {
            acc_mode = O_RDONLY;
        } else if (ch == 'w') {
            acc_mode = O_WRONLY;
            extra |= O_CREAT | O_TRUNC;
        } else if (ch == 'a') {
            acc_mode = O_WRONLY;
            extra |= O_CREAT | O_APPEND;
        } else if (ch == '+') {
            acc_mode = O_RDWR;
        }
    }

#ifdef O_BINARY
    extra |= O_BINARY;
#endif

    return acc_mode | extra;
}
/*********************
* In-memory backend *
*********************/
/* Stream state for the read-only in-memory backend. */
typedef struct {
hFILE base;
// The caller's data; hopen_mem() stores the pointer without copying it
const char *buffer;
// length: strlen() of the data at open time; pos: offset of the next byte
// that mem_read() will hand out
size_t length, pos;
} hFILE_mem;
/* Backend read: copies up to nbytes from the current position of the
 * in-memory data and advances the position.  Returns the number of bytes
 * copied, which is 0 once the end of the data has been reached. */
static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
{
    hFILE_mem *fp = (hFILE_mem *) fpv;
    const size_t remaining = fp->length - fp->pos;
    const size_t count = (nbytes > remaining)? remaining : nbytes;

    memcpy(buffer, fp->buffer + fp->pos, count);
    fp->pos += count;
    return count;
}
/* Backend seek for in-memory data.  The target must lie within
 * [0, length]; otherwise, or for an unknown whence, fails with EINVAL and
 * returns -1.  Returns the new position on success. */
static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
{
    hFILE_mem *fp = (hFILE_mem *) fpv;
    size_t magnitude = (offset >= 0)? offset : -offset;
    size_t base;
    int out_of_range;

    if (whence == SEEK_SET) base = 0;
    else if (whence == SEEK_CUR) base = fp->pos;
    else if (whence == SEEK_END) base = fp->length;
    else { errno = EINVAL; return -1; }

    // Moving backwards may not pass the start; forwards may not pass the end
    if (offset < 0) out_of_range = magnitude > base;
    else out_of_range = magnitude > fp->length - base;

    if (out_of_range) {
        errno = EINVAL;
        return -1;
    }

    fp->pos = base + offset;
    return fp->pos;
}
/* Backend close: the in-memory backend holds nothing to release, so this
 * always reports success. */
static int mem_close(hFILE *fpv)
{
    (void) fpv;
    return 0;
}
/* Method table that hopen_mem() installs on in-memory streams.  Entries are
   positional.  The two NULL slots are presumably the write and flush
   methods (by analogy with fd_backend); hopen_mem() accepts read modes
   only.  TODO confirm against the declaration of struct hFILE_backend. */
static const struct hFILE_backend mem_backend =
{
mem_read, NULL, mem_seek, NULL, mem_close
};
/* Opens the NUL-terminated string data as a read-only stream.  The string
 * is referenced, not copied.  A mode without 'r' fails with EINVAL. */
static hFILE *hopen_mem(const char *data, const char *mode)
{
    hFILE_mem *fp;

    // TODO Implement write modes, which will require memory allocation
    if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; }

    fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0);
    if (fp != NULL) {
        fp->buffer = data;
        fp->length = strlen(data);
        fp->pos = 0;
        fp->base.backend = &mem_backend;
        return &fp->base;
    }
    return NULL;
}
/******************************
* hopen() backend dispatcher *
******************************/
/* Opens fname with the backend selected by its prefix: "http://" and
 * "ftp://" use the network backend, "irods:" the iRODS backend (only when
 * HAVE_IRODS is defined), "data:" the in-memory backend, "-" standard
 * input/output, and anything else a plain file. */
hFILE *hopen(const char *fname, const char *mode)
{
    if (strncmp(fname, "http://", 7) == 0) return hopen_net(fname, mode);
    if (strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode);
#ifdef HAVE_IRODS
    if (strncmp(fname, "irods:", 6) == 0) return hopen_irods(fname, mode);
#endif
    if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode);
    if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
    return hopen_fd(fname, mode);
}
/* Returns 1 if fname starts with "http://", "https://" or "ftp://" (or
 * "irods:" when HAVE_IRODS is defined), and 0 otherwise. */
int hisremote(const char *fname)
{
    // FIXME Make a new backend entry to return this
    static const char *const schemes[] = { "http://", "https://", "ftp://" };
    size_t i;

    for (i = 0; i < sizeof schemes / sizeof schemes[0]; i++) {
        if (strncmp(fname, schemes[i], strlen(schemes[i])) == 0) return 1;
    }
#ifdef HAVE_IRODS
    if (strncmp(fname, "irods:", 6) == 0) return 1;
#endif
    return 0;
}
htslib-1.2.1/hfile_internal.h 0000664 0000000 0000000 00000007013 12464172677 0016155 0 ustar 00root root 0000000 0000000 /* hfile_internal.h -- internal parts of low-level input/output streams.
Copyright (C) 2013-2015 Genome Research Ltd.
Author: John Marshall