pax_global_header00006660000000000000000000000064130714000400014477gustar00rootroot0000000000000052 comment=1d27c245276cc9b10c4b20e1fedc5e4adfcc0123 vcftools-0.1.15/000077500000000000000000000000001307140004000134225ustar00rootroot00000000000000vcftools-0.1.15/.gitignore000066400000000000000000000006641307140004000154200ustar00rootroot00000000000000# created by autoscan /autom4te.cache /autoscan.log /configure.scan # created by autoreconf -fi (includes autoconf, automake, libtool) Makefile.in /aclocal.m4 /autom4te.cache /compile /config.guess /config.h.in* /config.sub /configure /depcomp /install-sh /ltmain.sh /m4 /missing # created by configure Makefile /config.h /config.log /config.status /libtool /stamp-h1 .deps # created by make *.la *.lo *.o .libs vcftools *.tar.* vcftools-0.1.15/.tarball-version000066400000000000000000000000071307140004000165240ustar00rootroot000000000000000.1.15 vcftools-0.1.15/LICENSE000066400000000000000000000167441307140004000144430ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. 
A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. 
You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. 
A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. 
If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. vcftools-0.1.15/Makefile.am000066400000000000000000000004041307140004000154540ustar00rootroot00000000000000SUBDIRS = src EXTRA_DIST = LICENSE README.md examples # Create a '.tarball-version' file containing the version string # when creating a distribution tarball # (not needed when using git repository) dist-hook: @echo $(VERSION) > $(distdir)/.tarball-version vcftools-0.1.15/README.md000066400000000000000000000036101307140004000147010ustar00rootroot00000000000000# VCFtools A set of tools written in Perl and C++ for working with [VCF files](https://samtools.github.io/hts-specs/VCFv4.2.pdf), such as those generated by the [1000 Genomes Project](http://www.1000genomes.org/). Project website: https://vcftools.github.io/ License ------- The program package is released under the GNU Lesser General Public License version 3.0 (LGPLv3). See the `LICENSE` file for the complete LGPL license text. 
Credits ------- - Adam Auton (Binary Executable) - Petr Danecek (Perl Module) - Anthony Marcketta (Binary Executable) Building VCFtools ----------------- General help about the building process's configuration step can be acquired via: ``` ./configure --help ``` ### Build from Release Tarball ``` ./configure make make install ``` You may need `sudo` permissions to run `make install`. ### Build from GitHub ``` git clone https://github.com/vcftools/vcftools.git cd vcftools ./autogen.sh ./configure make make install ``` You many need `sudo` permissions to run `make install`. Documentation ------------- Documentation and usage examples can be found here: https://vcftools.github.io/examples.html A manual page is also available. If prefix is set to `/usr` or if `MANPATH` points to `$prefix/share/man`, you can access the manual page via: ``` man vcftools ``` Getting Help ------------ The best way to get help regarding VCFtools is to email the mailing list: vcftools-help@lists.sourceforge.net Citation -------- If you make use of VCFtools in your research, we would appreciate a citation of the following paper: > **The Variant Call Format and VCFtools**, Petr Danecek, Adam Auton, Goncalo Abecasis, Cornelis > A. Albers, Eric Banks, Mark A. DePristo, Robert Handsaker, Gerton Lunter, Gabor Marth, Stephen > T. Sherry, Gilean McVean, Richard Durbin and 1000 Genomes Project Analysis Group, > **Bioinformatics**, 2011 http://dx.doi.org/10.1093/bioinformatics/btr330 vcftools-0.1.15/autogen.sh000077500000000000000000000000321307140004000154160ustar00rootroot00000000000000#!/bin/sh autoreconf -fi vcftools-0.1.15/build-aux/000077500000000000000000000000001307140004000153145ustar00rootroot00000000000000vcftools-0.1.15/build-aux/git-version-gen000077500000000000000000000175711307140004000202720ustar00rootroot00000000000000#!/bin/sh # Print a version string. 
scriptversion=2012-12-31.23; # UTC # Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 # Free Software Foundation, Inc. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # This script is derived from GIT-VERSION-GEN from GIT: http://git.or.cz/. # It may be run two ways: # - from a git repository in which the "git describe" command below # produces useful output (thus requiring at least one signed tag) # - from a non-git-repo directory containing a .tarball-version file, which # presumes this script is invoked like "./git-version-gen .tarball-version". # In order to use intra-version strings in your project, you will need two # separate generated version string files: # # .tarball-version - present only in a distribution tarball, and not in # a checked-out repository. Created with contents that were learned at # the last time autoconf was run, and used by git-version-gen. Must not # be present in either $(srcdir) or $(builddir) for git-version-gen to # give accurate answers during normal development with a checked out tree, # but must be present in a tarball when there is no version control system. # Therefore, it cannot be used in any dependencies. GNUmakefile has # hooks to force a reconfigure at distribution time to get the value # correct, without penalizing normal development with extra reconfigures. # # .version - present in a checked-out repository and in a distribution # tarball. 
Usable in dependencies, particularly for files that don't # want to depend on config.h but do want to track version changes. # Delete this file prior to any autoconf run where you want to rebuild # files to pick up a version string change; and leave it stale to # minimize rebuild time after unrelated changes to configure sources. # # As with any generated file in a VC'd directory, you should add # /.version to .gitignore, so that you don't accidentally commit it. # .tarball-version is never generated in a VC'd directory, so needn't # be listed there. # # Use the following line in your configure.ac, so that $(VERSION) will # automatically be up-to-date each time configure is run (and note that # since configure.ac no longer includes a version string, Makefile rules # should not depend on configure.ac for version updates). # # AC_INIT([GNU project], # m4_esyscmd([build-aux/git-version-gen .tarball-version]), # [bug-project@example]) # # Then use the following lines in your Makefile.am, so that .version # will be present for dependencies, and so that .version and # .tarball-version will exist in distribution tarballs. # # EXTRA_DIST = $(top_srcdir)/.version # BUILT_SOURCES = $(top_srcdir)/.version # $(top_srcdir)/.version: # echo $(VERSION) > $@-t && mv $@-t $@ # dist-hook: # echo $(VERSION) > $(distdir)/.tarball-version me=$0 version="git-version-gen $scriptversion Copyright 2011 Free Software Foundation, Inc. There is NO warranty. You may redistribute this software under the terms of the GNU General Public License. For more information about these matters, see the files named COPYING." usage="\ Usage: $me [OPTION]... \$srcdir/.tarball-version [TAG-NORMALIZATION-SED-SCRIPT] Print a version string. Options: --prefix prefix of git tags (default 'v') --fallback fallback version to use if \"git --version\" fails --help display this help and exit --version output version information and exit Running without arguments will suffice in most cases." 
prefix=v fallback= while test $# -gt 0; do case $1 in --help) echo "$usage"; exit 0;; --version) echo "$version"; exit 0;; --prefix) shift; prefix="$1";; --fallback) shift; fallback="$1";; -*) echo "$0: Unknown option '$1'." >&2 echo "$0: Try '--help' for more information." >&2 exit 1;; *) if test "x$tarball_version_file" = x; then tarball_version_file="$1" elif test "x$tag_sed_script" = x; then tag_sed_script="$1" else echo "$0: extra non-option argument '$1'." >&2 exit 1 fi;; esac shift done if test "x$tarball_version_file" = x; then echo "$usage" exit 1 fi tag_sed_script="${tag_sed_script:-s/x/x/}" nl=' ' # Avoid meddling by environment variable of the same name. v= v_from_git= # First see if there is a tarball-only version file. # then try "git describe", then default. if test -f $tarball_version_file then v=`cat $tarball_version_file` || v= case $v in *$nl*) v= ;; # reject multi-line output [0-9]*) ;; *) v= ;; esac test "x$v" = x \ && echo "$0: WARNING: $tarball_version_file is missing or damaged" 1>&2 fi if test "x$v" != x then : # use $v # Otherwise, if there is at least one git commit involving the working # directory, and "git describe" output looks sensible, use that to # derive a version string. elif test "`git log -1 --pretty=format:x . 2>&1`" = x \ && v=`git describe --abbrev=4 --match="$prefix*" HEAD 2>/dev/null \ || git describe --abbrev=4 HEAD 2>/dev/null` \ && v=`printf '%s\n' "$v" | sed "$tag_sed_script"` \ && case $v in $prefix[0-9]*) ;; *) (exit 1) ;; esac then # Is this a new git that lists number of commits since the last # tag or the previous older version that did not? # Newer: v6.10-77-g0f8faeb # Older: v6.10-g0f8faeb case $v in *-*-*) : git describe is okay three part flavor ;; *-*) : git describe is older two part flavor # Recreate the number of commits and rewrite such that the # result is the same as if we were using the newer version # of git describe. 
vtag=`echo "$v" | sed 's/-.*//'` commit_list=`git rev-list "$vtag"..HEAD 2>/dev/null` \ || { commit_list=failed; echo "$0: WARNING: git rev-list failed" 1>&2; } numcommits=`echo "$commit_list" | wc -l` v=`echo "$v" | sed "s/\(.*\)-\(.*\)/\1-$numcommits-\2/"`; test "$commit_list" = failed && v=UNKNOWN ;; esac # Change the first '-' to a '.', so version-comparing tools work properly. # Remove the "g" in git describe's output string, to save a byte. v=`echo "$v" | sed 's/-/./;s/\(.*\)-g/\1-/'`; v_from_git=1 elif test "x$fallback" = x || git --version >/dev/null 2>&1; then v=UNKNOWN else v=$fallback fi v=`echo "$v" |sed "s/^$prefix//"` # Test whether to append the "-dirty" suffix only if the version # string we're using came from git. I.e., skip the test if it's "UNKNOWN" # or if it came from .tarball-version. if test "x$v_from_git" != x; then # Don't declare a version "dirty" merely because a time stamp has changed. git update-index --refresh > /dev/null 2>&1 dirty=`exec 2>/dev/null;git diff-index --name-only HEAD` || dirty= case "$dirty" in '') ;; *) # Append the suffix only if there isn't one already. case $v in *-dirty) ;; *) v="$v-dirty" ;; esac ;; esac fi # Omit the trailing newline, so that m4_esyscmd can use the result directly. echo "$v" | tr -d "$nl" # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-time-zone: "UTC" # time-stamp-end: "; # UTC" # End: vcftools-0.1.15/configure.ac000066400000000000000000000040531307140004000157120ustar00rootroot00000000000000# -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) AC_INIT([vcftools], [m4_esyscmd([build-aux/git-version-gen .tarball-version])], [https://github.com/vcftools/vcftools/issues]) AC_CONFIG_SRCDIR([src/cpp/vcftools.cpp]) AC_CONFIG_HEADERS([config.h]) # Automake invocation. AM_INIT_AUTOMAKE([foreign]) # Checks for programs. 
AC_PROG_CXX AC_PROG_CC AC_PROG_CPP # Checks for perl. AC_PATH_PROGS([PERL], [perl] [perl5], [false]) AS_IF([test "x$PERL" = "xfalse"],[ AC_MSG_ERROR([Perl not found; check your \$PATH.]) ]) pmdir_relative_path=`\ $PERL -MConfig \ -wle '($_ = $Config{installsitelib}) =~ s!^\Q$Config{siteprefix}/!!; \ print'` AC_ARG_WITH( [pmdir], AS_HELP_STRING( [--with-pmdir=DIR], [install Perl modules in DIR]), [PMDIR=${withval}], [PMDIR="$pmdir_relative_path"]) AC_SUBST([PMDIR]) # Checks for libraries. PKG_CHECK_MODULES([ZLIB], [zlib]) # Checks for header files. AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h stdint.h stdlib.h string.h sys/socket.h unistd.h]) # Checks for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL AC_C_INLINE AC_TYPE_INT16_T AC_TYPE_INT32_T AC_TYPE_INT64_T AC_TYPE_INT8_T AC_TYPE_OFF_T AC_TYPE_SIZE_T AC_TYPE_SSIZE_T AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT8_T # Checks for operating system services or capabilities. AC_SYS_LARGEFILE # Checks for library functions. AC_FUNC_ERROR_AT_LINE AC_FUNC_MALLOC AC_FUNC_REALLOC AC_CHECK_FUNCS([gethostbyaddr gethostbyname memset pow select socket sqrt strchr strdup strerror strstr strtol]) # Optional features. AC_ARG_ENABLE([pca], AS_HELP_STRING([--enable-pca], [enable PCA feature]), [pca=${enableval}], [pca=no]) AS_IF([test "x$pca" = "xyes"],[ AC_SEARCH_LIBS([dgeev_], [lapack]) ]) # Generate output. AC_CONFIG_FILES([Makefile src/Makefile src/cpp/Makefile src/perl/Makefile]) AC_OUTPUT vcftools-0.1.15/examples/000077500000000000000000000000001307140004000152405ustar00rootroot00000000000000vcftools-0.1.15/examples/annotate-test.vcf000066400000000000000000000032251307140004000205300ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT G 1806 q10 DP=35 1 104 . C . 1792 PASS DP=32 1 105 . C T 246 PASS DP=10 1 106 . C A 246 PASS DP=10 2 107 . C . 1806 q10 DP=35 2 108 . C . 1792 PASS DP=32 2 109 . C . 
628 q10 DP=21 2 110 . C G 1016 PASS DP=22 2 111 . C G 727 PASS DP=30 2 112 . C G 246 PASS DP=10 2 113 . C . 246 PASS DP=10 2 114 . T . 246 PASS DP=10 2 115 . T . 246 PASS DP=10 2 116 . T . 246 PASS DP=10 2 117 . T A 246 PASS DP=10 2 118 . T C 246 PASS DP=10 2 119 . TAAA T 246 PASS DP=10 2 124 . TA T 246 PASS DP=10 2 128 . T TA 246 PASS DP=10 2 130 . C A 246 PASS DP=10 2 131 . T A 246 PASS DP=10 2 132 . T A 246 PASS DP=10 2 133 . T A 246 PASS DP=10 2 134 . T A 246 PASS DP=10 2 135 . T C 246 PASS DP=10 2 136 . TT T 246 PASS DP=10;AF=0.1 2 138 . TT T 246 PASS DP=10;AF=0.2 2 140 . TT T 246 PASS DP=10;AF=0.1 17 12412 . CAGAGAGAGA CAGAGAGAGAGA 74.8 . INDEL;DP=5388;AF1=0.006576;CI95=0.005525,0.01105;DP4=2077,2367,21,22;MQ=47;FQ=74.8;PV4=0.88,1,0.34,0.021 17 12427 . G A 999 . DP=5557;AF1=0.06028;CI95=0.04972,0.07182;DP4=2461,2689,106,74;MQ=47;FQ=999;PV4=0.0038,1,2.6e-12,1 17 69284 . G A 14.6 . DP=3946;AF1=0.003468;CI95=0.002762,0.008287;DP4=1529,2177,7,9;MQ=44;FQ=14.6;PV4=1,0.035,0.098,1 17 69293 . GTTTCATTTC GTTTCTTTTCATTTC 999 . INDEL;DP=3568;AF1=0.1295;CI95=0.1077,0.1547;DP4=1014,1238,118,121;MQ=44;FQ=999;PV4=0.22,1,9.4e-54,1 vcftools-0.1.15/examples/annotate.out000066400000000000000000000025401307140004000176030ustar00rootroot00000000000000##fileformat=VCFv4.0 ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=5;GN=gene1;HM2 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=6,6 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=6,6 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=7,7;HM2=, GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS DP=7,7;HM2=, GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS DP=8 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=9 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS GN=gene2;HM2 GT:GQ:DP 0/1:245:32 2 120 . 
GA G 628 q10 GN=gene2;HM2 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS GN=gene2;HM2 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS GN=gene2;HM2 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS GN=gene2;HM2 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,TC,T 246 PASS DP=11;GN=gene3 GT:GQ:DP 0/2:12:10 vcftools-0.1.15/examples/annotate.txt000066400000000000000000000004531307140004000176140ustar00rootroot00000000000000100 100 1 id1_100 . . HM2 gene1 5 110 110 1 id1_110 CAAA C,CA 0 . 6 110 110 1 id2_110 C T 0 . 6 130 130 1 id1_130 G T HM2 . 7 130 130 1 id2_130 GAA GG HM2 . 7 140 140 1 id1_140 GT G 0 . 8 150 150 1 id1_150 TAAAA T 0 . 9 110 150 2 id2_110_150 CAAA C HM2 gene2 . 160 160 2 id2_160 TAAAA TC 0 gene3 11 vcftools-0.1.15/examples/annotate2.out000066400000000000000000000060431307140004000176670ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT G 1806 q10;MaxDP DP=35 1 104 . C . 1792 PASS DP=32 1 105 . C T 246 SnpGap;SnpCluster DP=10 1 106 . C A 246 SnpGap;SnpCluster DP=10 2 107 . C . 1806 q10;MaxDP DP=35 2 108 . C . 1792 PASS DP=32 2 109 . C . 628 q10 DP=21 2 110 . C G 1016 SnpGap;SnpCluster DP=22 2 111 . C G 727 SnpGap;SnpCluster DP=30 2 112 . C G 246 SnpGap;SnpCluster DP=10 2 113 . C . 246 PASS DP=10 2 114 . T . 246 PASS DP=10 2 115 . T . 246 PASS DP=10 2 116 . T . 246 PASS DP=10 2 117 . T A 246 SnpGap;SnpCluster DP=10 2 118 . T C 246 SnpGap;SnpCluster DP=10 2 119 . TAAA T 246 SnpCluster DP=10 2 124 . TA T 246 GapWin DP=10 2 128 . T TA 246 SnpCluster DP=10 2 130 . C A 246 SnpGap;SnpCluster DP=10 2 131 . T A 246 SnpGap;SnpCluster DP=10 2 132 . T A 246 SnpGap;SnpCluster DP=10 2 133 . T A 246 SnpGap;SnpCluster DP=10 2 134 . T A 246 SnpGap;SnpCluster DP=10 2 135 . T C 246 SnpGap;SnpCluster DP=10 2 136 . TT T 246 GapWin;SnpCluster DP=10;AF=0.1 2 138 . 
TT T 246 SnpCluster DP=10;AF=0.2 2 140 . TT T 246 GapWin;SnpCluster DP=10;AF=0.1 17 12412 . CAGAGAGAGA CAGAGAGAGAGA 74.8 MaxDP INDEL;DP=5388;AF1=0.006576;CI95=0.005525,0.01105;DP4=2077,2367,21,22;MQ=47;FQ=74.8;PV4=0.88,1,0.34,0.021 17 12427 . G A 999 MaxDP;SnpGap DP=5557;AF1=0.06028;CI95=0.04972,0.07182;DP4=2461,2689,106,74;MQ=47;FQ=999;PV4=0.0038,1,2.6e-12,1 17 69284 . G A 14.6 MaxDP;SnpGap DP=3946;AF1=0.003468;CI95=0.002762,0.008287;DP4=1529,2177,7,9;MQ=44;FQ=14.6;PV4=1,0.035,0.098,1 17 69293 . GTTTCATTTC GTTTCTTTTCATTTC 999 MaxDP INDEL;DP=3568;AF1=0.1295;CI95=0.1077,0.1547;DP4=1014,1238,118,121;MQ=44;FQ=999;PV4=0.22,1,9.4e-54,1 vcftools-0.1.15/examples/annotate3.out000066400000000000000000000025621307140004000176720ustar00rootroot00000000000000##fileformat=VCFv4.0 ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 id1_100 GTTT G 1806 q10 DP=5;GN=gene1;HM2 GT:GQ:DP 0/1:409:35 1 110 id2_110 C T,G 1792 PASS DP=6 GT:GQ:DP 0/1:245:32 1 110 id1_110 CAAA C 1792 PASS DP=6 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 id1_130 G T 1016 PASS DP=7;HM2 GT:GQ:DP 0/1:212:22 1 130 id2_130 GAA GG 1016 PASS DP=7;HM2 GT:GQ:DP 0/1:212:22 1 140 id1_140 GT G 727 PASS DP=8 GT:GQ:DP 0/1:150:30 1 150 id1_150 TAAAA TA,T 246 PASS DP=9 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 id2_110_150 CAAA C 1792 PASS GN=gene2;HM2 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . 
TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 id2_160 TAAAA TA,TC,T 246 PASS DP=11;GN=gene3 GT:GQ:DP 0/2:12:10 vcftools-0.1.15/examples/cmp-test-a-3.3.vcf000066400000000000000000000010121307140004000202050ustar00rootroot00000000000000##fileformat=VCFv3.3 ##INFO=DP,1,Integer,"Total Depth" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 ./.:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 1/1:41:1 0/2:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools-0.1.15/examples/cmp-test-a.vcf000066400000000000000000000011761307140004000177170ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 ./.:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 1/1:41:1 0/2:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools-0.1.15/examples/cmp-test-b-3.3.vcf000066400000000000000000000010101307140004000202040ustar00rootroot00000000000000##fileformat=VCFv3.3 ##INFO=DP,1,Integer,"Total Depth" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 1/0:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 1|0:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100400 . C G 35 . DP=1 GT:GQ:DP 1/1:41:1 0/0:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . 
DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools-0.1.15/examples/cmp-test-b.vcf000066400000000000000000000011741307140004000177160ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 1/0:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 1|0:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100400 . C G 35 . DP=1 GT:GQ:DP 1/1:41:1 0/0:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools-0.1.15/examples/cmp-test.out000066400000000000000000000051111307140004000175230ustar00rootroot00000000000000# This file was generated by vcf-compare. # #VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part. #VN The columns are: #VN 1 .. number of sites unique to this particular combination of files #VN 2- .. combination of files and space-separated number, a fraction of sites in the file VN 6 cmp-test-a.vcf.gz (100.0%) cmp-test-b.vcf.gz (100.0%) #SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN Number of REF matches: 6 SN Number of ALT matches: 5 SN Number of REF mismatches: 0 SN Number of ALT mismatches: 1 SN Number of samples in GT comparison: 2 #GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part. #GS The columns are: #GS 1 .. variant type #GS 2 .. number of mismatches #GS 3 .. number of matches #GS 4 .. discordance GS hom_RR 0 3 0.00% GS het_RA 1 3 25.00% GS hom_AA 0 4 0.00% GS het_AA 0 0 0.00% SN Non-reference Discordance Rate (NDR): 12.50 SN Summary: NDR 12.50, RR 0.00, RA 25.00, AA 0.00 #GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part. #GC The columns are: #GC 1 .. Sample #GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA #GC 7-9 .. Gtype lost: total het_RA het_AA #GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA #GC 15-17 .. Phase lost: total het_RA het_AA #GC 18 .. 
Phase gained #GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA #GC 24 .. Phased matches: het_RA #GC 25 .. Misphased matches: het_RA GC A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 4 2 0 2 1 GC B 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 4 3 0 1 0 0 0 #AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part. #AF The columns are: #AF 1 .. Non-ref allele count #AF 2 .. Hom(RR) matches #AF 3 .. Het(RA) matches #AF 4 .. Hom(AA) matches #AF 5 .. Het(AA) matches #AF 6 .. Hom(RR) mismatches #AF 7 .. Het(RA) mismatches #AF 8 .. Hom(AA) mismatches #AF 9 .. Het(AA) mismatches AF 0.25 1 1 0 0 0 0 0 0 AF 0.50 2 2 2 0 0 0 0 0 AF 0.75 0 0 1 0 0 1 0 0 AF 1.00 0 0 1 0 0 0 0 0 #DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part. #DP The columns are: #DP 1 .. depth #DP 2 .. RR matches #DP 3 .. RA matches #DP 4 .. AA matches #DP 5 .. RR -> RA mismatches #DP 6 .. RR -> AA mismatches #DP 7 .. RA -> RR mismatches #DP 8 .. RA -> AA mismatches #DP 9 .. AA -> RR mismatches #DP 10 .. AA -> RA mismatches DP 1 3 3 4 0 0 1 0 0 0 vcftools-0.1.15/examples/concat-a.vcf000066400000000000000000000022341307140004000174260ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . 
GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,TC,T 246 PASS DP=10 GT:GQ:DP 0/2:12:10 vcftools-0.1.15/examples/concat-b.vcf000066400000000000000000000012421307140004000174250ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 141 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 151 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 161 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 171 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 181 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 191 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools-0.1.15/examples/concat-c.vcf000066400000000000000000000017141307140004000174320ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 2 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools-0.1.15/examples/concat.out000066400000000000000000000040321307140004000172370ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 130 . 
GAA GG 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 141 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 151 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 161 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 171 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 181 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 191 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 160 . TAAAA TA,TC,T 246 PASS DP=10 GT:GQ:DP 0/2:12:10 2 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 192 . 
TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools-0.1.15/examples/consensus.fa000066400000000000000000000020161307140004000175670ustar00rootroot00000000000000>1:1-500 ATACCATATGTGACTTATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTTG CAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG ATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAATC TTTAAAAACAAAAAAAAAGAA >2:1-500 GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG AGAGATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAA ATCAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGT GTCGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAG TTATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGA TTTGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGT GTTACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTG ACTCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT TCAGACACAGTTAATCCAGAC vcftools-0.1.15/examples/consensus.out000066400000000000000000000020151307140004000200070ustar00rootroot00000000000000>1:1-500 ATAC*ATAT*TG*T***ATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTT G*AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCAT TAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAA TATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCT CTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAA CTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAA GGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCT GATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAAT CTTTAAAAACAAAAAAAAAGAA >2:1-500 GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG A*ATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAAAT 
CAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGTGT CGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAGTT ATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGATT TGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGT TACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGAC TCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT*C AGACACAGTTAATCCAGAC vcftools-0.1.15/examples/consensus.out2000066400000000000000000000020151307140004000200710ustar00rootroot00000000000000>1:1-500 ATAC*ATATGTG*T***ATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTT G*AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCAT TAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAA TATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCT CTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAA CTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAA GGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCT GATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAAT CTTTAAAAACAAAAAAAAAGAA >2:1-500 GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG A*ATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAAAT CAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGTGT CGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAGTT ATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGATT TGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGT TACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGAC TCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT@C AGACACAGTTAATCCAGAC vcftools-0.1.15/examples/consensus.vcf000066400000000000000000000004231307140004000177570ustar00rootroot00000000000000##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA001 1 5 . C * . PASS . GT 0/1 1 10 . G * . PASS . GT 0/0 1 12 . GACT G* . PASS . GT 0/1 1 16 . T T*** . PASS . GT 1/1 1 61 . C * . PASS . GT 1/1 2 61 . AGAG A* . PASS . GT 0/1 2 481 . T *,@ . PASS . 
GT 0/2 vcftools-0.1.15/examples/contrast.out000066400000000000000000000154471307140004000176410ustar00rootroot00000000000000##fileformat=VCFv4.1 ##samtoolsVersion=0.1.18-r572 ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##INFO= ##INFO= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D 1 10250 . A C 61 MinMQ DP=271;VDB=0.0265;AF1=0.125;AC1=1;DP4=87,78,18,9;MQ=17;FQ=61;PV4=0.21,1,1,0.1;AN=8;AC=1;NOVELAL=D;NOVELTY=255 GT:DP:SP:GQ 0/0:60:0:99 0/0:32:5:53 0/0:50:2:83 0/1:50:3:62 1 10352 . TACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC 253 MinMQ INDEL;DP=413;VDB=0.0226;AF1=0.8598;AC1=7;DP4=7,17,13,44;MQ=15;FQ=4.35;PV4=0.58,1,1,0.0055;AN=8;AC=7;NOVELAL=D;NOVELTY=6 GT:PL:DP:SP:GQ 1/1:67,6,0:18:2:11 1/1:14,7,0:12:0:12 1/1:111,22,0:23:0:26 0/1:83,0,22:28:2:18 1 17538 . C A 64 MinMQ DP=393;VDB=0.0314;AF1=0.125;AC1=1;DP4=138,205,17,27;MQ=28;FQ=64;PV4=0.87,0.32,1,1;AN=8;AC=1;NOVELAL=D;NOVELTY=29 GT:PL:DP:SP:GQ 0/0:0,152,255:148:1:99 0/0:0,29,227:72:4:34 0/0:0,71,255:86:6:76 0/1:70,0,226:81:4:65 1 28563 . A G 999 MinMQ DP=124;VDB=0.0072;AF1=1;AC1=8;DP4=22,31,27,39;MQ=18;FQ=-3.67;PV4=1,1,1,1;AN=8;AC=8;NOVELAL=D;NOVELTY=1 GT:PL:DP:SP:GQ 1/1:191,6,0:41:1:14 1/1:90,2,0:24:0:11 1/1:213,20,0:31:4:28 1/1:104,0,1:23:0:8 1 28590 . TT TTGGT 116 MinMQ INDEL;DP=112;VDB=0.0233;AF1=0.3933;AC1=3;DP4=5,46,10,16;MQ=19;FQ=54.6;PV4=0.005,1,1,0.00097;AN=8;AC=3;NOVELTY=9;NOVELGT=D GT:PL:DP:SP:GQ 0/1:80,0,2:23:10:8 0/1:9,0,9:15:15:9 0/1:51,0,26:21:2:31 0/0:0,17,39:18:5:16 1 55085 . 
T A 149 MinMQ DP=190;VDB=0.0199;AF1=0.3891;AC1=3;DP4=73,61,13,39;MQ=25;FQ=149;PV4=0.0003,0.35,0.01,1;AN=8;AC=3;NOVELTY=7;NOVELGT=D GT:PL:DP:SP:GQ 0/1:79,0,161:48:4:80 0/1:9,0,146:22:13:10 0/1:68,0,250:49:12:69 0/0:0,7,228:67:12:7 1 58176 . G A 94.7 MinMQ DP=93;VDB=0.0330;AF1=0.3746;AC1=3;DP4=51,13,15,9;MQ=17;FQ=94.7;PV4=0.11,0.0027,1,1;AN=8;AC=3;NOVELTY=18;NOVELGT=D GT:PL:DP:SP:GQ 0/1:30,0,23:22:0:26 0/1:18,0,15:12:7:17 0/1:55,0,102:29:2:56 0/0:0,42,114:25:9:41 1 66507 . T A 999 PASS DP=202;VDB=0.0385;AF1=0.626;AC1=5;DP4=25,14,63,82;MQ=42;FQ=999;PV4=0.03,0.023,1,0.0014;AN=8;AC=5;NOVELTY=20;NOVELGT=D GT:PL:DP:SP:GQ 0/1:255,0,205:42:7:99 0/1:255,0,20:37:12:21 0/1:255,0,155:57:4:99 1/1:255,72,0:48:0:71 1 66521 . TATATAATATA TATATAATATAATATA 999 PASS INDEL;DP=200;VDB=0.0384;AF1=0.3747;AC1=3;DP4=61,75,25,12;MQ=43;FQ=999;PV4=0.016,1,3.8e-20,0.38;AN=8;AC=3;NOVELTY=25;NOVELGT=D GT:PL:DP:SP:GQ 0/1:233,0,255:40:7:99 0/1:25,0,255:32:16:26 0/1:178,0,255:56:3:99 0/0:0,75,255:45:3:74 1 73841 . C T 999 PASS DP=182;VDB=0.0366;AF1=0.3748;AC1=3;DP4=50,64,12,26;MQ=30;FQ=999;PV4=0.25,1.6e-10,0.084,1;AN=8;AC=3;NOVELTY=28;NOVELGT=D GT:PL:DP:SP:GQ 0/1:95,0,255:33:3:96 0/1:174,0,204:27:9:99 0/1:28,0,255:53:17:29 0/0:0,64,255:39:6:63 X 2 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1;NOVELTY=255;NOVELGT=D GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 X 3 . 
A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1;NOVELAL=D;NOVELTY=255 GT:PL:DP 0/0:0,95,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 vcftools-0.1.15/examples/contrast.vcf000066400000000000000000000666411307140004000176120ustar00rootroot00000000000000##fileformat=VCFv4.1 ##samtoolsVersion=0.1.18-r572 ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##source_20120424.1=vcf-annotate(r735) --fill-AC-AN -f + ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##INFO= ##INFO= ##source_20120424.2=vcf-annotate(r735) --fill-AC-AN -f + ##FILTER= ##source_20120710.1=vcf-annotate(r761) -f q=30 mpileup-v1/merged.filt.vcf.gz #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D 1 10177 . A C 37 MinMQ DP=495;VDB=0.0168;AF1=0.1596;AC1=1;DP4=167,82,52,21;MQ=13;FQ=37;PV4=0.57,1,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,0,15:101:2:5 0/0:0,36,89:51:5:40 0/0:0,79,103:85:1:83 0/1:41,0,31:85:16:36 1 10250 . A C 61 MinMQ DP=271;VDB=0.0265;AF1=0.125;AC1=1;DP4=87,78,18,9;MQ=17;FQ=61;PV4=0.21,1,1,0.1;AN=8;AC=1 GT:DP:SP:GQ 0/0:60:0:99 0/0:32:5:53 0/0:50:2:83 0/1:50:3:62 1 10257 . A C 31.9 MinMQ DP=400;VDB=0.0245;AF1=0.2404;AC1=2;DP4=93,100,26,10;MQ=16;FQ=31.9;PV4=0.01,1,1,0.013;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,93,197:65:3:95 0/1:13,0,92:41:9:11 0/0:0,91,128:59:0:93 0/1:27,0,70:64:14:25 1 10329 . ACCCC ACCC 26.4 MinMQ INDEL;DP=315;VDB=0.0160;AF1=0.2047;AC1=2;DP4=2,42,9,16;MQ=17;FQ=29.3;PV4=0.0011,1,0.061,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,4,68:22:10:7 0/0:2,0,16:7:0:3 0/0:0,15,61:18:0:17 0/1:46,0,34:22:7:40 1 10352 . 
TACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC 253 MinMQ INDEL;DP=413;VDB=0.0226;AF1=0.8598;AC1=7;DP4=7,17,13,44;MQ=15;FQ=4.35;PV4=0.58,1,1,0.0055;AN=8;AC=7 GT:PL:DP:SP:GQ 1/1:67,6,0:18:2:11 1/1:14,7,0:12:0:12 1/1:111,22,0:23:0:26 0/1:83,0,22:28:2:18 1 10492 . C T 999 PASS DP=213;VDB=0.0102;AF1=0.375;AC1=3;DP4=84,74,34,19;MQ=32;FQ=999;PV4=0.2,0.11,0.057,0.13;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:85,0,255:57:3:86 0/0:0,123,255:41:0:99 0/1:255,0,255:47:0:99 0/1:114,0,255:66:4:99 1 10583 . G A 20 PASS DP=134;VDB=0.0071;AF1=0.1242;AC1=1;DP4=78,41,6,6;MQ=32;FQ=20;PV4=0.35,0.29,0.052,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:26,0,227:40:8:21 0/0:0,35,255:21:0:40 0/0:0,108,255:36:0:99 0/0:0,21,255:34:0:26 1 10797 . CAGA CAGAGA 90.4 MinMQ INDEL;DP=37;VDB=0.0243;AF1=0.2819;AC1=2;DP4=7,3,0,6;MQ=29;FQ=93.3;PV4=0.011,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,9,104:3:0:10 0/0:0,6,39:2:0:8 0/1:59,0,36:5:0:43 0/1:56,0,32:6:4:39 1 10821 . T A 49.7 MinMQ DP=9;VDB=0.0091;AF1=1;AC1=5;DP4=1,3,0,4;MQ=12;FQ=7.75;PV4=1,1,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:42,9,0:3:0:10 0/1:0,3,4:1:0:2 1/1:12,1,0:2:0:4 0/1:0,6,8:2:0:2 1 14907 . A G 999 MinMQ DP=461;VDB=0.0384;AF1=0.5;G3=8.874e-45,1,8.011e-40;HWE=0.0185;AC1=4;DP4=101,122,129,102;MQ=25;FQ=999;PV4=0.031,0.011,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:225,0,255:133:0:99 0/1:213,0,225:91:20:99 0/1:255,0,188:104:0:99 0/1:255,0,208:126:4:99 1 14930 . A G 999 MinMQ DP=502;VDB=0.0393;AF1=0.5;G3=1.282e-48,1,7.866e-46;HWE=0.0185;AC1=4;DP4=117,121,135,111;MQ=28;FQ=999;PV4=0.24,0.02,0.42,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:150:0:99 0/1:232,0,255:84:9:99 0/1:255,0,250:114:0:99 0/1:255,0,218:136:4:99 1 15118 . A G 196 MinMQ DP=408;VDB=0.0389;AF1=0.4995;G3=4.894e-09,1,2.035e-09;HWE=0.0193;AC1=4;DP4=79,107,98,101;MQ=13;FQ=196;PV4=0.19,1,0.16,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:42,0,97:110:1:45 0/1:14,0,34:82:7:17 0/1:54,0,93:90:2:57 0/1:92,0,15:103:1:18 1 15211 . 
T G 999 MinMQ DP=381;VDB=0.0374;AF1=0.6993;AC1=6;DP4=52,44,122,137;MQ=17;FQ=156;PV4=0.28,0.31,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 0/1:146,0,101:114:5:99 1/1:77,1,0:67:2:4 0/1:121,0,61:78:7:60 1/1:192,89,0:96:11:90 1 15274 . A T 999 MinMQ DP=229;VDB=0.0313;AF1=1;AC1=8;DP4=0,0,99,120;MQ=11;FQ=-92.5;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:83,114,0:54:0:99 1/1:82,108,0:48:0:99 1/1:84,105,0:47:0:99 1/1:112,175,0:70:0:99 1 15820 . G T 90.6 MinMQ DP=149;VDB=0.0252;AF1=0.374;AC1=3;DP4=24,68,15,40;MQ=17;FQ=90.6;PV4=1,1,2e-07,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:40,0,124:48:1:41 0/0:0,27,153:33:2:26 1/1:65,25,0:15:3:20 0/0:0,65,195:51:5:64 1 15903 . GCC GCCC 158 MinMQ INDEL;DP=14;VDB=0.0182;AF1=1;AC1=8;DP4=0,0,0,7;MQ=29;FQ=-18.2;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:29,3,0:1:0:13 1/1:44,6,0:2:0:16 1/1:29,3,0:1:0:13 1/1:72,9,0:3:0:19 1 16103 . T G 24.3 PASS DP=110;VDB=0.0122;AF1=0.2119;AC1=2;DP4=49,26,23,2;MQ=31;FQ=24.3;PV4=0.01,1,6.9e-13,0.33;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,2,228:39:11:5 0/0:0,7,189:15:6:10 0/0:0,0,218:25:0:4 0/1:26,0,192:21:5:24 1 16378 . T C 999 MinMQ DP=587;VDB=0.0267;AF1=0.5;G3=9.954e-26,1,3.125e-18;HWE=0.0185;AC1=4;DP4=128,75,245,120;MQ=18;FQ=999;PV4=0.36,0.23,0.024,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:127,0,150:194:9:99 0/1:118,0,139:108:4:99 0/1:156,0,80:130:3:83 0/1:166,0,148:136:1:99 1 16495 . G C 97.7 MinMQ DP=644;VDB=0.0239;AF1=0.2493;AC1=2;DP4=226,252,67,87;MQ=19;FQ=97.7;PV4=0.46,0.14,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,41,126:190:5:43 0/0:0,17,168:115:1:19 0/1:19,0,132:166:1:17 0/1:87,0,176:161:4:85 1 16534 . C T 264 MinMQ DP=516;VDB=0.0397;AF1=0.5;G3=3.737e-14,1,2.067e-30;HWE=0.0185;AC1=4;DP4=129,149,109,113;MQ=14;FQ=264;PV4=0.59,0.34,1,0.0011;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:97,0,110:151:4:99 0/1:38,0,92:115:2:41 0/1:50,0,120:118:5:53 0/1:85,0,158:116:5:88 1 16571 . 
G A 120 MinMQ DP=435;VDB=0.0388;AF1=0.4998;G3=1.594e-10,1,7.561e-11;HWE=0.0189;AC1=4;DP4=94,134,84,109;MQ=10;FQ=120;PV4=0.69,0.018,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:33,0,20:123:0:23 0/1:42,0,24:95:2:27 0/1:18,0,43:107:0:21 0/1:33,0,70:96:4:36 1 17538 . C A 64 MinMQ DP=393;VDB=0.0314;AF1=0.125;AC1=1;DP4=138,205,17,27;MQ=28;FQ=64;PV4=0.87,0.32,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,152,255:148:1:99 0/0:0,29,227:72:4:34 0/0:0,71,255:86:6:76 0/1:70,0,226:81:4:65 1 20144 . G A 98.2 MinMQ DP=304;VDB=0.0356;AF1=0.4851;G3=4.916e-07,1,8.127e-35;HWE=0.0213;AC1=4;DP4=91,122,40,43;MQ=15;FQ=98.2;PV4=0.44,0.0094,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:6,0,72:94:6:9 0/1:32,0,80:44:13:35 0/1:28,0,112:81:9:31 0/1:38,0,62:77:0:41 1 28558 . C T 164 MinMQ DP=142;VDB=0.0026;AF1=0.4529;G3=1.465e-06,1,4.392e-30;HWE=0.0307;AC1=4;DP4=38,62,18,20;MQ=17;FQ=164;PV4=0.34,1,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:0,0,104:44:9:4 0/1:27,0,28:32:3:27 0/1:77,0,113:35:5:79 0/1:64,0,31:27:0:35 1 28563 . A G 999 MinMQ DP=124;VDB=0.0072;AF1=1;AC1=8;DP4=22,31,27,39;MQ=18;FQ=-3.67;PV4=1,1,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:191,6,0:41:1:14 1/1:90,2,0:24:0:11 1/1:213,20,0:31:4:28 1/1:104,0,1:23:0:8 1 28590 . TT TTGGT 116 MinMQ INDEL;DP=112;VDB=0.0233;AF1=0.3933;AC1=3;DP4=5,46,10,16;MQ=19;FQ=54.6;PV4=0.005,1,1,0.00097;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:80,0,2:23:10:8 0/1:9,0,9:15:15:9 0/1:51,0,26:21:2:31 0/0:0,17,39:18:5:16 1 30867 . CCTCTCTCTCTCTCTCTCTCTCTCTC CCTCTCTCTCTCTCTCTCTCTC 999 PASS INDEL;DP=229;VDB=0.0320;AF1=0.5;G3=4.953e-17,1,5e-52;HWE=0.0185;AC1=4;DP4=56,66,27,32;MQ=37;FQ=999;PV4=1,1,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:211,0,255:47:1:99 0/1:74,0,255:20:2:77 0/1:255,0,255:70:3:99 0/1:176,0,255:44:0:99 1 30923 . G T 999 PASS DP=107;VDB=0.0022;AF1=1;AC1=8;DP4=0,0,47,50;MQ=37;FQ=-36;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,75,0:25:0:99 1/1:221,30,0:10:0:72 1/1:255,117,0:39:0:99 1/1:255,69,0:23:0:99 1 40639 . 
CTTTTTTTTTTTTTTTTTTT CTTTTTTTTTTTTTTTT 118 PASS INDEL;DP=72;VDB=0.0379;AF1=1;AC1=8;DP4=0,0,14,2;MQ=33;FQ=-18.8;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:77,30,0:11:0:40 1/1:16,3,0:1:0:14 1/1:16,3,0:1:0:14 1/1:25,6,0:3:0:16 1 46633 . T A 46.4 MinMQ DP=169;VDB=0.0275;AF1=0.1322;AC1=1;DP4=67,81,9,9;MQ=15;FQ=46.4;PV4=0.8,0.5,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:52,0,60:47:0:47 0/0:0,7,71:30:0:12 0/0:0,114,146:38:0:99 0/0:0,154,179:51:0:99 1 49298 . T C 999 MinMQ DP=124;VDB=0.0376;AF1=1;AC1=8;DP4=17,14,49,36;MQ=24;FQ=-3.76;PV4=0.83,1,1,0.49;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:130,0,4:20:4:6 1/1:127,8,0:12:3:17 1/1:247,27,0:45:26:36 1/1:252,60,0:39:12:69 1 51803 . T C 999 MinMQ DP=88;VDB=0.0284;AF1=1;AC1=8;DP4=9,30,20,25;MQ=15;FQ=-3.64;PV4=0.065,0.21,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:45,2,0:22:11:10 1/1:60,5,0:6:3:13 1/1:105,1,0:30:2:9 1/1:153,6,0:26:2:14 1 51898 . C A 22.2 PASS DP=128;VDB=0.0230;AF1=0.1272;AC1=1;DP4=56,50,14,2;MQ=41;FQ=22.2;PV4=0.013,0.069,9.8e-17,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:28,0,255:23:13:23 0/0:0,35,255:19:3:40 0/0:0,59,255:44:0:64 0/0:0,11,255:36:9:16 1 51928 . G A 54.1 PASS DP=149;VDB=0.0311;AF1=0.1269;AC1=1;DP4=67,52,22,5;MQ=41;FQ=54.1;PV4=0.017,0.0073,7.3e-34,0.37;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:60,0,255:29:13:55 0/0:0,27,255:19:0:32 0/0:0,30,255:51:2:35 0/0:0,13,255:47:11:18 1 52058 . G C 17.5 PASS DP=132;VDB=0.0277;AF1=0.2308;AC1=2;DP4=55,57,15,2;MQ=35;FQ=17.5;PV4=0.0031,0.036,0.00091,0.03;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:13,0,178:20:7:11 0/0:0,60,255:20:0:62 0/1:12,0,255:51:8:10 0/0:0,15,255:38:9:17 1 52238 . T G 999 PASS DP=138;VDB=0.0125;AF1=1;AC1=8;DP4=0,0,65,60;MQ=37;FQ=-42;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,63,0:21:0:99 1/1:255,36,0:12:0:84 1/1:255,166,0:55:0:99 1/1:255,111,0:37:0:99 1 54586 . T C 51.1 PASS DP=116;VDB=0.0136;AF1=0.236;AC1=2;DP4=47,45,14,7;MQ=36;FQ=51.1;PV4=0.23,1,6.1e-11,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:48,0,186:24:0:46 0/0:0,26,252:14:0:28 0/1:11,0,255:32:7:9 0/0:0,40,255:43:9:42 1 54676 . 
C T 999 PASS DP=143;VDB=0.0244;AF1=0.4969;G3=1.224e-08,1,4.851e-96;HWE=0.0191;AC1=4;DP4=54,48,19,21;MQ=40;FQ=999;PV4=0.58,0.47,8.6e-13,0.2;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:82,0,233:23:7:85 0/1:121,0,237:20:2:99 0/1:173,0,255:49:6:99 0/1:13,0,255:50:2:16 1 54712 . TTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTT TTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTT 423 PASS INDEL;DP=161;VDB=0.0367;AF1=0.8839;AC1=7;DP4=1,0,4,12;MQ=45;FQ=-9.11;PV4=0.29,1,1,0.06;AN=8;AC=7 GT:PL:DP:SP:GQ 0/1:0,3,13:1:0:5 1/1:125,9,0:3:0:14 1/1:67,6,0:2:0:11 1/1:255,33,0:11:0:38 1 54753 . T G 61.5 PASS DP=177;VDB=0.0130;AF1=0.2019;AC1=2;DP4=48,82,1,5;MQ=40;FQ=61.5;PV4=0.42,1,0.27,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:5,0,237:20:3:4 0/1:63,0,193:16:0:60 0/0:0,160,255:53:0:99 0/0:0,141,255:47:0:99 1 54844 . G A 999 MinMQ DP=172;VDB=0.0254;AF1=0.4999;G3=4.104e-12,1,1.068e-33;HWE=0.0185;AC1=4;DP4=70,44,38,18;MQ=20;FQ=999;PV4=0.5,0.27,1,0.29;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:88,0,103:45:5:91 0/1:31,0,120:25:0:34 0/1:97,0,124:58:2:99 0/1:49,0,181:42:5:52 1 55085 . T A 149 MinMQ DP=190;VDB=0.0199;AF1=0.3891;AC1=3;DP4=73,61,13,39;MQ=25;FQ=149;PV4=0.0003,0.35,0.01,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:79,0,161:48:4:80 0/1:9,0,146:22:13:10 0/1:68,0,250:49:12:69 0/0:0,7,228:67:12:7 1 55164 . C A 999 MinMQ DP=96;VDB=0.0334;AF1=1;AC1=8;DP4=0,0,54,35;MQ=23;FQ=-36;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:163,60,0:20:0:99 1/1:124,30,0:10:0:72 1/1:198,69,0:23:0:99 1/1:203,108,0:36:0:99 1 55926 . T C 999 MinMQ DP=56;VDB=0.0269;AF1=1;AC1=8;DP4=0,0,23,32;MQ=14;FQ=-11.9;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:114,30,0:10:0:48 1/1:8,6,0:2:0:24 1/1:130,63,0:21:0:81 1/1:78,66,0:22:0:84 1 57376 . C T 16 MinMQ DP=143;VDB=0.0237;AF1=0.1883;AC1=2;DP4=70,55,2,13;MQ=28;FQ=16;PV4=0.002,0.034,0.001,0.027;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:4,0,201:26:8:3 0/1:18,0,188:19:10:15 0/0:0,26,246:38:15:29 0/0:0,154,255:57:0:99 1 57856 . 
T A 999 MinMQ DP=191;VDB=0.0263;AF1=0.5;G3=5.154e-23,1,1.244e-12;HWE=0.0185;AC1=4;DP4=58,51,29,51;MQ=21;FQ=999;PV4=0.027,0.028,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:204,0,154:54:13:99 0/1:121,0,52:29:9:55 0/1:142,0,175:58:2:99 0/1:104,0,198:48:0:99 1 57952 . A C 118 MinMQ DP=30;VDB=0.0356;AF1=0.8939;AC1=7;DP4=1,1,8,19;MQ=10;FQ=7.51;PV4=0.53,0.23,0.0064,0.46;AN=8;AC=7 GT:PL:DP:SP:GQ 1/1:36,21,0:7:0:27 1/1:17,6,0:2:0:12 1/1:40,33,0:11:0:39 0/1:29,0,12:9:4:7 1 58176 . G A 94.7 MinMQ DP=93;VDB=0.0330;AF1=0.3746;AC1=3;DP4=51,13,15,9;MQ=17;FQ=94.7;PV4=0.11,0.0027,1,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:30,0,23:22:0:26 0/1:18,0,15:12:7:17 0/1:55,0,102:29:2:56 0/0:0,42,114:25:9:41 1 58211 . A G 999 MinMQ DP=46;VDB=0.0332;AF1=1;AC1=8;DP4=0,0,30,15;MQ=22;FQ=-11.8;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:113,30,0:10:0:47 1/1:7,6,0:2:0:23 1/1:148,60,0:20:0:77 1/1:143,39,0:13:0:56 1 58771 . T C 999 MinMQ DP=263;VDB=0.0270;AF1=0.375;AC1=3;DP4=99,85,29,44;MQ=20;FQ=999;PV4=0.053,0.46,1,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:95,0,124:60:2:96 0/0:0,123,209:41:0:99 0/1:179,0,208:101:6:99 0/1:94,0,169:55:6:95 1 58866 . C G 119 MinMQ DP=233;VDB=0.0293;AF1=0.2581;AC1=2;DP4=77,98,12,40;MQ=18;FQ=119;PV4=0.0092,0.058,1,0.19;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:58,0,112:49:1:56 0/0:0,102,152:34:0:99 0/1:69,0,180:70:11:67 0/0:0,10,148:74:16:12 1 60332 . T C 97.6 MinMQ DP=239;VDB=0.0192;AF1=0.3089;AC1=2;DP4=77,104,22,32;MQ=17;FQ=97.6;PV4=0.88,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,0,99:61:9:3 0/0:0,111,185:37:0:99 0/1:70,0,145:68:7:70 0/1:33,0,123:69:0:33 1 61219 . T C 42.6 MinMQ DP=180;VDB=0.0243;AF1=0.1351;AC1=1;DP4=41,105,18,15;MQ=24;FQ=42.6;PV4=0.0069,0.27,0.32,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,6,128:36:5:11 0/0:0,57,168:31:0:62 0/1:48,0,227:53:21:43 0/0:0,16,255:59:6:21 1 61442 . A G 999 PASS DP=96;VDB=0.0348;AF1=1;AC1=8;DP4=0,0,41,47;MQ=30;FQ=-27;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:197,42,0:14:0:75 1/1:108,21,0:7:0:54 1/1:255,99,0:33:0:99 1/1:255,102,0:34:0:99 1 61499 . 
G A 87.1 PASS DP=140;VDB=0.0120;AF1=0.3006;AC1=2;DP4=54,55,18,12;MQ=35;FQ=87.1;PV4=0.41,0.3,2.4e-12,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:61,0,243:24:2:60 0/0:0,1,231:14:7:4 0/1:32,0,255:43:3:31 0/0:0,48,255:58:6:49 1 61579 . G A 107 MinMQ DP=161;VDB=0.0304;AF1=0.25;AC1=2;DP4=88,29,32,10;MQ=20;FQ=107;PV4=1,0.12,0.015,0.43;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,65,171:34:3:67 0/0:0,32,152:18:3:34 0/1:40,0,174:56:0:38 0/1:75,0,207:51:1:73 1 61987 . A G 999 PASS DP=206;VDB=0.0287;AF1=0.5;G3=1.244e-38,1,7.859e-46;HWE=0.0185;AC1=4;DP4=46,65,42,48;MQ=39;FQ=999;PV4=0.48,0.072,0.00033,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:58:0:99 0/1:182,0,218:22:2:99 0/1:255,0,255:62:3:99 0/1:220,0,255:59:6:99 1 61989 . G C 999 PASS DP=208;VDB=0.0311;AF1=0.5;G3=3.141e-40,1,8.15e-49;HWE=0.0185;AC1=4;DP4=47,65,42,49;MQ=39;FQ=999;PV4=0.57,0.058,8.7e-05,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:59:1:99 0/1:190,0,233:22:2:99 0/1:255,0,255:62:3:99 0/1:216,0,255:60:6:99 1 62203 . T C 999 PASS DP=258;VDB=0.0354;AF1=0.5;G3=3.125e-31,1,5e-52;HWE=0.0185;AC1=4;DP4=77,73,47,52;MQ=40;FQ=999;PV4=0.61,1,2.6e-25,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:66:5:99 0/1:145,0,255:38:1:99 0/1:252,0,255:84:1:99 0/1:210,0,255:61:0:99 1 62239 . TACACACACACACACACA TACACACACACACACA 999 PASS INDEL;DP=223;VDB=0.0280;AF1=0.4961;G3=3.069e-08,1,4.923e-103;HWE=0.0192;AC1=4;DP4=83,54,34,25;MQ=41;FQ=999;PV4=0.75,0.056,2.1e-17,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:248,0,255:45:1:99 0/1:12,0,255:23:9:15 0/1:183,0,255:68:2:99 0/1:158,0,255:60:6:99 1 62271 . A G 134 PASS DP=187;VDB=0.0101;AF1=0.498;G3=2.233e-09,1,4.959e-103;HWE=0.0189;AC1=4;DP4=92,56,14,20;MQ=41;FQ=134;PV4=0.033,0.013,1.9e-22,0.0028;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:65,0,255:35:1:68 0/1:19,0,255:28:2:22 0/1:17,0,255:60:11:20 0/1:39,0,255:59:14:42 1 62777 . 
A T 999 MinMQ DP=251;VDB=0.0308;AF1=0.499;G3=3.104e-08,1,1.551e-35;HWE=0.0187;AC1=4;DP4=80,108,35,26;MQ=21;FQ=999;PV4=0.055,1,1,0.39;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:115,0,137:69:1:99 0/1:124,0,169:34:6:99 0/1:87,0,202:76:5:90 0/1:18,0,109:70:3:21 1 63735 . CCTACTA CCTA 999 MinMQ INDEL;DP=141;VDB=0.0354;AF1=0.5;G3=9.836e-28,1,9.22e-15;HWE=0.0185;AC1=4;DP4=36,24,32,41;MQ=20;FQ=216;PV4=0.082,1.2e-09,1,0.023;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:83,0,86:30:6:84 0/1:130,0,56:21:0:59 0/1:184,0,40:43:5:43 0/1:181,0,55:39:3:58 1 64613 . T A 999 MinMQ DP=328;VDB=0.0091;AF1=0.5;G3=7.862e-12,1,3.13e-37;HWE=0.0185;AC1=4;DP4=134,130,23,36;MQ=27;FQ=999;PV4=0.11,1,1,0.36;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:79,0,175:95:0:82 0/1:48,0,208:47:2:51 0/1:193,0,248:95:13:99 0/1:107,0,211:86:0:99 1 66162 . A T 999 PASS DP=215;VDB=0.0231;AF1=0.4998;G3=1.238e-10,1,1.58e-77;HWE=0.0186;AC1=4;DP4=62,67,26,36;MQ=39;FQ=999;PV4=0.44,1,3.3e-21,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:170,0,255:45:3:99 0/1:77,0,255:30:0:80 0/1:183,0,255:69:3:99 0/1:26,0,255:47:0:29 1 66442 . TATATAATATA TATATAATATAATATA 132 PASS INDEL;DP=233;VDB=0.0328;AF1=0.3333;AC1=3;DP4=64,69,21,10;MQ=42;FQ=135;PV4=0.071,1,3.5e-27,0.11;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:62,0,255:30:2:62 0/0:0,50,255:37:0:50 0/1:91,0,255:65:8:91 0/1:3,0,255:32:2:5 1 66507 . T A 999 PASS DP=202;VDB=0.0385;AF1=0.626;AC1=5;DP4=25,14,63,82;MQ=42;FQ=999;PV4=0.03,0.023,1,0.0014;AN=8;AC=5 GT:PL:DP:SP:GQ 0/1:255,0,205:42:7:99 0/1:255,0,20:37:12:21 0/1:255,0,155:57:4:99 1/1:255,72,0:48:0:71 1 66521 . TATATAATATA TATATAATATAATATA 999 PASS INDEL;DP=200;VDB=0.0384;AF1=0.3747;AC1=3;DP4=61,75,25,12;MQ=43;FQ=999;PV4=0.016,1,3.8e-20,0.38;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:233,0,255:40:7:99 0/1:25,0,255:32:16:26 0/1:178,0,255:56:3:99 0/0:0,75,255:45:3:74 1 69511 . A G 999 MinMQ DP=79;VDB=0.0355;AF1=1;AC1=8;DP4=1,0,44,31;MQ=18;FQ=-30.9;PV4=1,1,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:133,42,0:14:0:79 1/1:95,25,0:12:0:62 1/1:170,57,0:19:0:94 1/1:192,93,0:31:0:99 1 70300 . 
C T 14.2 MinMQ DP=147;VDB=0.0063;AF1=0.1206;AC1=1;DP4=63,68,7,5;MQ=19;FQ=14.2;PV4=0.56,1,0.057,0.22;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:20,0,181:35:0:15 0/0:0,39,144:25:7:45 0/0:0,122,201:46:0:99 0/0:0,111,207:37:0:99 1 73822 . A G 161 MinMQ DP=175;VDB=0.0290;AF1=0.2494;AC1=2;DP4=67,87,9,10;MQ=25;FQ=161;PV4=0.81,0.5,1,0.45;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:25,0,203:41:1:23 0/1:144,0,170:29:6:99 0/0:0,178,255:59:0:99 0/0:0,132,255:44:0:99 1 73841 . C T 999 PASS DP=182;VDB=0.0366;AF1=0.3748;AC1=3;DP4=50,64,12,26;MQ=30;FQ=999;PV4=0.25,1.6e-10,0.084,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:95,0,255:33:3:96 0/1:174,0,204:27:9:99 0/1:28,0,255:53:17:29 0/0:0,64,255:39:6:63 1 74092 . G A 26.8 MinMQ DP=158;VDB=0.0267;AF1=0.2721;G3=0.7501,7.846e-07,0.2499;HWE=0.0437;AC1=2;DP4=91,48,4,10;MQ=11;FQ=26.8;PV4=0.0093,0.39,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,5,79:32:10:7 1/1:40,26,0:33:7:19 0/0:0,105,85:35:0:93 0/0:0,160,38:53:0:46 1 79033 . A G 217 MinMQ DP=19;VDB=0.0139;AF1=1;AC1=8;DP4=0,0,12,7;MQ=18;FQ=-12.7;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:77,18,0:6:0:36 1/1:56,12,0:4:0:30 1/1:28,18,0:6:0:36 1/1:56,9,0:3:0:27 1 79050 . G T 258 MinMQ DP=29;VDB=0.0203;AF1=1;AC1=8;DP4=0,0,15,11;MQ=16;FQ=-18.3;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:87,21,0:7:0:45 1/1:52,15,0:5:0:39 1/1:45,24,0:8:0:48 1/1:74,18,0:6:0:42 1 79418 . G C 28.5 PASS DP=99;VDB=0.0139;AF1=0.2016;AC1=2;DP4=31,59,1,5;MQ=39;FQ=28.5;PV4=0.66,0.015,0.045,0.00068;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:5,0,236:21:3:4 0/1:30,0,229:18:0:27 0/0:0,78,255:26:0:81 0/0:0,93,255:31:0:96 1 79772 . C G 999 PASS DP=138;VDB=0.0342;AF1=0.25;AC1=2;DP4=68,47,11,9;MQ=30;FQ=999;PV4=0.81,1,0.41,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:132,0,127:29:6:99 0/1:84,0,219:27:2:82 0/0:0,117,255:39:0:99 0/0:0,120,255:40:0:99 1 82115 . A G 51.1 MinMQ DP=137;VDB=0.0291;AF1=0.1264;AC1=1;DP4=71,55,3,7;MQ=27;FQ=51.1;PV4=0.19,0.085,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:57,0,204:36:6:52 0/0:0,14,206:20:0:19 0/0:0,123,255:41:0:99 0/0:0,117,255:39:0:99 1 82133 . 
CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAA 81.9 PASS INDEL;DP=107;VDB=0.0354;AF1=1;AC1=8;DP4=0,0,5,17;MQ=37;FQ=-22.7;AN=8;AC=7,1 GT:PL:DP:SP:GQ 1/2:83,73,58,31,0,18:9:0:29 1/1:10,3,0,10,3,10:1:0:17 1/1:32,15,0,32,15,32:7:0:29 1/1:32,15,0,32,15,32:5:0:29 1 82303 . T C 21 PASS DP=111;VDB=0.0241;AF1=0.1243;AC1=1;DP4=47,50,3,8;MQ=38;FQ=21;PV4=0.22,1,1.5e-14,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:27,0,255:20:5:22 0/0:0,24,255:21:0:29 0/0:0,96,255:32:0:99 0/0:0,75,255:35:0:80 1 82456 . A G 49.6 MinMQ DP=151;VDB=0.0367;AF1=0.2495;AC1=2;DP4=77,55,15,0;MQ=20;FQ=49.6;PV4=0.0011,0.29,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:31,0,124:31:8:29 0/1:27,0,107:23:9:25 0/0:0,151,251:50:0:99 0/0:0,129,255:43:0:99 1 82676 . T G 999 PASS DP=152;VDB=0.0213;AF1=0.25;AC1=2;DP4=70,59,9,11;MQ=34;FQ=999;PV4=0.48,0.37,0.0004,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:124,0,255:34:0:99 0/1:89,0,255:31:6:87 0/0:0,138,255:46:0:99 0/0:0,114,255:38:0:99 1 83084 . T A 999 PASS DP=84;VDB=0.0128;AF1=1;AC1=8;DP4=0,0,38,37;MQ=37;FQ=-33;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,48,0:16:0:87 1/1:203,27,0:9:0:66 1/1:255,72,0:24:0:99 1/1:255,78,0:26:0:99 1 83514 . C T 88.2 MinMQ DP=139;VDB=0.0336;AF1=0.4441;AC1=4;DP4=54,53,22,8;MQ=14;FQ=87.9;PV4=0.037,0.05,1,0.35;AN=8;AC=4 GT:PL:DP:SP:GQ 1/1:30,5,0:38:2:4 0/0:0,30,24:20:6:24 0/1:17,0,23:36:4:18 0/1:51,0,62:43:8:53 1 83786 . TAAAAAAAA TAAAAAAAAAAA 134 PASS INDEL;DP=144;VDB=0.0396;AF1=0.2505;AC1=2;DP4=43,54,4,9;MQ=40;FQ=137;PV4=0.39,1,2.7e-06,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:47,0,67:25:0:45 0/1:113,0,16:11:0:24 0/0:0,84,98:29:0:86 0/0:0,135,116:45:0:99 1 83829 . GAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA GAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA 999 PASS INDEL;DP=131;VDB=0.0388;AF1=0.5;AC1=4;DP4=20,14,15,17;MQ=39;FQ=999;PV4=0.46,1,7e-05,1;AN=8;AC=4 GT:PL:DP:SP:GQ 1/1:255,39,0:13:0:36 0/1:36,0,124:6:0:39 0/1:255,0,203:27:2:99 0/0:0,60,255:20:0:57 1 83895 . 
GAGAAAGAAAGAAAGAAAGA GAGAAAGAAAGAAAGA 999 PASS INDEL;DP=151;VDB=0.0325;AF1=0.75;AC1=6;DP4=18,19,38,34;MQ=44;FQ=999;PV4=0.69,0.00045,0.12,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:255,60,0:20:0:62 1/1:255,48,0:16:0:50 0/1:255,0,255:37:1:99 0/1:255,0,255:36:3:99 1 84010 . G A 37 PASS DP=190;VDB=0.0033;AF1=0.125;AC1=1;DP4=85,71,6,3;MQ=38;FQ=37;PV4=0.73,0.2,1,0.14;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,126,255:42:0:99 0/0:0,81,255:27:0:86 0/1:43,0,255:38:2:38 0/0:0,64,255:58:2:69 1 84014 . G A 38 PASS DP=192;VDB=0.0055;AF1=0.125;AC1=1;DP4=89,67,3,4;MQ=39;FQ=38;PV4=0.47,0.21,1,0.021;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:44,0,255:42:0:39 0/0:0,84,255:28:0:89 0/0:0,72,255:36:0:77 0/0:0,172,255:57:0:99 1 84018 . G A 79.6 PASS DP=188;VDB=0.0107;AF1=0.2497;AC1=2;DP4=77,64,8,3;MQ=41;FQ=79.6;PV4=0.35,0.17,1,0.0032;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,120,255:40:0:99 0/1:28,0,255:24:0:26 0/0:0,108,255:36:0:99 0/1:60,0,255:52:6:58 1 84244 . A C 999 PASS DP=213;VDB=0.0276;AF1=0.25;AC1=2;DP4=83,93,14,22;MQ=41;FQ=999;PV4=0.46,0.29,0.019,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:47:6:99 0/1:208,0,255:49:1:99 0/0:0,178,255:59:0:99 0/0:0,172,255:57:0:99 1 85597 . A C 999 PASS DP=139;VDB=0.0342;AF1=0.25;AC1=2;DP4=47,60,16,14;MQ=30;FQ=999;PV4=0.41,0.057,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:198,0,135:26:0:99 0/1:184,0,221:35:5:99 0/0:0,114,255:38:0:99 0/0:0,114,255:38:0:99 1 86018 . C G 999 PASS DP=181;VDB=0.0399;AF1=0.25;AC1=2;DP4=70,69,13,26;MQ=42;FQ=999;PV4=0.07,1,7e-14,0.12;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:200,0,255:36:11:99 0/1:240,0,255:46:2:99 0/0:0,129,255:43:0:99 0/0:0,160,255:53:0:99 1 86303 . G T 999 PASS DP=182;VDB=0.0329;AF1=0.25;AC1=2;DP4=76,66,17,22;MQ=40;FQ=999;PV4=0.28,1,3.7e-11,0.31;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:44:3:99 0/1:207,0,255:31:1:99 0/0:0,138,255:46:0:99 0/0:0,181,255:60:0:99 1 86331 . 
A G 999 PASS DP=187;VDB=0.0331;AF1=0.25;AC1=2;DP4=69,74,18,23;MQ=40;FQ=999;PV4=0.72,1,2.4e-05,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:51:1:99 0/1:216,0,255:34:0:99 0/0:0,120,255:40:0:99 0/0:0,178,255:59:0:99 1 86656 . G T 36 MinMQ DP=148;VDB=0.0132;AF1=0.125;AC1=1;DP4=76,63,0,9;MQ=28;FQ=36;PV4=0.0012,1,0.028,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:42,0,220:30:8:37 0/0:0,39,255:24:6:44 0/0:0,126,255:42:0:99 0/0:0,157,255:52:0:99 1 89677 . A G 195 MinMQ DP=136;VDB=0.0371;AF1=0.25;AC1=2;DP4=46,51,14,17;MQ=23;FQ=195;PV4=0.84,0.28,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,72,186:24:0:74 0/0:0,63,181:21:0:65 0/1:130,0,145:39:1:99 0/1:73,0,155:44:1:71 1 91072 . A G 200 MinMQ DP=88;VDB=0.0383;AF1=0.25;AC1=2;DP4=44,22,11,10;MQ=25;FQ=200;PV4=0.3,1,1,0.41;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,42,166:14:0:44 0/0:0,33,123:11:0:35 0/1:114,0,203:38:10:99 0/1:94,0,104:24:2:92 1 91075 . T C 187 MinMQ DP=85;VDB=0.0399;AF1=0.25;AC1=2;DP4=41,21,11,9;MQ=26;FQ=187;PV4=0.43,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,42,148:14:0:44 0/0:0,33,136:11:0:35 0/1:100,0,186:36:8:98 0/1:95,0,112:21:5:93 1 91336 . A T 999 MinMQ DP=69;VDB=0.0304;AF1=0.7419;AC1=6;DP4=11,19,1,37;MQ=20;FQ=216;PV4=0.0003,1,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:51,10,0:11:3:12 1/1:104,39,0:13:0:41 0/1:54,0,110:17:6:62 0/1:61,0,114:27:13:69 1 98929 . A G 28 MinMQ DP=172;VDB=0.0198;AF1=0.1249;AC1=1;DP4=63,78,12,9;MQ=21;FQ=28;PV4=0.35,0.23,0.15,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,90,234:30:0:95 0/1:34,0,191:43:3:29 0/0:0,114,245:38:0:99 0/0:0,28,150:51:6:33 1 98999 . TTTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTT TTTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTT 999 PASS INDEL;DP=236;VDB=0.0395;AF1=0.5;G3=1.982e-19,1,1.257e-17;HWE=0.0185;AC1=4;DP4=33,36,32,46;MQ=35;FQ=999;PV4=0.51,1,0.0067,0.38;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:112,0,192:30:0:99 0/1:255,0,77:31:4:80 0/1:86,0,191:32:2:89 0/1:255,0,100:54:12:99 1 101686 . 
A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:95,0,255:58:0:90 0/0:0,72,255:24:0:77 0/0:0,101,255:102:0:99 0/0:0,255,255:111:0:99 X 1 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 0:0,255:11 X 2 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 X 3 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/0:0,95,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 vcftools-0.1.15/examples/fill-an-ac.out000066400000000000000000000027641307140004000177050ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 AC=1;AN=2;DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS AC=1,0;AN=2;DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS AC=1;AN=2;DP=32 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 AC=2;AN=2;DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS AC=1;AN=2;DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 AC=1;AN=2;DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS AC=1;AN=2;DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 AC=2;AN=2;DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS AC=1;AN=2;DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 2 160 . 
TAAAA TA,TC,T 246 PASS AC=0,1,0;AN=2;DP=10 GT:GQ:DP 0/2:12:10 vcftools-0.1.15/examples/filters.txt000066400000000000000000000114451307140004000174560ustar00rootroot00000000000000# Examples of user-defined filters. Edit and run with -f filters.txt. # The examples below are self-explanatory. Notice the use of the predefined # variables ($PASS, $FAIL, $MATCH, $RECORD) and methods (error). # In this example, a minimum value of AF1=0.1 is required { tag => 'INFO/AF1', # The VCF tag to apply this filter on name => 'MinAF', # The filter ID desc => 'Minimum AF1 [0.01]', # Description for the VCF header test => sub { return $MATCH < 0.01 ? $FAIL : $PASS }, }, # Filter all indels (presence of INDEL tag is tested) { tag => 'INFO/INDEL', apply_to => 'indels', # Can be one of SNPs, indels, all. Default: [All] name => 'Indel', desc => 'INDEL tag present', test => sub { return $FAIL }, }, # Only loci with enough reads supporting the variant will pass the filter { tag => 'INFO/DP4', name => 'FewAlts', desc => 'Too few reads supporting the variant', apply_to => 'SNPs', test => sub { if ( !($MATCH =~ /^([^,]+),([^,]+),([^,]+),(.+)$/) ) { error("Could not parse INFO/DP4: $CHROM:$POS [$MATCH]"); } if ( 0.1*($1+$2) > $3+$4 ) { return $PASS; } return $FAIL; }, }, # Example of filtering based on genotype columns and the QUAL column { tag => 'FORMAT/PL', name => 'NoHets', desc => 'Inbred homozygous mouse, no hets expected', apply_to => 'SNPs', test => sub { for my $pl (@$MATCH) { my @pls = split(/,/,$pl); if ( $pls[1]<$pls[0] && $pls[1]<$pls[2] ) { return $FAIL; } } return $PASS; }, }, # This example splits the four PV4 values into four tags names PV0, PV1, PV2 and PV3. # Note the use of the 'header' key, and the $RECORD and $VCF variables. 
{ header => [ qq[key=INFO,ID=PV0,Number=1,Type=Float,Description="P-value for strand bias"], qq[key=INFO,ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias"], qq[key=INFO,ID=PV2,Number=1,Type=Float,Description="P-value for mapQ bias"], qq[key=INFO,ID=PV3,Number=1,Type=Float,Description="P-value for tail distance bias"] ], tag => 'INFO/PV4', name => 'SplitPV4', desc => 'Split PV4', apply_to => 'all', test => sub { my @vals = split(/,/,$MATCH); $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'PV0'=>$vals[0],'PV1'=>$vals[1],'PV2'=>$vals[2],'PV3'=>$vals[3]); return $PASS; }, }, # Do whatever you want with every record and edit it according to your needs. This silly # example removes the tag SILLY in records where ID is set and depth is bigger than 5. { tag => 'Dummy', test => sub { if ( $$RECORD[2] eq '.' ) { return $PASS; } # Modify only lines with ID my $dp = $vcf->get_info_field($$RECORD[7],'DP'); if ( $dp>5 ) { $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'SILLY'=>undef); } return $PASS; }, } # Filter records with the value XY absent or not equal to 42 { tag => 'Dummy', header => [ qq[key=FILTER,ID=XY,Description="XY not OK"], ], test => sub { my $xy = $VCF->get_info_field($$RECORD[7],'XY'); my $is_bad = ( !defined $xy or $xy!=42 ) ? 1 : 0; $$RECORD[6] = $VCF->add_filter($$RECORD[6],'XY'=>$is_bad); return $PASS; }, }, # Annotate INFO field with SINGLETON flag when one and only one sample is different from the reference { header => [ qq[key=INFO,ID=SINGLETON,Number=0,Type=Flag,Description="Only one non-ref sample"], ], tag => 'FORMAT/GT', name => 'Dummy', desc => 'Dummy', test => sub { my $nalt = 0; for my $gt (@$MATCH) { my @gt = $VCF->split_gt($gt); for my $allele (@gt) { if ( $allele ne 0 && $allele ne '.' ) { $nalt++; last; } } if ( $nalt>1 ) { last; } } if ( $nalt==1 ) { $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'SINGLETON'=>''); } return $PASS; }, }, # Set genotypes to unknown ("." or "./." 
depending on ploidy) when coverage is low (by Shane McCarthy). { tag => 'FORMAT/DP', name => 'MinSampleDP', desc => 'Genotypes set to . for samples with DP < 2', apply_to => 'all', test => sub { my $i = 8; for my $dp (@$MATCH) { $i++; next unless ($dp<2); my @format = split(/:/,$$RECORD[$i]); $format[0] = $format[0] =~ /\// ? "./." : "."; $$RECORD[$i] = join(":",@format); } return $PASS; }, }, vcftools-0.1.15/examples/fix-ploidy.out000066400000000000000000000036041307140004000200600ustar00rootroot0000000000000061098 M1 0 0,9,72,5,6,7 M2 0 0,15,140,5,6,7 F3 1/1 147,0,5 F4 0/0 0,131,5 M5 0 0,9,83,5,6,7 M6 0 0,6,56,5,6,7 61270 M1 0 8,14,58 M2 0 0,6,52 F3 0/0 0,6,56 F4 0/0 0,15,117 M5 0 0,6,45 M6 0 0,12,87 61275 M1 0 0,3,13 M2 0 0,3,28 F3 0/0 8,0,41 F4 0/0 0,12,97 M5 0 0,6,49 M6 0 0,9,67 61282 M1 0/1 15,3,0 M2 0/0 0,6,51 F3 0 6,0,31 F4 0 0,6,57 M5 0/1 7,0,19 M6 0/1 16,0,20 61795 M1 0/0 0,27,203 M2 0/0 0,21,174 F3 0 0,45,229 F4 0 0,27,199 M5 0/0 0,24,182 M6 0/0 0,9,85 62731 M1 0/0 0,27,194 M2 0/0 0,24,194 F3 0 0,18,141 F4 0 0,30,201 M5 0/0 0,18,153 M6 0/0 0,33,202 63008 M1 0/0 0,42,255 M2 0/0 0,15,128 F3 0 0,15,136 F4 0 0,39,251 M5 0/0 0,15,111 M6 0/0 0,27,200 63231 M1 0/0 0,42,246 M2 0/0 0,18,141 F3 0 0,27,209 F4 0 0,24,186 M5 0/0 0,12,110 M6 0/0 0,21,145 63244 M1 0 0,36,209 M2 0 0,21,174 F3 . 0,27,198 F4 . 0,24,184 M5 0 0,15,132 M6 0 0,21,159 63328 M1 0 0,42,242 M2 0 0,12,110 F3 . 0,36,231 F4 . 0,36,226 M5 0 0,15,135 M6 0 0,18,132 63452 M1 0 0,27,200 M2 0 0,15,123 F3 . 0,33,228 F4 . 0,9,88 M5 0 0,24,171 M6 0 0,15,134 63799 M1 0 0,36,211 M2 0 0,27,205 F3 . 0,18,125 F4 . 0,15,125 M5 0 1,0,150 M6 0 0,12,106 63967 M1 0 0,30,183 M2 0 0,30,206 F3 . 0,30,206 F4 . 0,33,230 M5 0 0,21,160 M6 0 0,12,112 65288 M1 . 0,18,155 M2 . 0,12,113 F3 0 0,18,144 F4 0 0,21,155 M5 . 0,21,176 M6 . 0,6,63 65900 M1 . 162,24,0 M2 . 160,21,0 F3 1 219,30,0 F4 1 213,30,0 M5 . 248,42,0 M6 . 148,21,0 65951 M1 . 0,18,125 M2 . 0,15,132 F3 0 0,24,183 F4 0 0,45,252 M5 . 0,30,217 M6 . 0,15,101 66370 M1 . 
255,57,0 M2 . 193,24,0 F3 1 97,12,0 F4 1 208,30,0 M5 . 129,15,0 M6 . 72,9,0 67184 M1 . 0,33,202 M2 . 0,6,57 F3 0 0,42,223 F4 0 0,21,142 M5 . 0,30,181 M6 . 0,33,201 67760 M1 . 0,42,224 M2 . 0,9,77 F3 0 0,45,243 F4 0 0,21,147 M5 . 0,33,205 M6 . 0,12,102 68303 M1 . 0,33,205 M2 . 0,33,236 F3 0 0,33,214 F4 0 0,27,197 M5 . 0,18,149 M6 . 0,9,72 68618 M1 0/0 0,24,176 M2 0/0 0,30,214 F3 0/0 0,21,159 F4 0/0 0,24,191 M5 0/0 0,15,133 M6 0/0 0,18,133 vcftools-0.1.15/examples/fix-ploidy.samples000066400000000000000000000000521307140004000207070ustar00rootroot00000000000000M1 M M2 M F3 F F4 F M5 M M6 M vcftools-0.1.15/examples/fix-ploidy.txt000066400000000000000000000003371307140004000200700ustar00rootroot00000000000000ploidy => { 20 => [ { from=>1, to=>61275, M=>1, F=>2 }, { from=>61282, to=>63231, F=>1 }, { from=>63244, to=>63967, M=>1, F=>0 }, { from=>65288, to=>68303, M=>0, F=>1 }, ], } vcftools-0.1.15/examples/fix-ploidy.vcf000066400000000000000000000214551307140004000200330ustar00rootroot00000000000000##fileformat=VCFv4.1 ##samtoolsVersion=0.1.16-dev (r969:252) ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##source_20111007.1=/software/vertres/codebase/scripts/vcf-annotate -f +/D=1200 ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##source_20111007.2=/software/vertres/codebase/scripts/vcf-annotate -f +/D=1200 ##source_20120109.1=vcf-subset(r660) -c QTL190284,QTL190301,QTL190321,QTL190576,QTL190627,QTL190628 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT M1 M2 F3 F4 M5 M6 20 61098 . C A,T 999 PASS AC1=41;AF1=0.2104;DP4=209,284,67,76;DP=658;FQ=999;MQ=45;PV4=0.39,4.4e-10,0.0034,0.2 GT:PL:DP:SP:GQ 0/1:0,9,72,5,6,7:3:212:12 0/0:0,15,140,5,6,7:5:458752:18 1:147,0,5:7:384:24 0:0,131,5:5:208:18 0/0:0,9,83,5,6,7:3:392:12 0/0:0,6,56,5,6,7:2:204:9 20 61270 . 
A T 93 PASS AC1=5;AF1=0.02733;DP4=149,185,5,10;DP=398;FQ=93.1;MQ=43;PV4=0.44,3.5e-05,0.028,1 GT:PL:DP:SP:GQ 0/0:8,14,58:3:0:19 0/0:0,6,52:2:0:19 0/0:0,6,56:2:0:19 0/0:0,15,117:5:0:28 0/0:0,6,45:2:0:19 0/0:0,12,87:4:0:25 20 61275 . T G 14.5 PASS AC1=14;AF1=0.07104;DP4=76,141,1,24;DP=345;FQ=14.8;MQ=43;PV4=0.001,4e-16,2.4e-07,0.39 GT:PL:DP:SP:GQ 0/0:0,3,13:1:16908804:12 0/0:0,3,28:1:0:12 0/0:8,0,41:3:201985031:3 0/0:0,12,97:4:134480904:21 0/0:0,6,49:2:117901063:15 0/0:0,9,67:3:33686020:18 20 61282 . T C 120 PASS AC1=35;AF1=0.1794;DP4=50,118,1,30;DP=333;FQ=121;G3=0.6049,0.3951,2.481e-10;HWE=0.0152;MQ=44;PV4=0.0012,1.1e-08,5.7e-11,1 GT:PL:DP:SP:GQ 0/1:15,3,0:1:17302017:6 0/0:0,6,51:2:0:10 0/1:6,0,31:3:437393682:4 0/0:0,6,57:2:134283524:10 0/1:7,0,19:2:404167697:5 0/1:16,0,20:2:67633410:13 20 61795 . G A 999 PASS AC1=63;AF1=0.3239;DP4=193,313,105,127;DP=753;FQ=999;MQ=48;PV4=0.075,4e-23,2.6e-26,1 GT:PL:DP:SP:GQ 0/0:0,27,203:9:-1598497894:27 0/0:0,21,174:7:50331648:21 0/0:0,45,229:15:0:45 0/0:0,27,199:9:1056977028:27 0/0:0,24,182:8:0:24 0/0:0,9,85:3:1678500058:10 20 62731 . C A 999 PASS AC1=24;AF1=0.1207;DP4=326,391,40,53;DP=846;FQ=999;MQ=49;PV4=0.74,6.3e-14,5.2e-07,0.44 GT:PL:DP:SP:GQ 0/0:0,27,194:9:134349316:33 0/0:0,24,194:8:0:30 0/0:0,18,141:6:505290270:24 0/0:0,30,201:10:16908801:36 0/0:0,18,153:6:505290270:24 0/0:0,33,202:11:67175432:39 20 63008 . C A 122 PASS AC1=2;AF1=0.01078;DP4=303,374,3,7;DP=692;FQ=122;MQ=49;PV4=0.52,0.0011,0.093,1 GT:PL:DP:SP:GQ 0/0:0,42,255:14:67371265:59 0/0:0,15,128:5:0:32 0/0:0,15,136:5:505290270:32 0/0:0,39,251:13:67634177:56 0/0:0,15,111:5:505290270:32 0/0:0,27,200:9:33818632:44 20 63231 . T A 999 PASS AC1=5;AF1=0.02753;DP4=289,324,7,18;DP=652;FQ=999;MQ=49;PV4=0.067,8.8e-11,0.004,0.49 GT:PL:DP:SP:GQ 0/0:0,42,246:14:0:54 0/0:0,18,141:6:0:30 0/0:0,27,209:9:0:39 0/0:0,24,186:8:0:36 0/0:0,12,110:4:0:24 0/0:0,21,145:7:0:33 20 63244 . 
A C 999 PASS AC1=37;AF1=0.1905;DP4=273,269,66,56;DP=670;FQ=999;MQ=49;PV4=0.48,4.1e-21,2.5e-21,1 GT:PL:DP:SP:GQ 0/0:0,36,209:12:858855379:39 0/0:0,21,174:7:7:24 0/0:0,27,198:9:0:30 0/0:0,24,184:8:1055941246:27 0/0:0,15,132:5:0:18 0/0:0,21,159:7:203312940:24 20 63328 . A C 26.4 PASS AC1=1;AF1=0.006786;DP4=439,259,2,3;DP=711;FQ=26.4;MQ=49;PV4=0.37,0.00092,0.2,0.31 GT:PL:DP:SP:GQ 0/0:0,42,242:14:0:61 0/0:0,12,110:4:0:31 0/0:0,36,231:12:0:55 0/0:0,36,226:12:0:55 0/0:0,15,135:5:0:34 0/0:0,18,132:6:0:37 20 63452 . C A 86.5 PASS AC1=4;AF1=0.02128;DP4=399,301,5,6;DP=718;FQ=86.6;MQ=49;PV4=0.54,0.084,0.0093,0.49 GT:PL:DP:SP:GQ 0/0:0,27,200:9:-419321220:41 0/0:0,15,123:5:0:29 0/0:0,33,228:11:0:47 0/0:0,9,88:3:1060858040:23 0/0:0,24,171:8:0:38 0/0:0,15,134:5:1094576049:29 20 63799 . C T 999 PASS AC1=68;AF1=0.347;DP4=215,280,110,139;DP=796;FQ=999;MQ=49;PV4=0.88,7.1e-84,0.16,1 GT:PL:DP:SP:GQ 0/0:0,36,211:12:212:36 0/0:0,27,205:9:50331648:27 0/0:0,18,125:6:384:18 0/0:0,15,125:5:208:15 0/1:1,0,150:8:392:4 0/0:0,12,106:5:204:12 20 63967 . A G 999 PASS AC1=5;AF1=0.02598;DP4=384,427,9,10;DP=833;FQ=999;MQ=49;PV4=1,0.0053,1.9e-05,0.043 GT:PL:DP:SP:GQ 0/0:0,30,183:10:0:43 0/0:0,30,206:10:0:43 0/0:0,30,206:10:0:43 0/0:0,33,230:11:0:46 0/0:0,21,160:7:0:34 0/0:0,12,112:4:0:25 20 65288 . G C 999 PASS AC1=23;AF1=0.1172;DP4=217,304,31,52;DP=612;FQ=999;MQ=49;PV4=0.47,1.3e-40,0.001,1 GT:PL:DP:SP:GQ 0/0:0,18,155:6:212:24 0/0:0,12,113:4:0:18 0/0:0,18,144:6:384:24 0/0:0,21,155:7:208:27 0/0:0,21,176:7:392:27 0/0:0,6,63:2:204:12 20 65900 . G A 999 PASS AC1=156;AF1=0.7977;DP4=98,72,334,335;DP=857;FQ=999;MQ=46;PV4=0.086,1,2.3e-21,1 GT:PL:DP:SP:GQ 1/1:162,24,0:8:-1210247533:27 1/1:160,21,0:7:458754:24 1/1:219,30,0:10:0:33 1/1:213,30,0:10:1056438877:33 1/1:248,42,0:14:0:45 1/1:148,21,0:7:-1125025851:24 20 65951 . 
T A 142 PASS AC1=4;AF1=0.02096;DP4=349,437,7,6;DP=818;FQ=142;MQ=48;PV4=0.58,0.0014,0.18,1 GT:PL:DP:SP:GQ 0/0:0,18,125:6:1040080185:32 0/0:0,15,132:5:0:29 0/0:0,24,183:8:0:38 0/0:0,45,252:15:1063282696:59 0/0:0,30,217:10:0:44 0/0:0,15,101:5:1099561607:29 20 66370 . G A 999 PASS AC1=160;AF1=0.8125;DP4=76,68,292,297;DP=774;FQ=999;MQ=46;PV4=0.52,1,5.3e-34,1 GT:PL:DP:SP:GQ 1/1:255,57,0:19:0:60 1/1:193,24,0:8:131075:27 1/1:97,12,0:4:0:15 1/1:208,30,0:10:0:33 1/1:129,15,0:5:0:18 1/1:72,9,0:3:0:13 20 67184 . C A 110 PASS AC1=2;AF1=0.01039;DP4=366,477,2,10;DP=863;FQ=111;MQ=48;PV4=0.08,1,0.12,0.24 GT:PL:DP:SP:GQ 0/0:0,33,202:11:-2137326772:50 0/0:0,6,57:2:0:23 0/0:0,42,223:14:0:59 0/0:0,21,142:7:1069487242:38 0/0:0,30,181:10:0:47 0/0:0,33,201:11:1190753285:50 20 67760 . C A 16.9 PASS AC1=2;AF1=0.008593;DP4=412,444,1,5;DP=870;FQ=16.9;MQ=47;PV4=0.22,1,0.0035,1 GT:PL:DP:SP:GQ 0/0:0,42,224:14:-1076013622:60 0/0:0,9,77:3:0:27 0/0:0,45,243:15:0:63 0/0:0,21,147:7:1070943924:39 0/0:0,33,205:11:0:51 0/0:0,12,102:4:783554555:30 20 68303 . A G 65.3 PASS AC1=1;AF1=0.005274;DP4=480,351,4,5;DP=874;FQ=65.3;MQ=49;PV4=0.51,9.5e-09,0.34,1 GT:PL:DP:SP:GQ 0/0:0,33,205:11:-1062059871:53 0/0:0,33,236:11:0:53 0/0:0,33,214:11:0:53 0/0:0,27,197:9:1072683922:47 0/0:0,18,149:6:0:38 0/0:0,9,72:3:1109840439:29 20 68618 . G C 62.6 PASS AC1=2;AF1=0.01065;DP4=409,382,5,5;DP=833;FQ=62.6;MQ=49;PV4=1,1.9e-07,0.00018,0.4 GT:PL:DP:SP:GQ 0/0:0,24,176:9:1561273746:41 0/0:0,30,214:10:0:47 0/0:0,21,159:7:0:38 0/0:0,24,191:8:1056871757:41 0/0:0,15,133:5:0:32 0/0:0,18,133:6:253656329:35 vcftools-0.1.15/examples/floats.vcf000066400000000000000000000010641307140004000172310ustar00rootroot00000000000000##fileformat=VCFv4.1 ##INFO= ##FORMAT= ##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 19 14370 . G A 29 PASS FLOATTAG=0.0001 GT 0|0 19 14371 . G A 29 PASS FLOATTAG=1e-4 GT 0|0 19 14372 . 
G A 29 PASS FLOATTAG=1E-4 GT 0|0 19 14373 . G A 29 PASS FLOATTAG=1e4 GT 0|0 vcftools-0.1.15/examples/indel-stats.out000066400000000000000000000000601307140004000202140ustar00rootroot00000000000000total 20 in-frame 9 frameshift 5 ratio 0.357143 vcftools-0.1.15/examples/indel-stats.tab000066400000000000000000000000301307140004000201500ustar00rootroot000000000000001 20 30 1 40 50 1 60 80 vcftools-0.1.15/examples/indel-stats.vcf000066400000000000000000000010131307140004000201620ustar00rootroot00000000000000##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO 1 15 . ACGT A . PASS . 1 15 . A ACGT . PASS . 1 18 . ACGT A . PASS . 1 18 . A ACGT . PASS . 1 25 . ACGT A . PASS . 1 25 . A ACGT . PASS . 1 27 . ACGTA A . PASS . 1 27 . A ACGT . PASS . 1 29 . ACGT A . PASS . 1 29 . A ACGT . PASS . 1 35 . ACGT A . PASS . 1 35 . A ACGT . PASS . 1 38 . ACGT A . PASS . 1 38 . A ACGT . PASS . 1 45 . ACGT A . PASS . 1 45 . A ACGT . PASS . 1 47 . ACGTA A . PASS . 1 47 . A AACGT . PASS . 1 49 . ACGT A . PASS . 1 49 . A ACGT . PASS . vcftools-0.1.15/examples/invalid-4.0.vcf000066400000000000000000000040461307140004000176710ustar00rootroot00000000000000##fileformat=VCFv4.0 ##problem1=The first base of the second ALT allele at 20:1234567 does not match the reference. ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=1000GenomesPilot-NCBI36 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 20 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:-1:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 20 1234567 microsat1 G GA,AAC 50 0 NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:-1:4 0/2:17:2 1/1:40:3 20 1235237 . T . -1 . . GT 0\0 0|0 ./. X 10 rsTest AC A,ATG 10 . . GT 0 0/1 0|2 X 11 rsTest2 T A,G 10 q10;s50 . GT:DP:GQ 0:3:10 .:5:20 0:3:10 vcftools-0.1.15/examples/isec-n2-test.vcf.out000066400000000000000000000021401307140004000207600ustar00rootroot00000000000000Warning: The column names do not match (e.g. B): B A ##fileformat=VCFv4.0 ##FILTER= ##FORMAT= ##FILTER= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 3062915 . GTTT G 1806 q10 DP=35;DP4=1,2,3,4;SF=0f,1f,2 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 1 3106154 . CAAA C 1792 PASS DP=32;SF=0,1,2 GT:GQ:DP 0/1:245:32 1 3157410 . GA G 628 q10 DP=21;SF=0f,1,2 GT:GQ:DP 1/1:21:21 1 3162006 . GAA G 1016 PASS DP=22;SF=0,1,2 GT:GQ:DP 0/1:212:22 1 3177144 . GT G 727 PASS DP=30;SF=0,1,2 GT:GQ:DP 0/1:150:30 1 3184885 . TAAAA TA,T 246 PASS DP=10;SF=0,1 GT:GQ:DP 1/2:12:10 3 3199815 . G A 353 PASS DP=19;SF=1,2 GT:GQ:DP 0/1:188:19 vcftools-0.1.15/examples/merge-test-a.vcf000066400000000000000000000017051307140004000202350ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 3062915 . GTTT G 1806 q10 DP=35;DP4=1,2,3,4 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 1 3106154 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 3157410 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 3162006 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 3177144 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 3184885 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 3199812 . G GTT,GT 481 PASS DP=26 GT:GQ:DP 1/2:322:26 3 3212016 . 
CTT C,CT 565 PASS DP=26 GT:GQ:DP 1/2:91:26 4 3258448 . TACACACAC T 325 PASS DP=31 GT:GQ:DP 0/1:325:31 vcftools-0.1.15/examples/merge-test-b.vcf000066400000000000000000000017741307140004000202440ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B 1 3062915 . GTTT GT 376 q20 DP=14;DP4=1,2,3,4 GT:GQ:DP:GL 0/1:376:14:-10,0,-10 1 3106154 . CAAAA C 677 PASS DP=15 GT:GQ:DP:GL 0/1:277:15:-10,0,-10 1 3157410 . GA G 249 PASS DP=11 GT:GQ:DP 0/1:49:11 1 3162006 . GAA G 663 PASS DP=19 GT:GQ:DP 0/1:589:19 1 3177144 . GT G 460 PASS DP=24 GT:GQ:DP 0/1:236:24 1 3184885 . TAAA T 598 PASS DP=16 GT:GQ:DP 0/1:435:16 2 3188209 . GA G 162 . DP=15 GT:GQ:DP 0/1:162:15 3 3199812 . G GTT,GT 353 PASS DP=19 GT:GQ:DP 1/2:188:19 3 3199815 . G A 353 PASS DP=19 GT:GQ:DP 0/1:188:19 4 3212016 . CTT C 677 q20 DP=15 GT:GQ:DP 0/1:158:15 vcftools-0.1.15/examples/merge-test-c.vcf000066400000000000000000000014361307140004000202400ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C 1 3062915 . GTTT G 388 . DP=10;DP4=1,2,3,4 GT:GQ:DP 0/1:149:10 1 3106154 . CAAA C 269 . DP=9 GT:GQ:DP 1/1:25:9 1 3157410 . GA G 212 . DP=10 GT:GQ:DP 0/1:52:10 1 3162006 . GAA G 558 . DP=17 GT:GQ:DP 0/1:163:17 1 3177144 . GT G 211 . DP=14 GT:GQ:DP 0/1:151:14 1 3199812 . G GT . . . GT 1/1 2 3212016 . CTT C 613 . DP=11 GT:GQ:DP 0/1:41:11 3 3199815 . G T 353 PASS DP=19 GT:GQ:DP 0/1:188:19 3 3242491 . TT T . . . GT 1/1 4 3291771 . T TAA,TAAA 336 . DP=12 GT:GQ:DP 1/2:2:12 vcftools-0.1.15/examples/merge-test.vcf.out000066400000000000000000000043001307140004000206170ustar00rootroot00000000000000##fileformat=VCFv4.2 ##FILTER= ##FORMAT= ##FILTER= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C 1 3062915 . 
GTTT G,GT 856.67 q20;q10 AC=2,1;AN=6;DP4=3,6,9,12;DP=59;SF=0f,1f,2 GT:GL:DP:GQ 0/1:-20,-5,-20,.,.,.:35:409 0/2:-10,.,.,0,.,-10:14:376 0/1:.:10:149 1 3106154 . CAAAA CA,C 912.67 PASS AC=3,1;AN=6;DP=56;SF=0,1,2 GT:GL:DP:GQ 0/1:.:32:245 0/2:-10,.,.,0,.,-10:15:277 1/1:.:9:25 1 3157410 . GA G 363.00 q10 AC=4;AN=6;DP=42;SF=0f,1,2 GT:DP:GQ 1/1:21:21 0/1:11:49 0/1:10:52 1 3162006 . GAA G 745.67 PASS AC=3;AN=6;DP=58;SF=0,1,2 GT:DP:GQ 0/1:22:212 0/1:19:589 0/1:17:163 1 3177144 . GT G 466.00 PASS AC=3;AN=6;DP=68;SF=0,1,2 GT:DP:GQ 0/1:30:150 0/1:24:236 0/1:14:151 1 3184885 . TAAAA TA,T 422.00 PASS AC=2,1;AN=4;DP=26;SF=0,1 GT:DP:GQ 1/2:10:12 0/1:16:435 . 1 3199812 . G GT . . AC=2;AN=2;SF=2 GT . . 1/1 2 3188209 . GA G 162.00 . AC=1;AN=2;DP=15;SF=1 GT:DP:GQ . 0/1:15:162 . 2 3199812 . G GTT,GT 481.00 PASS AC=1,1;AN=2;DP=26;SF=0 GT:DP:GQ 1/2:26:322 . . 2 3212016 . CTT C 613.00 . AC=1;AN=2;DP=11;SF=2 GT:DP:GQ . . 0/1:11:41 3 3199812 . G GTT,GT 353.00 PASS AC=1,1;AN=2;DP=19;SF=1 GT:DP:GQ . 1/2:19:188 . 3 3199815 . G T,A 353.00 PASS AC=1,1;AN=4;DP=38;SF=1,2 GT:DP:GQ . 0/2:19:188 0/1:19:188 3 3212016 . CTT C,CT 565.00 PASS AC=1,1;AN=2;DP=26;SF=0 GT:DP:GQ 1/2:26:91 . . 3 3242491 . TT T . . AC=2;AN=2;SF=2 GT . . 1/1 4 3212016 . CTT C 677.00 q20 AC=1;AN=2;DP=15;SF=1f GT:DP:GQ . 0/1:15:158 . 4 3258448 . TACACACAC T 325.00 PASS AC=1;AN=2;DP=31;SF=0 GT:DP:GQ 0/1:31:325 . . 4 3291771 . T TAA,TAAA 336.00 . AC=1,1;AN=2;DP=12;SF=2 GT:DP:GQ . . 1/2:12:2 vcftools-0.1.15/examples/parse-test.vcf000066400000000000000000000003461307140004000200320ustar00rootroot00000000000000##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT g 1806 PASS DP=35 1 104 . C g,,t, 1792 PASS DP=32 1 104 . C ,g,,t 1792 PASS DP=32 1 104 . 
C ,g,,t, 1792 PASS DP=32 vcftools-0.1.15/examples/perl-api-1.pl000066400000000000000000000026371307140004000174540ustar00rootroot00000000000000#!/usr/bin/env perl # # Example code for generating a minimal VCF file using the perl API # # Author: pd3@sanger # use strict; use warnings; use Carp; use Vcf; my $sample = 'Sample1'; my $vcf_out = Vcf->new(); $vcf_out->add_columns($sample); $vcf_out->add_header_line({key=>'FORMAT',ID=>'GT',Number=>'1',Type=>'String',Description=>"Genotype"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL',Description=>"Deletion"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL:ME:ALU',Description=>"Deletion of ALU element"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL:ME:L1',Description=>"Deletion of L1 element"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DUP',Description=>"Duplication"}); $vcf_out->add_header_line({key=>'INFO',ID=>'DP',Number=>1,Type=>'Integer',Description=>"Total Depth"}); $vcf_out->add_header_line({key=>'INFO',ID=>'H2',Number=>0,Type=>'Flag',Description=>"HapMap2 membership"}); print $vcf_out->format_header(); my $pos = 1; for my $gt qw(A/A C/C /C / / /) { $pos++; my %out; $out{CHROM} = '1'; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{REF} = 'C'; $out{QUAL} = '.'; $out{FILTER} = ['.']; $out{INFO} = { DP=>3, H2=>undef }; $out{FORMAT} = ['GT']; $out{gtypes}{$sample}{GT} = $gt; $vcf_out->format_genotype_strings(\%out); print $vcf_out->format_line(\%out); } vcftools-0.1.15/examples/query-test.out000066400000000000000000000003771307140004000201220ustar00rootroot000000000000001:100100 ref=G alt=C qual=0 1 A=G|C B=G/C 1:100200 ref=G alt=C qual=0 1 A=G|C B=G/G 1:100300 ref=G alt=C qual=0 1 A=C/C B=./. 
1:100400 ref=C alt=G,T qual=35 1 A=G/G B=C/T 1:100500 ref=A alt=G qual=0 1 A=G/G B=A/A 1:100600 ref=C alt=G qual=0 1 A=G/G B=C/C vcftools-0.1.15/examples/shuffle-test.vcf000066400000000000000000000011761307140004000203560ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A 1 100100 . G C 0 . DP=1 GT:GQ:DP 0/1:40:1 0|1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0/0:40:1 0|1:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP ./.:40:1 1/1:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 0/2:40:1 1/1:41:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 0/0:40:1 1/1:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 0/0:40:1 1/1:40:1 vcftools-0.1.15/examples/subset.SNPs.out000066400000000000000000000010121307140004000201120ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . G C 1806 q10 DP=35 GT:GQ:DP 1/1:409:35 1 120 . T G 628 q10 DP=21 GT:GQ:DP 0|1|1:21:21 1 130 . GAA GTA 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 vcftools-0.1.15/examples/subset.indels.out000066400000000000000000000016431307140004000205570ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 1/0:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 . 
TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools-0.1.15/examples/subset.vcf000066400000000000000000000023151307140004000172460ustar00rootroot00000000000000##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100 . G C 1806 q10 DP=35 GT:GQ:DP 1/1:409:35 1/1:409:35 1 110 . C . 1792 PASS DP=32 GT:GQ:DP 0/0:245:32 0/0:245:32 1 120 . T G 628 q10 DP=21 GT:GQ:DP 0|1|1:21:21 0|1|1:21:21 1 130 . GAA G,GTA 1016 PASS DP=22 GT:GQ:DP 1/2:212:22 1/2:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 2 160 . 
TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 vcftools-0.1.15/examples/valid-3.3.vcf000066400000000000000000000031141307140004000173370ustar00rootroot00000000000000##fileformat=VCFv3.3 ##fileDate=20090805 ##phasing=partial ##test meta ##INFO=NS,1,Integer,"Number of Samples With Data" ##INFO=DP,1,Integer,"Total Depth" ##INFO=AN,1,Integer,"Total number of alleles in called genotypes" ##INFO=AC,-1,Integer,"Allele count in genotypes, for each ALT allele, in the same order as listed" ##INFO=AF,-1,Float,"Allele Frequency" ##INFO=AA,1,String,"Ancestral Allele" ##INFO=DB,0,Flag,"dbSNP membership, build 129" ##INFO=H2,0,Flag,"HapMap2 membership" ##FILTER=q10,"Quality below 10" ##FILTER=s50,"Less than 50% of samples have data" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" ##FORMAT=HQ,2,Integer,"Haplotype Quality" ##FORMAT=DS,1,Float,"Alternative Allele Dosage" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:-1:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:-1:4 0/2:17:2 1/1:40:3 20 1235237 . T . -1 . . GT 0\0 0|0 ./. X 10 rsTest A T 10 . . GT:DS 0:0.1 0/1:0.5 0|1:0.5 X 11 rsTest2 T A,G 10 q10;s50 . 
GT:DP:GQ 0:3:10 .:5:20 0:3:10 vcftools-0.1.15/examples/valid-4.0.vcf000066400000000000000000000042061307140004000173400ustar00rootroot00000000000000##fileformat=VCFv4.0 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=1000GenomesPilot-NCBI36 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##ALT= ##ALT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 20 1235237 . T . . . . GT 0/0 0|0 ./. X 9 . A T 12.1 . . GT 0 0/1 1/0 X 10 rsTest AC A,ATG 10 PASS . GT 0 0/1 0|2 X 11 rsTest2 T A, 10 q10;s50 . GT:DP:GQ .:3:10 ./. 0|2:3 X 12 . T A 13 . . 
GT 0 1/0 1/1 vcftools-0.1.15/examples/valid-4.0.vcf.stats000066400000000000000000000120511307140004000204720ustar00rootroot00000000000000$VAR1 = { 'samples' => { 'NA00002' => { 'indel_count' => 2, 'indel' => { '2' => 1, '-1' => 1 }, 'het_RA_count' => 6, 'snp_count' => 5, 'count' => 11, 'hom_RR_count' => 4, 'ref' => 10, 'missing' => 1, 'private' => 1, 'phased' => 7, 'het_AA_count' => 1, 'snp' => { 'A>G' => 1, 'A>T' => 2, 'T>A' => 2, 'G>A' => 1 }, 'unphased' => 4, 'ref_count' => 10 }, 'NA00001' => { 'het_RA_count' => 1, 'indel_count' => 1, 'indel' => { '1' => 1 }, 'snp_count' => 1, 'count' => 11, 'hom_RR_count' => 9, 'ref' => 10, 'missing' => 1, 'phased' => 6, 'het_AA_count' => 1, 'snp' => { 'A>T' => 1, 'A>G' => 1 }, 'unphased' => 5, 'ref_count' => 10 }, 'NA00003' => { 'hom_AA_count' => 4, 'other_count' => 2, 'indel_count' => 1, 'indel' => { '1' => 1 }, 'het_RA_count' => 5, 'snp_count' => 6, 'count' => 11, 'hom_RR_count' => 2, 'ref' => 7, 'missing' => 1, 'private' => 3, 'phased' => 2, 'other' => 2, 'snp' => { 'A>C' => 1, 'A>T' => 2, 'A>G' => 1, 'T>A' => 1, 'G>A' => 1 }, 'ref_count' => 7, 'unphased' => 9 } }, 'all' => { 'other_count' => 2, 'indel_count' => 2, 'indel' => { '1' => 1, '2' => 1, '-1' => 1 }, 'snp_count' => 8, 'count' => 12, 'ref' => 2, 'nalt_0' => 2, 'nalt_1' => 7, 'shared' => { '1' => 4, '0' => 2, '3' => 2, '2' => 4 }, 'other' => 2, 'nalt_2' => 3, 'snp' => { 'A>C' => 1, 'A>T' => 2, 'A>G' => 2, 'T>A' => 3, 'G>A' => 1 }, 'ref_count' => 2 } }; vcftools-0.1.15/examples/valid-4.1.vcf000066400000000000000000000056671307140004000173550ustar00rootroot00000000000000##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta ##contig= ##contig= ##contig= ##SAMPLE= ##SAMPLE= ##PEDIGREE= ##PEDIGREE= ##pedigreeDB=url #CHROM POS 
ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 20 1234567 microsat1 GTC G,GTCTC 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 20 2234567 . C [13:123457[ACGC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 2234568 . C .TC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 2234569 . C CT. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 3234569 . C 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 4234569 . N .[13:123457[ 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 ./.:40:3 20 5234569 . N [13:123457[. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 Y 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GL 0:0,49 0:0,3 1:41,0 vcftools-0.1.15/src/000077500000000000000000000000001307140004000142115ustar00rootroot00000000000000vcftools-0.1.15/src/Makefile.am000066400000000000000000000000231307140004000162400ustar00rootroot00000000000000SUBDIRS = cpp perl vcftools-0.1.15/src/cpp/000077500000000000000000000000001307140004000147735ustar00rootroot00000000000000vcftools-0.1.15/src/cpp/Makefile.am000066400000000000000000000014171307140004000170320ustar00rootroot00000000000000bin_PROGRAMS = vcftools vcftools_CPPFLAGS = $(ZLIB_CFLAGS) vcftools_LDADD = $(ZLIB_LIBS) vcftools_SOURCES = \ bcf_entry.cpp \ bcf_entry.h \ bcf_entry_setters.cpp \ bcf_file.cpp \ bcf_file.h \ bgzf.c \ bgzf.h \ dgeev.cpp \ dgeev.h \ entry.cpp \ entry_filters.cpp \ entry_getters.cpp \ entry.h \ entry_setters.cpp \ gamma.cpp \ gamma.h \ header.cpp \ header.h \ khash.h \ knetfile.c \ knetfile.h \ output_log.cpp \ output_log.h \ parameters.cpp \ parameters.h \ variant_file.cpp \ variant_file_diff.cpp \ variant_file_filters.cpp \ variant_file_format_convert.cpp \ variant_file.h \ variant_file_output.cpp \ vcf_entry.cpp \ vcf_entry.h \ vcf_entry_setters.cpp \ vcf_file.cpp \ vcf_file.h \ vcftools.cpp \ vcftools.h dist_man_MANS = vcftools.1 vcftools-0.1.15/src/cpp/bcf_entry.cpp000066400000000000000000000421411307140004000174540ustar00rootroot00000000000000/* * bcf_entry.cpp * * Created on: Sep 20, 2012 * Author: Anthony Marcketta * ($Revision: 1 $) */ #include "bcf_entry.h" bcf_entry::bcf_entry(header &header_obj, vector &include_individual) { N_indv = header_obj.N_indv; include_indv = include_individual; include_genotype = vector(N_indv, true); basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; N_INFO_removed = 0; N_FORMAT_removed = 0; passed_filters = true; parsed_FORMAT_binary = false; parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); 
parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; N_info = 0; N_format = 0; L_shared = 0; L_indiv = 0; line_pos = 0; N_allele = 0; INFO_pos = 0; FILTER_pos = 0; ALT_pos = 0; FORMAT_pos = 0; FORMAT_positions.resize(0); FORMAT_types.resize(0); FORMAT_sizes.resize(0); FORMAT_skip.resize(0); FORMAT_keys.resize(0); line.clear(); entry_header = header_obj; } bcf_entry::~bcf_entry() {} void bcf_entry::reset(const vector &data_line) { basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; parsed_FORMAT_binary = false; passed_filters = true; line = data_line; fill(parsed_GT.begin(), parsed_GT.end(), false); fill(parsed_GQ.begin(), parsed_GQ.end(), false); fill(parsed_DP.begin(), parsed_DP.end(), false); fill(parsed_FT.begin(), parsed_FT.end(), false); fill(include_genotype.begin(), include_genotype.end(), true); INFO_pos = 0; FILTER_pos = 0; ALT_pos = 0; FORMAT_pos = 0; FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear(); N_INFO_removed = 0; N_FORMAT_removed = 0; } void bcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO) { if (line.empty()) { if (parse_ALT) set_ALT(""); return; } if (!basic_parsed) { uint32_t n_allele_info, n_fmt_sample; uint32_t chrom, pos, rlen; uint32_t shared, indiv; float qual; line_pos = 0; get_number(shared, &line_pos, line); get_number(indiv, &line_pos, line); L_shared = shared; L_indiv = indiv; get_number(chrom, &line_pos, line); get_number(pos, &line_pos, line); get_number(rlen, &line_pos, line); qual = *reinterpret_cast(&line[line_pos]); line_pos += sizeof(qual); get_number(n_allele_info, &line_pos, line); get_number(n_fmt_sample, &line_pos, line); N_format = n_fmt_sample >> 24; CHROM = entry_header.CONTIG_map[chrom].ID; POS = pos + 1; ID = get_typed_string( &line_pos, line ); REF = get_typed_string( &line_pos, 
line ); QUAL = qual; N_allele = n_allele_info >> 16; N_info = n_allele_info & (uint32_t)65535; ALT_pos = line_pos; for (unsigned int ui=1; ui max_depth)) include_genotype[ui] = false; } } } void bcf_entry::filter_genotypes_by_filter_status(const set &filter_flags_to_remove, bool remove_all) { if (fully_parsed == false) parse_full_entry(); vector GFILTERs; if (FT_idx != -1) { // Have GFilter info for (unsigned int ui=0; ui ids; ids.resize(0); if ( GT && !parsed_GT[indv] && GT_idx != -1 ) ids.push_back(GT_idx); if (GQ && !parsed_GQ[indv] && GQ_idx != -1) ids.push_back(GQ_idx); if (DP && !parsed_DP[indv] && DP_idx != -1) ids.push_back(DP_idx); if (FT && !parsed_FT[indv] && FT_idx != -1) ids.push_back(FT_idx); for(unsigned int i=0; i1) LOG.error("Error: Only expect single value for QUALITY.\n"); float tmp; if (type==5) tmp = *reinterpret_cast(&line[l_pos]); else if (type==1) { int8_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else if (type==2) { int16_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else if (type==3) { int32_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else LOG.error("Error: Invalid type for QUALITY.\n"); set_indv_GQUALITY(indv, tmp); } else if ((int)ui == DP_idx) { if (size>1) LOG.error("Error: Only expect single value for DEPTH.\n"); int tmp = -1; if (type==1) { if ( !check_missing(l_pos, 1, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==2) { if ( !check_missing(l_pos, 2, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==3) { if ( !check_missing(l_pos, 3, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==5) { float tmp2 = -1; if ( !check_missing(l_pos, 5, line) ) tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (int)tmp2; } else LOG.error("Error: Invalid type for DEPTH.\n"); set_indv_DEPTH(indv, tmp); } else if ((int)ui == FT_idx) { if (type == 7) { vector tmp; tmp.resize( size*sizeof(char) ); memcpy(&tmp[0], &line[l_pos], size*sizeof(char)); 
set_indv_GFILTER(indv, tmp); } else LOG.one_off_warning("Warning: FT values must be encoded in string format.\n"); } } // Set missing return values if requested a value, but couldn't find it if (GT && (parsed_GT[indv] == false)) { set_indv_GENOTYPE_and_PHASE(indv, make_pair(-1,-1), '/'); } if (GQ && (parsed_GQ[indv] == false)) { set_indv_GQUALITY(indv, -1); } if (DP && (parsed_DP[indv] == false)) { set_indv_DEPTH(indv, -1); } if (FT && (parsed_FT[indv] == false)) { set_indv_GFILTER(indv, ""); } } void bcf_entry::parse_genotype_entries(bool GT, bool GQ, bool DP, bool FT) { for (unsigned int ui=0; ui genotype; char phase; get_indv_GENOTYPE_ids(indv, genotype); phase = get_indv_PHASE(indv); if ((genotype.first == -2) && (genotype.second == -2)) outstream << "."; else if ((genotype.first == -1) && (genotype.second == -2)) outstream << "."; else if ((genotype.first > -1) && (genotype.second == -2)) outstream << genotype.first; else if ((genotype.first > -1) && (genotype.second > -1)) outstream << genotype.first << phase << genotype.second; else outstream << genotype.first << phase << genotype.second; out = outstream.str(); } else { format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); outstream << int(tmp); } l_pos += sizeof(int8_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss) out = tmpstr; } } else if (type == 2) { int16_t tmp; format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); outstream << int(tmp); } l_pos += sizeof(int16_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 3) { int32_t tmp; format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); outstream << int(tmp); } l_pos += sizeof(int32_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 5) { float tmp; format_miss = true; for (unsigned int 
uj=0; uj(&line[l_pos]); outstream << float(tmp); } l_pos += sizeof(float); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 7) { stringstream str_stream; string tmp_string; char tmp = '.'; for (unsigned int uj=0; uj(&line[l_pos]); l_pos += sizeof(char); str_stream << tmp; } tmp_string = str_stream.str(); tmp_string.erase( remove( tmp_string.begin(), tmp_string.end(), ' ' ), tmp_string.end() ); if (tmp_string != "") out = tmp; else out = "."; } } void bcf_entry::read_all_entries(string &out) { if (fully_parsed == false) parse_full_entry(true); if (parsed_FORMAT == false) set_FORMAT(); ostringstream outstream; string tmpstr; outstream.str(""); tmpstream.str(""); bool format_miss, indv_miss; for(unsigned int ui=0; ui &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(); out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT(); out << '\t' << header::double2str(QUAL); out << '\t' << get_FILTER(); out << '\t' << get_INFO(INFO_to_keep, keep_all_INFO); if (FORMAT.size() > 0) { string indv_entries; out << '\t' << get_FORMAT(); read_all_entries(indv_entries); out << indv_entries; } out << '\n'; // endl flushes the buffer, which is slow. This (should be) quicker. 
} // Output BCF entry to output stream in BCF format void bcf_entry::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(true); vector out_vector, tmp_vector; vector > tmp_info; int index; out_vector.resize(INFO_pos); memcpy(&out_vector[0], &line[0], INFO_pos); if (keep_all_INFO) { unsigned int curr_size = out_vector.size(); out_vector.resize(curr_size + (FORMAT_pos - INFO_pos) ); memcpy(&out_vector[curr_size], &line[INFO_pos], (FORMAT_pos - INFO_pos)); } else { int map_type, number; tmp_info = get_INFO_vector(INFO_to_keep, keep_all_INFO); N_INFO_removed = INFO.size()-tmp_info.size(); get_n_allele_info(tmp_vector); memcpy(&out_vector[6*sizeof(int32_t)], &tmp_vector[0], sizeof(char)); for(unsigned int ui=0; ui &include_individual); ~bcf_entry(); unsigned int N_info; unsigned int N_format; unsigned int N_allele; unsigned int L_shared; unsigned int L_indiv; unsigned int line_pos; ostringstream outstream; ostringstream tmpstream; void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false); void parse_full_entry(bool parse_FORMAT=true); void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void set_ALT(const int n_allele); void set_ALT(const string &in); void set_FILTER(); void set_FORMAT(); void set_INFO(); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const unsigned int &pos, const unsigned int &size); void set_indv_GENOTYPE_ids(unsigned int indv, const pair &in); void set_indv_GQUALITY(unsigned int indv, const vector &in); void set_indv_GQUALITY(unsigned int indv, const float &in); void set_indv_GFILTER(unsigned int indv, const string &in); void 
set_indv_GFILTER(unsigned int indv, const vector &in); void set_indv_PHASE(unsigned int indv, char in); void set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in); void reset(const vector &data_line); void add_FORMAT_entry(const string &in, const unsigned int &fmt_key, const unsigned int &pos, const unsigned int &line_pos, const unsigned int &type, const unsigned int &size); void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out); void read_indv_generic_entry(unsigned int indv, const int &idx, string &out); void read_all_entries(string &out); void filter_genotypes_by_quality(double min_genotype_quality); void filter_genotypes_by_depth(int min_depth, int max_depth); void filter_genotypes_by_filter_status(const set &filter_flags_to_remove, bool remove_all = false); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false); private: vector FILTER_str; unsigned int INFO_pos, FILTER_pos, ALT_pos, FORMAT_pos; }; vcftools-0.1.15/src/cpp/bcf_entry_setters.cpp000066400000000000000000000222551307140004000212310ustar00rootroot00000000000000/* * bcf_entry_setters.cpp * * Created on: Sep 20, 2012 * Author: Anthony Marcketta * ($Revision: 1 $) */ #include "bcf_entry.h" void bcf_entry::set_ALT(const int n_allele) { ALT.resize(n_allele-1); unsigned int pos = ALT_pos; string allele; for (int ui=0; ui<(n_allele-1); ui++) { allele = get_typed_string( &pos, line ); std::transform(allele.begin(), allele.end(), allele.begin(), ::toupper); ALT[ui] = allele; } parsed_ALT = true; } void bcf_entry::set_ALT(const string &in) { istringstream ss(in); string tmpstr; ALT.resize(0); while(!ss.eof()) { getline(ss, tmpstr, ','); add_ALT_allele(tmpstr); } parsed_ALT = true; } void bcf_entry::set_INFO() { int key; unsigned int size, type, i = INFO_pos; string data_type; INFO.resize(N_info); bool miss = true; for (unsigned int ui=0; ui 
INFO_entry(entry_header.INFO_map[key].ID, "."); data_type = entry_header.INFO_map[key].Type_str; ostringstream ss(ostringstream::out); for (unsigned int uj=0; uj(&line[cur_pos]); if ( tmp == (int8_t)0x81 ) break; ploidy[indv]++; cur_pos += sizeof(int8_t); } if (ploidy[indv] == 0) { set_indv_GENOTYPE_alleles(indv, make_pair(-2, -2)); } else if (ploidy[indv] == 1) { set_indv_PHASE(indv, '|'); tmp = *reinterpret_cast(&line[pos]); if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; set_indv_GENOTYPE_alleles(indv, make_pair(tmp, -2)); } else if (ploidy[indv] == 2) { tmp = *reinterpret_cast(&line[pos]); tmp2 = *reinterpret_cast(&line[pos+sizeof(int8_t)]); if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; if (tmp2 == (int8_t)0x80) { tmp2 = -1; set_indv_PHASE(indv, '/'); } else { char phase = phased[ tmp2 & (int8_t)1 ]; tmp2 = (tmp2 >> 1) - 1; set_indv_PHASE(indv, phase); } set_indv_GENOTYPE_alleles(indv, make_pair((int)tmp, (int)tmp2)); } else if (ploidy[indv] > 2) LOG.error("Polyploidy found, and is not supported by vcftools: " + CHROM + ":" + header::int2str(POS)); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { set_indv_GENOTYPE_ids(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { pair a(-1,-1); if (genotype.first != ".") a.first = header::str2int(genotype.first); if (genotype.second != ".") a.second = header::str2int(genotype.second); set_indv_GENOTYPE_alleles(indv, a); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (in.first == 0x81) a.first = -2; else if (in.first != 0x80) a.first = in.first; if (in.second == 0x81) a.second = -2; else if (in.second != 0x80) a.second = in.second; 
GENOTYPE[indv] = in; parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-2,-2)); GENOTYPE[indv] = in; } void bcf_entry::set_indv_PHASE(unsigned int indv, char in) { if (PHASE.size() == 0) PHASE.resize(N_indv, '/'); PHASE[indv] = in; parsed_GT[indv] = true; } void bcf_entry::set_indv_GQUALITY(unsigned int indv, const vector &in) { float tmp; memcpy(&tmp, &in[0], sizeof(tmp)); parsed_GQ[indv] = true; if (tmp == 0x7F800001) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); if (tmp > 99.0) tmp = 99; GQUALITY[indv] = tmp; } void bcf_entry::set_indv_GQUALITY(unsigned int indv, const float &in) { parsed_GQ[indv] = true; if ( (in == -1) or (in == 0x7F800001) ) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); if (in > 99) GQUALITY[indv] = 99; else GQUALITY[indv] = in; } void bcf_entry::set_indv_GFILTER(unsigned int indv, const vector &in) { parsed_FT[indv] = true; if (GFILTER.size() == 0) GFILTER.resize(N_indv); GFILTER[indv].resize(0); if (in.empty()) return; else if ((in.size() == 1) and (in[0] == '\0') ) return; ostringstream ss; string ith_FILTER; ss.clear(); for (unsigned int ui=0; ui(meta_data.N_indv,true); } bcf_file::~bcf_file() { close(); } void bcf_file::open() { int ret; if (filename.substr(filename.size()-4) == ".vcf") LOG.error("Filename ends in '.vcf'. Shouldn't you be using --vcf?\n"); if (filename.substr(filename.size()-7) == ".vcf.gz") LOG.error("Filename ends in '.vcf.gz'. 
Shouldn't you be using --gzvcf?\n"); ret = bgzf_is_bgzf(filename.c_str()); if (ret == 1) is_BGZF = true; else is_BGZF = false; if (is_BGZF) open_gz(); else { file_tmp.open(filename.c_str(), ios::in); if (!file_tmp.is_open()) LOG.error("Could not open VCF file: " + filename, 0); file_in = &file_tmp; } } void bcf_file::open_gz() { int ret; gzMAX_LINE_LEN = 1024*1024; if (stream) gzfile_in = gzdopen(fileno(stdin), "r"); else gzfile_in = gzopen(filename.c_str(), "rb"); if (gzfile_in == NULL) LOG.error("Could not open BGZF BCF file: " + filename, 0); #ifdef ZLIB_VERNUM string tmp(ZLIB_VERSION); LOG.printLOG("Using zlib version: " + tmp + "\n"); #if (ZLIB_VERNUM >= 0x1240) ret = gzbuffer(gzfile_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster if (ret != 0) LOG.warning("Unable to change zlib buffer size."); #else LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading compressed BCF files.\n"); #endif #endif } void bcf_file::check_bcf() { char magic[5]; read(magic, 5, 1); if ((strcmp(magic, bcf_21) == 0) && (strcmp(magic, bcf_22) == 0)) LOG.error("Does not appear to be a BCF file\n"); } void bcf_file::close() { if (!stream && is_BGZF) gzclose(gzfile_in); } void bcf_file::get_entry(vector &out) { uint32_t size_int[2]; int ret, read_size = 0; ret = read(&size_int[0], 2, sizeof(uint32_t) ); read_size = size_int[0] + size_int[1]; if (ret) { out.resize(read_size+2*sizeof(uint32_t)); memcpy(&out[0], size_int, 2*sizeof(uint32_t)); read(&out[2*sizeof(uint32_t)], 1, read_size); } else out.resize(0); } entry* bcf_file::get_entry_object() { return new bcf_entry(meta_data, include_indv); } int bcf_file::read(void *buffer, unsigned int len, size_t size) { int ret; if (is_BGZF) { ret = gzread(gzfile_in, buffer, size*len); ret = (ret == (int)(len*size) ); } else { file_in->read((char*)buffer, size*len); ret = !file_in->eof(); } if ((big_endian) && (size > 1)) // Note: don't both swapping character arrays - BCF is defined as little endian. 
{ unsigned int ui; for (ui=0; uieof()); } void bcf_file::print(const parameters ¶ms) { LOG.printLOG("Outputting VCF file...\n"); string output_file = params.output_prefix + ".recode.vcf"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open VCF Output file: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); if (meta_data.has_idx) { LOG.warning("BCF file contains IDX values in header. These are being removed for conversion to VCF."); meta_data.reprint(); } for (unsigned int ui=0; ui 0) out << "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry *e = new bcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true); e->print(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; } void bcf_file::print_bcf(const parameters ¶ms) { LOG.printLOG("Outputting BCF file...\n"); BGZF * out; if(!params.stream_out) { string output_file = params.output_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); string header_str; uint32_t len_text = 0; vector header; char magic[5] = {'B','C','F','\2','\2'}; bgzf_write(out, magic, 5); for (unsigned int ui=0; ui 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new bcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true); e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; bgzf_close(out); } 
vcftools-0.1.15/src/cpp/bcf_file.h000066400000000000000000000014341307140004000166770ustar00rootroot00000000000000/* * vcf_file.h * * Created on: Dec 11, 2012 * Author: amarcketta */ #ifndef BCF_FILE_H_ #define BCF_FILE_H_ #include "output_log.h" #include "parameters.h" #include "variant_file.h" #include "bgzf.h" extern output_log LOG; using namespace std; class bcf_file : public variant_file { public: bcf_file(const parameters ¶ms, bool diff=false); void get_entry(vector &out); entry* get_entry_object(); void print(const parameters ¶ms); void print_bcf(const parameters ¶ms); protected: ~bcf_file(); private: bool is_BGZF; bool big_endian; bool stream; int read(void *buffer, unsigned int len, size_t size); void read_header(); void read_file(); void open(); void open_gz(); void close(); bool eof(); void check_bcf(); }; #endif /* BCF_FILE_H_ */ vcftools-0.1.15/src/cpp/bgzf.c000066400000000000000000000523431307140004000160760ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include "bgzf.h" #ifdef _USE_KNETFILE #include "knetfile.h" typedef knetFile *_bgzf_file_t; #define _bgzf_open(fn, mode) knet_open((fn), (mode)) #define _bgzf_dopen(fd, mode) knet_dopen((fd), (mode)) #define _bgzf_close(fp) knet_close((knetFile*)(fp)) #define _bgzf_fileno(fp) (((knetFile*)(fp))->fd) #define _bgzf_tell(fp) knet_tell((knetFile*)(fp)) #define _bgzf_seek(fp, offset, whence) knet_seek((knetFile*)(fp), (offset), (whence)) #define _bgzf_read(fp, buf, len) knet_read((knetFile*)(fp), (buf), (len)) #define _bgzf_write(fp, buf, len) knet_write((knetFile*)(fp), (buf), (len)) #else // ~defined(_USE_KNETFILE) #if defined(_WIN32) || defined(_MSC_VER) #define ftello(fp) ftell((FILE*)(fp)) #define fseeko(fp, offset, whence) fseek((FILE*)(fp), (offset), (whence)) #else // ~defined(_WIN32) extern off_t ftello(FILE *stream); extern int fseeko(FILE *stream, off_t offset, int whence); #endif // ~defined(_WIN32) typedef FILE *_bgzf_file_t; #define _bgzf_open(fn, mode) fopen((fn), (mode)) #define _bgzf_dopen(fd, mode) fdopen(fd, (mode)) #define _bgzf_close(fp) fclose((FILE*)(fp)) #define _bgzf_fileno(fp) fileno((FILE*)(fp)) #define _bgzf_tell(fp) ftello((FILE*)(fp)) #define _bgzf_seek(fp, offset, whence) fseeko((FILE*)(fp), (offset), (whence)) #define _bgzf_read(fp, buf, len) fread((buf), 1, (len), (FILE*)(fp)) #define _bgzf_write(fp, buf, len) fwrite((buf), 1, (len), (FILE*)(fp)) #endif // ~define(_USE_KNETFILE) #define BLOCK_HEADER_LENGTH 18 #define BLOCK_FOOTER_LENGTH 8 /* BGZF/GZIP header (speciallized from RFC 1952; little endian): +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 
2|BLK_LEN| +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; #ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; #include "khash.h" KHASH_MAP_INIT_INT64(cache, cache_t) #endif static inline int ed_is_big() { long one= 1; return !(*((char *)(&one))); } static inline void packInt16(uint8_t *buffer, uint16_t value) { buffer[0] = value; buffer[1] = value >> 8; } static inline int unpackInt16(const uint8_t *buffer) { return buffer[0] | buffer[1] << 8; } static inline void packInt32(uint8_t *buffer, uint32_t value) { buffer[0] = value; buffer[1] = value >> 8; buffer[2] = value >> 16; buffer[3] = value >> 24; } static BGZF *bgzf_read_init() { BGZF *fp; fp = (BGZF*)calloc(1, sizeof(BGZF)); fp->is_write = 0; fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); #ifdef BGZF_CACHE fp->cache = kh_init(cache); #endif return fp; } static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level { BGZF *fp; fp = (BGZF*)calloc(1, sizeof(BGZF)); fp->is_write = 1; fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; return fp; } // get the compress level from the mode string static int mode2level(const char *__restrict mode) { int i, compress_level = -1; for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; if (mode[i]) compress_level = (int)mode[i] - '0'; if (strchr(mode, 'u')) compress_level = 0; return compress_level; } BGZF *bgzf_open(const char *path, const char *mode) { BGZF *fp = 0; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_open(path, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fopen(path, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } fp->is_be = ed_is_big(); return fp; } BGZF *bgzf_dopen(int fd, const char *mode) { BGZF *fp = 0; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fdopen(fd, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } fp->is_be = ed_is_big(); return fp; } static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) { uint32_t crc; z_stream zs; uint8_t *dst = (uint8_t*)_dst; // compress the body zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = (Bytef*)src; zs.avail_in = slen; zs.next_out = dst + BLOCK_HEADER_LENGTH; zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1; if (deflateEnd(&zs) != Z_OK) return -1; 
*dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; // write the header memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes // write the footer crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen); packInt32((uint8_t*)&dst[*dlen - 8], crc); packInt32((uint8_t*)&dst[*dlen - 4], slen); return 0; } // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. static int deflate_block(BGZF *fp, int block_length) { int comp_size = BGZF_MAX_BLOCK_SIZE; if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } fp->block_offset = 0; return comp_size; } // Inflate the block in fp->compressed_block into fp->uncompressed_block static int inflate_block(BGZF* fp, int block_length) { z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = (Bytef*)fp->compressed_block + 18; zs.avail_in = block_length - 16; zs.next_out = (Bytef*)fp->uncompressed_block; zs.avail_out = BGZF_MAX_BLOCK_SIZE; if (inflateInit2(&zs, -15) != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { inflateEnd(&zs); fp->errcode |= BGZF_ERR_ZLIB; return -1; } if (inflateEnd(&zs) != Z_OK) { fp->errcode |= BGZF_ERR_ZLIB; return -1; } return zs.total_out; } static int check_header(const uint8_t *header) { return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 && unpackInt16((uint8_t*)&header[10]) == 6 && header[12] == 'B' && header[13] == 'C' && unpackInt16((uint8_t*)&header[14]) == 2); } #ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (fp->is_write) return; for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) free(kh_val(h, 
k).block); kh_destroy(cache, h); } static int load_block_from_cache(BGZF *fp, int64_t block_address) { khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; k = kh_get(cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); return p->size; } static void cache_block(BGZF *fp, int size) { int ret; khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { /* A better way would be to remove the oldest block in the * cache, but here we remove a random one for simplicity. This * should not have a big impact on performance. */ for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) break; if (k < kh_end(h)) { free(kh_val(h, k).block); kh_del(cache, h, k); } } k = kh_put(cache, h, fp->block_address, &ret); if (ret == 0) return; // if this happens, a bug! 
p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); } #else static void free_cache(BGZF *fp) {} static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} static void cache_block(BGZF *fp, int size) {} #endif int bgzf_read_block(BGZF *fp) { uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; int count, size = 0, block_length, remaining; int64_t block_address; block_address = _bgzf_tell((_bgzf_file_t)fp->fp); if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; count = _bgzf_read(fp->fp, header, sizeof(header)); if (count == 0) { // no data read fp->block_length = 0; return 0; } if (count != sizeof(header) || !check_header(header)) { fp->errcode |= BGZF_ERR_HEADER; return -1; } size = count; block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" compressed_block = (uint8_t*)fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); remaining = block_length - BLOCK_HEADER_LENGTH; count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); if (count != remaining) { fp->errcode |= BGZF_ERR_IO; return -1; } size += count; if ((count = inflate_block(fp, block_length)) < 0) return -1; if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
fp->block_address = block_address; fp->block_length = count; cache_block(fp, size); return 0; } ssize_t bgzf_read(BGZF *fp, void *data, size_t length) { ssize_t bytes_read = 0; uint8_t *output = (uint8_t*)data; if (length <= 0) return 0; assert(fp->is_write == 0); while (bytes_read < length) { int copy_length, available = fp->block_length - fp->block_offset; uint8_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) return -1; available = fp->block_length - fp->block_offset; if (available <= 0) break; } copy_length = length - bytes_read < available? length - bytes_read : available; buffer = (uint8_t*)fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; bytes_read += copy_length; } if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = fp->block_length = 0; } return bytes_read; } #ifdef BGZF_MT typedef struct { BGZF *fp; struct mtaux_t *mt; void *buf; int i, errcode, toproc; } worker_t; typedef struct mtaux_t { int n_threads, n_blks, curr, done; volatile int proc_cnt; void **blk; int *len; worker_t *w; pthread_t *tid; pthread_mutex_t lock; pthread_cond_t cv; } mtaux_t; static int worker_aux(worker_t *w) { int i, stop = 0; // wait for condition: to process or all done pthread_mutex_lock(&w->mt->lock); while (!w->toproc && !w->mt->done) pthread_cond_wait(&w->mt->cv, &w->mt->lock); if (w->mt->done) stop = 1; w->toproc = 0; pthread_mutex_unlock(&w->mt->lock); if (stop) return 1; // to quit the thread w->errcode = 0; for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { int clen = BGZF_MAX_BLOCK_SIZE; if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) w->errcode |= BGZF_ERR_ZLIB; memcpy(w->mt->blk[i], w->buf, clen); w->mt->len[i] = clen; } __sync_fetch_and_add(&w->mt->proc_cnt, 1); return 0; } static void *mt_worker(void *data) { while (worker_aux((worker_t*)data) == 0); 
return 0; } int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) { int i; mtaux_t *mt; pthread_attr_t attr; if (!fp->is_write || fp->mt || n_threads <= 1) return -1; mt = (mtaux_t*)calloc(1, sizeof(mtaux_t)); mt->n_threads = n_threads; mt->n_blks = n_threads * n_sub_blks; mt->len = (int*)calloc(mt->n_blks, sizeof(int)); mt->blk = (void**)calloc(mt->n_blks, sizeof(void*)); for (i = 0; i < mt->n_blks; ++i) mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t)); for (i = 0; i < mt->n_threads; ++i) { mt->w[i].i = i; mt->w[i].mt = mt; mt->w[i].fp = fp; mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); } pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_mutex_init(&mt->lock, 0); pthread_cond_init(&mt->cv, 0); for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); fp->mt = mt; return 0; } static void mt_destroy(mtaux_t *mt) { int i; // signal all workers to quit pthread_mutex_lock(&mt->lock); mt->done = 1; mt->proc_cnt = 0; pthread_cond_broadcast(&mt->cv); pthread_mutex_unlock(&mt->lock); for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread // free other data allocated on heap for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]); for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); pthread_cond_destroy(&mt->cv); pthread_mutex_destroy(&mt->lock); free(mt); } static void mt_queue(BGZF *fp) { mtaux_t *mt = (mtaux_t*)fp->mt; assert(mt->curr < mt->n_blks); // guaranteed by the caller memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); mt->len[mt->curr] = fp->block_offset; fp->block_offset = 0; ++mt->curr; } static int mt_flush(BGZF 
*fp) { int i; mtaux_t *mt = (mtaux_t*)fp->mt; if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail // signal all the workers to compress pthread_mutex_lock(&mt->lock); for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; mt->proc_cnt = 0; pthread_cond_broadcast(&mt->cv); pthread_mutex_unlock(&mt->lock); // worker 0 is doing things here worker_aux(&mt->w[0]); // wait for all the threads to complete while (mt->proc_cnt < mt->n_threads); // dump data to disk for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; for (i = 0; i < mt->curr; ++i) if (fwrite(mt->blk[i], 1, mt->len[i], (FILE*)fp->fp) != (size_t)mt->len[i]) fp->errcode |= BGZF_ERR_IO; mt->curr = 0; return 0; } static int mt_lazy_flush(BGZF *fp) { mtaux_t *mt = (mtaux_t*)fp->mt; if (fp->block_offset) mt_queue(fp); if (mt->curr == mt->n_blks) return mt_flush(fp); return -1; } static ssize_t mt_write(BGZF *fp, const void *data, size_t length) { const uint8_t *input = (const uint8_t*)data; ssize_t rest = length; while (rest) { int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? 
BGZF_BLOCK_SIZE - fp->block_offset : rest; memcpy((uint8_t*)fp->uncompressed_block + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; rest -= copy_length; if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp); } return length - rest; } #endif // ~ #ifdef BGZF_MT int bgzf_flush(BGZF *fp) { if (!fp->is_write) return 0; #ifdef BGZF_MT if (fp->mt) return mt_flush(fp); #endif while (fp->block_offset > 0) { int block_length; block_length = deflate_block(fp, fp->block_offset); if (block_length < 0) return -1; if (fwrite(fp->compressed_block, 1, block_length, (FILE*)fp->fp) != (size_t)block_length) { fp->errcode |= BGZF_ERR_IO; // possibly truncated file return -1; } fp->block_address += block_length; } return 0; } int bgzf_flush_try(BGZF *fp, ssize_t size) { if (fp->block_offset + size > BGZF_BLOCK_SIZE) { #ifdef BGZF_MT if (fp->mt) return mt_lazy_flush(fp); else return bgzf_flush(fp); #else return bgzf_flush(fp); #endif } return -1; } ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) { const uint8_t *input = (const uint8_t*)data; int block_length = BGZF_BLOCK_SIZE, bytes_written = 0; assert(fp->is_write); #ifdef BGZF_MT if (fp->mt) return mt_write(fp, data, length); #endif while (bytes_written < length) { uint8_t* buffer = (uint8_t*)fp->uncompressed_block; int copy_length = block_length - fp->block_offset < length - bytes_written? 
block_length - fp->block_offset : length - bytes_written; memcpy(buffer + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length && bgzf_flush(fp)) break; } return bytes_written; } int bgzf_close(BGZF* fp) { int ret, block_length; if (fp == 0) return -1; if (fp->is_write) { if (bgzf_flush(fp) != 0) return -1; fp->compress_level = -1; block_length = deflate_block(fp, 0); // write an empty block fwrite(fp->compressed_block, 1, block_length, (FILE*)fp->fp); if (fflush((FILE*)fp->fp) != 0) { fp->errcode |= BGZF_ERR_IO; return -1; } #ifdef BGZF_MT if (fp->mt) mt_destroy((mtaux_t*)fp->mt); #endif } ret = fp->is_write? fclose((FILE*)fp->fp) : _bgzf_close(fp->fp); if (ret != 0) return -1; free(fp->uncompressed_block); free(fp->compressed_block); free_cache(fp); free(fp); return 0; } void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; } int bgzf_check_EOF(BGZF *fp) { uint8_t buf[28]; off_t offset; offset = _bgzf_tell((_bgzf_file_t)fp->fp); if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; _bgzf_read(fp->fp, buf, 28); _bgzf_seek(fp->fp, offset, SEEK_SET); return (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", buf, 28) == 0)? 
1 : 0; } int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { int block_offset; int64_t block_address; if (fp->is_write || where != SEEK_SET) { fp->errcode |= BGZF_ERR_MISUSE; return -1; } block_offset = pos & 0xFFFF; block_address = pos >> 16; if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { fp->errcode |= BGZF_ERR_IO; return -1; } fp->block_length = 0; // indicates current block has not been loaded fp->block_address = block_address; fp->block_offset = block_offset; return 0; } int bgzf_is_bgzf(const char *fn) { uint8_t buf[16]; int n; _bgzf_file_t fp; if ((fp = _bgzf_open(fn, "r")) == 0) return 0; n = _bgzf_read(fp, buf, 16); _bgzf_close(fp); if (n != 16) return 0; return memcmp(g_magic, buf, 16) == 0? 1 : 0; } int bgzf_getc(BGZF *fp) { int c; if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) return -2; /* error */ if (fp->block_length == 0) return -1; /* end-of-file */ } c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } return c; } #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif int bgzf_getline(BGZF *fp, int delim, kstring_t *str) { int l, state = 0; unsigned char *buf = (unsigned char*)fp->uncompressed_block; str->l = 0; do { if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) { state = -2; break; } if (fp->block_length == 0) { state = -1; break; } } for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); if (l < fp->block_length) state = 1; l -= fp->block_offset; if (str->l + l + 1 >= str->m) { str->m = str->l + l + 2; kroundup32(str->m); str->s = (char*)realloc(str->s, str->m); } memcpy(str->s + str->l, buf + fp->block_offset, l); str->l += l; fp->block_offset += l + 1; if (fp->block_offset >= fp->block_length) { fp->block_address = 
_bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } } while (state == 0); if (str->l == 0 && state < 0) return state; str->s[str->l] = 0; return str->l; } vcftools-0.1.15/src/cpp/bgzf.h000066400000000000000000000143731307140004000161040ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The BGZF library was originally written by Bob Handsaker from the Broad * Institute. It was later improved by the SAMtools developers. 
*/ #ifndef __BGZF_H #define __BGZF_H #include #include #include #include #define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE #define BGZF_MAX_BLOCK_SIZE 0x10000 #define BGZF_ERR_ZLIB 1 #define BGZF_ERR_HEADER 2 #define BGZF_ERR_IO 4 #define BGZF_ERR_MISUSE 8 typedef struct { int errcode:16, is_write:2, is_be:2, compress_level:12; int cache_size; int block_length, block_offset; int64_t block_address; void *uncompressed_block, *compressed_block; void *cache; // a pointer to a hash table void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading #ifdef BGZF_MT void *mt; // only used for multi-threading #endif } BGZF; #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif #ifdef __cplusplus extern "C" { #endif /****************** * Basic routines * ******************/ /** * Open an existing file descriptor for reading or writing. * * @param fd file descriptor * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. * @return BGZF file handler; 0 on error */ BGZF* bgzf_dopen(int fd, const char *mode); #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility /** * Open the specified file for reading or writing. */ BGZF* bgzf_open(const char* path, const char *mode); /** * Close the BGZF and free all associated resources. * * @param fp BGZF file handler * @return 0 on success and -1 on error */ int bgzf_close(BGZF *fp); /** * Read up to _length_ bytes from the file storing into _data_. * * @param fp BGZF file handler * @param data data array to read into * @param length size of data to read * @return number of bytes actually read; 0 on end-of-file and -1 on error */ ssize_t bgzf_read(BGZF *fp, void *data, size_t length); /** * Write _length_ bytes from _data_ to the file. 
* * @param fp BGZF file handler * @param data data array to write * @param length size of data to write * @return number of bytes actually written; -1 on error */ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length); /** * Write the data in the buffer to the file. */ int bgzf_flush(BGZF *fp); /** * Return a virtual file pointer to the current location in the file. * No interpetation of the value should be made, other than a subsequent * call to bgzf_seek can be used to position the file at the same point. * Return value is non-negative on success. */ #define bgzf_tell(fp) ((((BGZF*)fp)->block_address << 16) | (((BGZF*)fp)->block_offset & 0xFFFF)) /** * Set the file to read from the location specified by _pos_. * * @param fp BGZF file handler * @param pos virtual file offset returned by bgzf_tell() * @param whence must be SEEK_SET * @return 0 on success and -1 on error */ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); /** * Check if the BGZF end-of-file (EOF) marker is present * * @param fp BGZF file handler opened for reading * @return 1 if EOF is present; 0 if not or on I/O error */ int bgzf_check_EOF(BGZF *fp); /** * Check if a file is in the BGZF format * * @param fn file name * @return 1 if _fn_ is BGZF; 0 if not or on I/O error */ int bgzf_is_bgzf(const char *fn); /********************* * Advanced routines * *********************/ /** * Set the cache size. Only effective when compiled with -DBGZF_CACHE. * * @param fp BGZF file handler * @param size size of cache in bytes; 0 to disable caching (default) */ void bgzf_set_cache_size(BGZF *fp, int size); /** * Flush the file if the remaining buffer size is smaller than _size_ */ int bgzf_flush_try(BGZF *fp, ssize_t size); /** * Read one byte from a BGZF file. It is faster than bgzf_read() * @param fp BGZF file handler * @return byte read; -1 on end-of-file or error */ int bgzf_getc(BGZF *fp); /** * Read one line from a BGZF file. 
It is faster than bgzf_getc() * * @param fp BGZF file handler * @param delim delimitor * @param str string to write to; must be initialized * @return length of the string; 0 on end-of-file; negative on error */ int bgzf_getline(BGZF *fp, int delim, kstring_t *str); /** * Read the next BGZF block. */ int bgzf_read_block(BGZF *fp); #ifdef BGZF_MT /** * Enable multi-threading (only effective on writing) * * @param fp BGZF file handler; must be opened for writing * @param n_threads #threads used for writing * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended */ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); #endif #ifdef __cplusplus } #endif #endif vcftools-0.1.15/src/cpp/dgeev.cpp000066400000000000000000000057521307140004000166020ustar00rootroot00000000000000/* * dgeev.cpp * * Created on: 20 Apr 2011 * Author: auton * ($Revision: 1 $) */ #if HAVE_CONFIG_H # include "config.h" #endif #if HAVE_LIBLAPACK #include "dgeev.h" void dgeev_sort(double *Er, double *Ei, int N) { double temp, *E2; int i, j; E2 = new double[N]; for (i=0; i #endif #if HAVE_LIBLAPACK #ifndef DGEEV_H_ #define DGEEV_H_ #include void dgeev(double **H, int n, double *Er, double *Ei); void dgeev(double **H, int n, double *Er, double *Ei, double **Evecs); double *dgeev_ctof(double **in, int rows, int cols); void dgeev_ftoc(double *in, double **out, int rows, int cols); void dgeev_sort(double *Er, double *Ei, int N); void dgeev_sort(double *Er, double *Ei, double **Evecs, int N); extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double *vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); #endif #endif vcftools-0.1.15/src/cpp/entry.cpp000066400000000000000000000537331307140004000166530ustar00rootroot00000000000000/* * entry.cpp * * Created on: Dec 12, 2012 * Author: amarcketta */ #include "entry.h" /* // This function implements an exact SNP test of Hardy-Weinberg // Equilibrium as described in 
Wigginton, JE, Cutler, DJ, and // Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg // Equilibrium. American Journal of Human Genetics. 76: 000 - 000 // // Written by Jan Wigginton */ void entry::SNPHWE(int obs_hets, int obs_hom1, int obs_hom2, double &p_hwe, double &p_lo, double &p_hi) { p_hwe = 1.0; p_lo = 1.0; p_hi = 1.0; //p_hi_lo = 1.0; if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return; if (obs_hom1 < 0 || obs_hom2 < 0 || obs_hets < 0) LOG.error("Internal error: negative count in HWE test", 91); int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; int rare_copies = 2 * obs_homr + obs_hets; int genotypes = obs_hets + obs_homc + obs_homr; double * het_probs = (double *) malloc((size_t) (rare_copies + 1) * sizeof(double)); if (het_probs == NULL) LOG.error("Internal error: SNP-HWE: Unable to allocate array", 90); for (int i = 0; i <= rare_copies; i++) het_probs[i] = 0.0; /* start at midpoint */ int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); /* check to ensure that midpoint and rare alleles have same parity */ if ((rare_copies & 1) ^ (mid & 1)) mid++; int curr_hets = mid; int curr_homr = (rare_copies - mid) / 2; int curr_homc = genotypes - curr_hets - curr_homr; het_probs[mid] = 1.0; double sum = het_probs[mid]; for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) { het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); sum += het_probs[curr_hets - 2]; /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ curr_homr++; curr_homc++; } curr_hets = mid; curr_homr = (rare_copies - mid) / 2; curr_homc = genotypes - curr_hets - curr_homr; for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) { het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); sum += het_probs[curr_hets + 2]; /* add 2 
heterozygotes for next iteration -> subtract one rare, one common homozygote */ curr_homr--; curr_homc--; } for (int i = 0; i <= rare_copies; i++) het_probs[i] /= sum; // alternate p-value calculation for p_hi/p_lo p_hi = het_probs[obs_hets]; for (int i = obs_hets + 1; i <= rare_copies; i++) p_hi += het_probs[i]; p_lo = het_probs[obs_hets]; for (int i = obs_hets - 1; i >= 0; i--) p_lo += het_probs[i]; //p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; p_hwe = 0.0; /* p-value calculation for p_hwe */ for (int i = 0; i <= rare_copies; i++) { if (het_probs[i] > het_probs[obs_hets]) continue; p_hwe += het_probs[i]; } p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; free(het_probs); } void entry::make_typed_string(vector &out, const string &in, bool typed) { vector tmp_vector; out.resize(0); if (in == "." or in == " " or in == "") { if (typed == false) return; int8_t tmp = (int8_t)0; tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); return; } if (typed == true) { if (in.length() >= 15) { int8_t tmp = (int8_t)15; tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); make_typed_int(tmp_vector, in.length(), typed); out.insert( out.end(), tmp_vector.begin(), tmp_vector.end() ); } else { int8_t tmp = (int8_t)in.length(); tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); } } out.reserve(out.size()+in.size()); copy(in.begin(), in.end(), back_inserter(out)); } void entry::make_typed_int(vector &out, const int &in, bool typed) { vector tmp_char; out.resize(0); int type; int8_t size_type = (int8_t)1; if (in < 127 and in >-127) type = 1; else if (in < 32767 and in>-32767) type = 2; else type = 3; make_int(tmp_char, in, type); if (typed == true) { size_type = size_type << 4; size_type = size_type | type; out.push_back(size_type); } out.insert(out.end(), tmp_char.begin(), tmp_char.end()); } void entry::make_typed_string_vector( vector &out, const vector &in, int number ) { vector tmp_char; int max_val = 0; int8_t size_type; out.resize(0); if (number == -1) { for 
(unsigned int ui=0; ui max_val) max_val = in[ui].size(); } } else max_val = number; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)7; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)7; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.end() ); } for (unsigned int ui=0; ui &out, vector &in ) { vector tmp_vector; int8_t size_type; int max_ploidy = 0; out.resize(0); max_ploidy = *max_element(ploidy.begin(), ploidy.end()); if (max_ploidy < 15) { size_type = (int8_t)max_ploidy; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); make_typed_int(tmp_vector, max_ploidy, true); out.insert( out.end(), tmp_vector.begin(), tmp_vector.end() ); tmp_vector.resize(0); } for (unsigned int ui=0; ui &out, string &in, int exp_size) { int8_t tmp_int; int8_t phased = 0; out.resize(exp_size); int idx = 0; for (unsigned int ui=0; ui &out, const string &in, int number ) { vector tmp_char; vector tmp_ints; vector split_string; int converted, type; int8_t size_type; unsigned int max = 0; unsigned int max_val = 0; out.resize(0); if (in == " " or in == "." 
or in == "") { size_type = (int8_t)0; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); return; } header::tokenize(in, ',', split_string); if (number == -1) { if (split_string.size() > max_val) max_val = split_string.size(); } else max_val = number; for (unsigned int ui=0; ui (int)max) and ( converted != (int)0x80000000)) max = abs(converted); } else converted = 0x80000001; tmp_ints.push_back( converted ); } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const vector &in, int number ) { vector tmp_char; vector tmp_ints; vector split_string; int converted, type; int8_t size_type; unsigned int max = 0; unsigned int max_val = 0; out.resize(0); if (number == -1) { unsigned int tmp_int = 0; for (unsigned int ui=0; ui max_val) max_val = tmp_int; } max_val++; } else max_val = number; for (unsigned int ui=0; ui (int)max) and (converted != (int)0x80000000)) max = abs(converted); } else converted = 0x80000001; tmp_ints.push_back( converted ); } } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const vector &in ) { vector tmp_char; int type; int8_t size_type; unsigned int max = 0; 
out.resize(0); for (unsigned int ui=0; ui (int)max) and ( (int8_t)in[ui] != (int8_t)0x80)) max = abs(in[ui]); } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (in.size() < 15) { size_type = (int8_t)in.size(); size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, in.size(), true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const int &in, int type) { out.resize(0); if (type == 1) { int8_t tmp_int; if (in == (int)0x80000000 || in >= 128) tmp_int = (int8_t)0x80; else if (in == (int)0x80000001) tmp_int = (int8_t)0x81; else tmp_int = (int8_t)in; out.push_back( (int8_t)tmp_int); } else if (type == 2) { int16_t tmp_int; if (in == (int)0x80000000 || in >= 32768) tmp_int = 0x8000; else if (in == (int)0x80000001) tmp_int = (int8_t)0x8001; else tmp_int = (int16_t)in; int8_t split; for(unsigned int ui=0; ui<2; ui++) { split = tmp_int & (int16_t)0x00FF;//0000000011111111 out.push_back(split); tmp_int = tmp_int >> 8; } } else { int32_t tmp_int; tmp_int = (int32_t)in; int8_t split; for(unsigned int ui=0; ui<4; ui++) { split = tmp_int & (int32_t)0x0000FF; out.push_back( (int8_t)split); tmp_int = tmp_int >> 8; } } } void entry::make_typed_float_vector(vector &out, const string &in, int number ) { vector split_string; int8_t size_type; int max_val = 0; out.resize(0); if (in == " " or in == "." 
or in == "") { size_type = (int8_t)0; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); return; } header::tokenize(in, ',', split_string); if (number == -1) max_val = split_string.size(); else max_val = number; if ( max_val < 15 ) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); vector size_vector; make_typed_int(size_vector, max_val, true ); out.insert(out.end(), size_vector.begin(), size_vector.end()); } float value; char missing[4] = {static_cast(0x01), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; char end[4] = {static_cast(0x02), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; for(unsigned int ui=0; (int)ui &out, const vector &in, int number ) { vector split_string; int8_t size_type; unsigned int max_val = 0; out.resize(0); if (number == -1) { unsigned int tmp_int = 0; for (unsigned int ui=0; ui max_val) max_val = tmp_int; } max_val++; } else max_val = number; if ( max_val < 15 ) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); vector size_vector; make_typed_int(size_vector, max_val, true ); out.insert(out.end(), size_vector.begin(), size_vector.end()); } float value; char missing[4] = {static_cast(0x01), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; char end[4] = {static_cast(0x02), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; for (unsigned int ui=0; ui &out, const unsigned int &type, const unsigned int &size) { uint8_t byte; vector tmp_vector; tmp_vector.resize(0); out.resize(0); if (size < 15) { byte = size; byte = byte << 4; } else { byte = (uint8_t)15; make_typed_int(tmp_vector, 
size, true); } byte = byte | (uint8_t)type; out.push_back(byte); out.insert(out.end(), tmp_vector.begin(), tmp_vector.end()); } string entry::get_typed_string(unsigned int * line_position, const vector& line) { unsigned int size, type; string out; get_type( line_position, line, type, size ); if (type != 7) { LOG.printLOG("Error: Expected type 7 for string. Found type " + header::int2str(type) + ".\n"); } char * tmp = new char[size]; memcpy(tmp, &line[*line_position], size*sizeof(char)); *line_position += size; out = string( tmp, size ); if (out == "" or out == " ") out = "."; delete [] tmp; return out; } int entry::get_typed_int(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size) { int out; get_type( line_position, line, type, size ); if (size > 1) { LOG.printLOG("Error: Int vector when expected only a single Integer value.\n" ); exit(0); } if (type == 1) { int8_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else if (type == 2) { int16_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else if (type == 3) { int32_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else { LOG.printLOG("Error: Invalid type for integer size.\n"); exit(0); } return out; } vector entry::get_int_vector(unsigned int * line_position, const vector& line) { unsigned int size, type; get_type( line_position, line, type, size ); vector out(size); if (type == 0) { return out; } else if (type == 1) { int8_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); out[ui] = tmp; } } else if (type == 2) { int16_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); out[ui] = tmp; } } else if (type == 3) { int32_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); out[ui] = tmp; } } else { LOG.printLOG("Error: Invalid 
type for integer size.\n"); exit(0); } return out; } void entry::get_type(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size) { uint8_t byte = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(byte); size = byte >> 4; type = (byte & (uint8_t)15); if (size == 15) { int type2; byte = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(byte); type2 = (byte & (uint8_t)15); if (type2 == 1) { int8_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (unsigned int)tmp; } else if (type2 == 2) { int16_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (int)tmp; } else if (type2 == 3) { int32_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (unsigned int)tmp; } else { LOG.printLOG("Error: Invalid type for integer size.\n"); exit(0); } } } void entry::skip_section(unsigned int *line_position, const vector &line) { unsigned int type, size; get_type(line_position, line, type, size); if ( (type == 1) || (type == 7) ) *line_position += sizeof(int8_t)*size; else if (type == 2) *line_position += sizeof(int16_t)*size; else if ( (type == 3) || (type == 5) ) *line_position += sizeof(int32_t)*size; } bool entry::check_missing(unsigned int line_position, const unsigned int type, const vector &line) { static char missing_float[4] = {static_cast(0x01), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; static char missing_int1 = static_cast(0x80); static char missing_int2[2] = {static_cast(0x00), static_cast(0x80)}; static char missing_int3[4] = {static_cast(0x00), static_cast(0x00), static_cast(0x00), static_cast(0x80)}; char test_char; bool missing = true; if (type==1) { test_char = *reinterpret_cast(&line[line_position]); missing = (test_char == missing_int1); } else if (type==2) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_int2[ui]) { missing = false; 
break; } line_position += sizeof(char); } } else if (type==3) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_int3[ui]) { missing = false; break; } line_position += sizeof(char); } } else if (type==5) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_float[ui]) { missing = false; break; } line_position += sizeof(char); } } else if (type==7) missing = false; return missing; } bool entry::check_end(unsigned int line_position, const unsigned int type, const vector &line) { static char end_float[4] = {static_cast(0x02), static_cast(0x00), static_cast(0x80), static_cast(0x7F)}; static char end_int1 = static_cast(0x81); static char end_int2[2] = {static_cast(0x01), static_cast(0x80)}; static char end_int3[4] = {static_cast(0x01), static_cast(0x00), static_cast(0x00), static_cast(0x80)}; char test_char; bool end = true; if (type==1) { test_char = *reinterpret_cast(&line[line_position]); end = (test_char == end_int1); } else if (type==2) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != end_int2[ui]) { end = false; break; } line_position += sizeof(char); } } else if (type==3) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != end_int3[ui]) { end = false; break; } line_position += sizeof(char); } } else if (type==5) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != end_float[ui]) { end = false; break; } line_position += sizeof(char); } } else if (type==7) end = false; return end; } void entry::get_number(uint32_t &out, unsigned int *line_position, const vector& line) { memcpy(&out, &line[*line_position], sizeof(out)); *line_position += sizeof(out); } vcftools-0.1.15/src/cpp/entry.h000066400000000000000000000225501307140004000163110ustar00rootroot00000000000000/* * entry.h * * Created on: Dec 12, 2012 * Author: amarcketta */ #ifndef ENTRY_H_ #define ENTRY_H_ #include #include #include #include #include "header.h" #include "bgzf.h" #include "output_log.h" #include 
"parameters.h" using namespace std; extern output_log LOG; class entry { public: virtual ~entry() {}; unsigned int N_indv; bool passed_filters; header entry_header; vector include_indv; vector include_genotype; virtual void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false) = 0; virtual void parse_full_entry(bool parse_FORMAT=true) = 0; virtual void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; virtual void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; virtual void reset(const vector &data_line) = 0; int apply_filters(const parameters ¶ms); void filter_sites(const set &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false); void filter_sites_to_keep(const set &snps_to_keep, const string &snps_to_keep_file); void filter_sites_to_exclude(const string &snps_to_exclude_file); void filter_sites_by_position(const string &chr, int start_pos, int end_pos); void filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file); void filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file); void filter_sites_by_chromosome(const set &chrs_to_keep, const set &chrs_to_exclude); void filter_sites_by_quality(double min_quality); void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth); void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_non_ref_af_any, double max_non_ref_af_any, double min_site_call_rate); void filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels); void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double min_non_ref_ac_any, double max_non_ref_ac_any, double max_missing_call_count); void 
filter_sites_by_number_of_alleles(int min_alleles, int max_alleles); void filter_sites_by_HWE_pvalue(double min_HWE_pvalue); void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false); void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0); void filter_sites_by_filter_status(const set &filter_flags_to_remove, const set &filter_flags_to_keep, bool remove_all = false); void filter_sites_by_phase(); void filter_sites_by_thinning(int min_SNP_distance); void filter_sites_by_INFO(const set &flags_to_remove, const set &flags_to_keep); void filter_genotypes_by_quality_value(double min_genotype_quality); void filter_genotypes_by_depth_range(int min_depth, int max_depth); void filter_genotypes_by_filter_flag(const set &filter_flags_to_remove, bool remove_all = false); string get_CHROM() const; void get_CHROM(string &out) const; int get_POS() const; string get_ID() const; string get_REF() const; string get_ALT() const; string get_ALT_allele(int allele_num) const; void get_allele(int allele_num, string &out) const; string get_allele(int allele_num) const; void get_alleles_vector(vector &out) const; string get_FILTER() const; void get_FILTER_vector(vector &out) const; double get_QUAL() const; string get_INFO(const set &INFO_to_keep, bool keep_all_INFO=false) const; string get_INFO_value(const string &key) const; vector get_INFO_values(const string &key) const; string get_FORMAT() const; void get_indv_GENOTYPE_ids(unsigned int indv, pair &out) const; void get_indv_GENOTYPE_strings(unsigned int indv, pair &out) const; char get_indv_PHASE(unsigned int indv) const; double get_indv_GQUALITY(unsigned int indv) const; int get_indv_DEPTH(unsigned int indv) const; void get_indv_GFILTER(unsigned int indv, string &out) const; void get_indv_GFILTER_vector(unsigned int indv, vector &out) const; int get_indv_ploidy(unsigned int indv) const; bool is_SNP() const; bool is_biallelic_SNP() const; bool is_diploid() const; 
virtual void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out) = 0; bool FORMAT_id_exists(const string &FORMAT_id); void get_allele_counts(vector &out, unsigned int &N_non_missing_chr_out) const; void get_allele_counts(vector &out, unsigned int &N_non_missing_chr_out, const vector &include_indv, const vector &include_genotype) const; void get_genotype_counts(unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const; void get_genotype_counts(const vector &include_indv, const vector &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const; void get_multiple_genotype_counts(const vector &include_indv, const vector &include_genotype, vector &out_N_hom, vector &out_N_het) const; unsigned int get_N_alleles() const; unsigned int get_N_chr() const; void get_POS_binary(vector &out) const; void get_ID_binary(vector &out); void get_rlen(vector &out) const; void get_QUAL_binary(vector &out) const; void get_n_allele_info(vector &out) const; void get_n_fmt_sample(vector &out) const; void get_ALLELES_binary(vector &out); vector > get_INFO_vector(const set &INFO_to_keep, bool keep_all_INFO=false); void get_FORMAT_binary(vector &out) const; string get_typed_string( unsigned int * line_position, const vector& line ); void get_type(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size); vector get_int_vector(unsigned int * line_position, const vector& line); int get_typed_int(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size); void get_number(uint32_t &out, unsigned int * line_position, const vector& line); void make_typed_string(vector &out, const string &in, bool typed); void make_typed_int(vector &out, const int &in, bool typed); void make_int(vector &out, const int &in, int type); void make_typed_int_vector(vector &out, const vector &in, int number = -1); void make_typed_int_vector(vector &out, const string 
&in, int number = -1); void make_typed_int_vector(vector &out, const vector &in); void make_typed_float_vector(vector &out, const string &in, int number = -1); void make_typed_float_vector(vector &out, const vector &in, int number = -1); void make_typed_string_vector(vector &out, const vector &in, int number = -1); void make_typed_GT_vector(vector &out, vector &in); void make_type_size(vector &out, const unsigned int &type, const unsigned int &size); void encode_genotype(vector &out, string &in, int exp_size); void skip_section(unsigned int *line_position, const vector &line); bool check_missing(unsigned int line_position, const unsigned int type, const vector &line); bool check_end(unsigned int line_position, const unsigned int type, const vector &line); void add_ALT_allele(const string &in); virtual void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false) = 0; virtual void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false) = 0; virtual void filter_genotypes_by_depth(int min_depth, int max_depth) = 0; virtual void filter_genotypes_by_quality(double min_genotype_quality) = 0; virtual void filter_genotypes_by_filter_status(const set &filter_flags_to_remove, bool remove_all = false) = 0; static void SNPHWE(int obs_hets, int obs_hom1, int obs_hom2, double &p_hwe, double &p_lo, double &p_hi); static set local_snps_to_keep; static set snps_to_exclude; static vector< set > keep_positions; static vector< set > exclude_positions; static map chr_to_idx; static vector< deque > > lims; static ifstream mask; static string mask_chr; static string mask_line; static int mask_pos; static string thin_chrom; static int thin_pos; protected: istringstream data_stream; vector line; bool basic_parsed; bool fully_parsed; bool parsed_ALT; bool parsed_FILTER; bool parsed_INFO; bool parsed_FORMAT; bool parsed_FORMAT_binary; string CHROM; int POS; string ID; string REF; vector ALT; double QUAL; vector FILTER; vector > INFO; vector FORMAT; vector 
FORMAT_binary; int N_INFO_removed; int N_FORMAT_removed; vector< pair > GENOTYPE; vector ploidy; vector PHASE; vector GQUALITY; vector DEPTH; vector< vector > GFILTER; vector parsed_GT; vector parsed_GQ; vector parsed_DP; vector parsed_FT; map FORMAT_to_idx; int GT_idx; int GQ_idx; int DP_idx; int FT_idx; void set_indv_DEPTH(unsigned int indv, int in); vector FORMAT_positions, FORMAT_types, FORMAT_sizes, FORMAT_skip, FORMAT_keys; }; #endif /* ENTRY_H_ */ vcftools-0.1.15/src/cpp/entry_filters.cpp000066400000000000000000000675061307140004000204060ustar00rootroot00000000000000/* * entry_filters.cpp * * Created on: Aug 9, 2013 * Author: amarcketta */ #include "entry.h" set entry::local_snps_to_keep; set entry::snps_to_exclude; vector< set > entry::keep_positions; vector< set > entry::exclude_positions; map entry::chr_to_idx; vector< deque > > entry::lims; ifstream entry::mask; string entry::mask_chr; string entry::mask_line; int entry::mask_pos; string entry::thin_chrom; int entry::thin_pos; int entry::apply_filters(const parameters ¶ms) { if (line.empty()) { passed_filters = false; return 0; } // Apply all filters in turn. filter_sites_by_allele_type(params.keep_only_indels, params.remove_indels); filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file); filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites); string chr_to_keep = ""; if (params.chrs_to_keep.size() == 1) chr_to_keep = *(params.chrs_to_keep.begin()); // Get first chromosome in list (there should only be one). 
filter_sites_by_position(chr_to_keep, params.start_pos, params.end_pos); filter_sites_by_positions(params.positions_file, params.exclude_positions_file); filter_sites_by_overlap_positions(params.positions_overlap_file, params.exclude_positions_overlap_file); filter_sites_by_chromosome(params.chrs_to_keep, params.chrs_to_exclude); filter_sites_by_BED_file(params.BED_file, params.BED_exclude); filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles); filter_sites_by_INFO(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep); filter_sites_by_quality(params.min_quality); filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth); filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value); if (params.phased_only == true) filter_sites_by_phase(); filter_genotypes_by_quality_value(params.min_genotype_quality); filter_genotypes_by_depth_range(params.min_genotype_depth, params.max_genotype_depth); filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes); filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_non_ref_af_any, params.max_non_ref_af_any, params.min_site_call_rate); filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.min_non_ref_ac_any, params.max_non_ref_ac_any, params.max_missing_call_count); filter_sites_by_HWE_pvalue(params.min_HWE_pvalue); filter_sites_by_thinning(params.min_interSNP_distance); return 1; } void entry::filter_genotypes_by_quality_value(double min_genotype_quality) { // Filter genotypes by quality if (passed_filters == false) return; if (min_genotype_quality <= 0) return; parse_genotype_entries(false, true); if (entry_header.has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Quality."); filter_genotypes_by_quality(min_genotype_quality); } 
void entry::filter_genotypes_by_depth_range(int min_depth, int max_depth) { // Filter genotypes by depth if (passed_filters == false) return; if ((min_depth <= 0) && (max_depth == numeric_limits::max())) return; if (entry_header.has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Depth."); parse_genotype_entries(false, false, true); filter_genotypes_by_depth(min_depth, max_depth); } void entry::filter_genotypes_by_filter_flag(const set &filter_flags_to_remove, bool remove_all) { // Filter genotypes by Filter Flags if (passed_filters == false) return; if ((remove_all == false) && (filter_flags_to_remove.empty())) return; parse_genotype_entries(false, false, false, true); if (entry_header.has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Filter Flag."); filter_genotypes_by_filter_status(filter_flags_to_remove, remove_all); } void entry::filter_sites(const set &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude) { // Filter sites by user provided lists if (keep_then_exclude) { filter_sites_to_keep(snps_to_keep, snps_to_keep_file); filter_sites_to_exclude(snps_to_exclude_file); } else { filter_sites_to_exclude(snps_to_exclude_file); filter_sites_to_keep(snps_to_keep, snps_to_keep_file); } } void entry::filter_sites_to_keep(const set &snps_to_keep, const string &snps_to_keep_file) { // Filter sites by user provided list if(passed_filters == false) return; if ((snps_to_keep.empty()) && (snps_to_keep_file == "")) return; if (snps_to_keep_file != "" && local_snps_to_keep.empty()) { ifstream in(snps_to_keep_file.c_str()); string tmp; local_snps_to_keep = snps_to_keep; if (!in.is_open()) { LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0); } while (!in.eof()) { in >> tmp; local_snps_to_keep.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } parse_basic_entry(); if ( 
(local_snps_to_keep.find(ID) == local_snps_to_keep.end()) && (snps_to_keep.find(ID) == snps_to_keep.end()) ) passed_filters = false; } void entry::filter_sites_to_exclude(const string &snps_to_exclude_file) { // Filter sites by user provided list if(passed_filters == false) return; if (snps_to_exclude_file == "") return; if (snps_to_exclude_file != "" && snps_to_exclude.empty()) { ifstream in(snps_to_exclude_file.c_str()); string tmp; if (!in.is_open()) { LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0); } while (!in.eof()) { in >> tmp; snps_to_exclude.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } parse_basic_entry(); if (snps_to_exclude.find(ID) != snps_to_exclude.end()) passed_filters = false; } void entry::filter_sites_by_quality(double min_quality) { // Filter sites by quality if (passed_filters == false) return; if (min_quality < 0) return; parse_basic_entry(true); string alt_allele = get_ALT_allele(0); // The QUAL field has different definitions depending on the state of the // alternative allele. Here I treat them separately, although in this case // it is unnecessary. 
if ((alt_allele == ".") || (alt_allele == "")) { // The case that the alternative allele is unknown // QUAL is -10log_10 p(variant) if (QUAL < min_quality) passed_filters = false; } else { // The normal case // QUAL is -10log_10 p(no variant) if (QUAL < min_quality) passed_filters = false; } } void entry::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth) { // Filter sites by mean depth if (passed_filters == false) return; if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits::max())) return; int depth; unsigned int N_indv_included = 0; double depth_sum = 0.0; for (unsigned int ui=0; ui= 0) { depth_sum += depth; } N_indv_included++; } } double mean_depth = depth_sum / N_indv_included; if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth)) passed_filters = false; } void entry::filter_sites_by_position(const string &chr, int start_pos, int end_pos) { // Filter sites by user provided position range if (passed_filters == false) return; if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits::max()))) return; parse_basic_entry(); if (CHROM == chr) { if ((POS < start_pos) || (POS > end_pos)) passed_filters = false; } else passed_filters = false; } void entry::filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file) { // Filter sites by a user defined file containing a list of positions if (passed_filters == false) return; if ((positions_file == "") && (exclude_positions_file == "")) return; int idx; if (keep_positions.empty() && positions_file != "") { string chr; int pos1; unsigned int N_chr=chr_to_idx.size(); stringstream ss; string line; unsigned int gzMAX_LINE_LEN = 1024*1024; char *gz_readbuffer = new char[gzMAX_LINE_LEN]; gzFile gz_in = gzopen(positions_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + positions_file); keep_positions.resize(N_chr); while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, 
gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } gzclose(gz_in); delete [] gz_readbuffer; } if (exclude_positions.empty() && exclude_positions_file != "") { string chr; int pos1; unsigned int N_chr=chr_to_idx.size(); stringstream ss; string line; unsigned int gzMAX_LINE_LEN = 1024*1024; char *gz_readbuffer = new char[gzMAX_LINE_LEN]; gzFile gz_in = gzopen(exclude_positions_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + exclude_positions_file); exclude_positions.resize(N_chr); while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); exclude_positions.resize(N_chr); } idx = chr_to_idx[chr]; exclude_positions[idx].insert(pos1); } gzclose(gz_in); delete [] gz_readbuffer; } parse_basic_entry(); if (!keep_positions.empty()) { // Check to see if position is in keep list if (chr_to_idx.find(CHROM) == chr_to_idx.end()) passed_filters = false; else { idx = chr_to_idx[CHROM]; if (keep_positions[idx].find(POS) == keep_positions[idx].end()) passed_filters = false; } } if (!exclude_positions.empty()) { // Check to see if position is in exclude list if (chr_to_idx.find(CHROM) != chr_to_idx.end()) { idx = chr_to_idx[CHROM]; if (exclude_positions[idx].find(POS) != exclude_positions[idx].end()) passed_filters = false; } } } void entry::filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file) { // Filter sites by overlapping with a user defined file containing a list of positions if (passed_filters == false) return; if ((positions_overlap_file == "") && (exclude_positions_overlap_file == "")) return; int idx; if (keep_positions.empty() && positions_overlap_file != "") { string chr; int pos1; unsigned int N_chr=chr_to_idx.size(); stringstream ss; string line; unsigned int gzMAX_LINE_LEN = 1024*1024; char *gz_readbuffer = new char[gzMAX_LINE_LEN]; gzFile gz_in = gzopen(positions_overlap_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + positions_overlap_file); while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } gzclose(gz_in); delete [] gz_readbuffer; } if (exclude_positions.empty() && exclude_positions_overlap_file != "") { string chr; int pos1; unsigned int N_chr=0; stringstream ss; string line; unsigned int gzMAX_LINE_LEN = 1024*1024; char *gz_readbuffer = new char[gzMAX_LINE_LEN]; gzFile gz_in = gzopen(exclude_positions_overlap_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + exclude_positions_overlap_file); while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); exclude_positions.resize(N_chr); } idx = chr_to_idx[chr]; exclude_positions[idx].insert(pos1); } gzclose(gz_in); delete [] gz_readbuffer; } parse_basic_entry(); if (!keep_positions.empty()) { // Check to see if position is in keep list if (chr_to_idx.find(CHROM) == chr_to_idx.end()) passed_filters = false; else { idx = chr_to_idx[CHROM]; bool found=false; for (unsigned int ui=POS; ui<(POS+REF.size()); ui++) if (keep_positions[idx].find(ui) != keep_positions[idx].end()) { found = true; break; } if (found == false) passed_filters = false; } } if (!exclude_positions.empty()) { // Check to see if position is in exclude list if (chr_to_idx.find(CHROM) != chr_to_idx.end()) { idx = chr_to_idx[CHROM]; bool found=false; for (unsigned int ui=POS; ui<(POS+REF.size()); ui++) if (exclude_positions[idx].find(ui) != exclude_positions[idx].end()) found = true; if (found == true) passed_filters = false; } } } void entry::filter_sites_by_chromosome(const set &chrs_to_keep, const set &chrs_to_exclude) { if (passed_filters == false) return; if (chrs_to_keep.empty() && chrs_to_exclude.empty()) return; parse_basic_entry(); if (!chrs_to_keep.empty()) { if (chrs_to_keep.find(CHROM) == chrs_to_keep.end()) passed_filters = false; } else { if (chrs_to_exclude.find(CHROM) != chrs_to_exclude.end()) passed_filters = false; } } void entry::filter_sites_by_BED_file(const string &bed_file, bool BED_exclude) { // Filter sites depending on positions in a BED file. 
if (passed_filters == false) return; if (bed_file == "") return; int pos1, pos2, idx; if (lims.empty()) { ifstream BED(bed_file.c_str()); if (!BED.is_open()) LOG.error("Could not open BED file: " + bed_file); string chr; unsigned int N_chr=chr_to_idx.size(); BED.ignore(numeric_limits::max(), '\n'); // Ignore header unsigned int N_BED_entries=0; while (!BED.eof()) { BED >> chr >> pos1 >> pos2; BED.ignore(numeric_limits::max(), '\n'); if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); lims.resize(N_chr); } idx = chr_to_idx[chr]; lims[idx].push_back(make_pair(pos1,pos2)); N_BED_entries++; } BED.close(); LOG.printLOG("\tRead " + output_log::int2str(N_BED_entries) + " BED file entries.\n"); for (unsigned int ui=0; ui min_ui(lims.size(), 0); parse_basic_entry(true); pos1 = POS; pos2 = pos1; unsigned int N_alleles = get_N_alleles(); for (int i=0; i<(int)N_alleles; i++) pos2 = max(pos2, (int)(pos1 + get_allele(i).length() - 1)); if (BED_exclude == false) { // Exclude sites not in BED file if (chr_to_idx.find(CHROM) == chr_to_idx.end()) passed_filters = false; else { idx = chr_to_idx[CHROM]; bool found=false; unsigned int max_ui = lims[idx].size(); for (unsigned int ui=min_ui[idx]; ui lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == false) passed_filters = false; } } else { // Exclude sites in BED file if (chr_to_idx.find(CHROM) != chr_to_idx.end()) { idx = chr_to_idx[CHROM]; bool found=false; unsigned int max_ui = lims[idx].size(); for (unsigned int ui=min_ui[idx]; ui lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside 
bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == true) passed_filters = false; } } } void entry::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value) { // Filter sites on the basis of a fasta-like mask file. if (passed_filters == false || mask_file == "") return; if (!mask.is_open()) { mask.open(mask_file.c_str()); mask_chr = ""; mask_line = ""; mask_pos = 1; if (!mask.is_open()) LOG.error("Could not open mask file: " + mask_file); } string line; string next_chr=""; unsigned int next_pos = 0; parse_basic_entry(); next_chr = CHROM; while (mask_chr != next_chr && !mask.eof()) { getline(mask, line); line.erase( line.find_last_not_of(" \t") + 1); if (line[0] == '>') { mask_chr = line.substr(1, line.find_first_of(" \t")-1); mask_pos = 1; getline(mask, line); mask_line = line; } } if (next_chr == mask_chr) next_pos = (unsigned)POS; else { passed_filters = false; return; } while (next_pos > (mask_pos + mask_line.size()) && !mask.eof()) { getline(mask, line); line.erase( line.find_last_not_of(" \t") + 1); if (line[0] == '>') { mask_chr = line.substr(1, line.find_first_of(" \t")-1); mask_pos = 1; passed_filters = false; return; } else { mask_pos += mask_line.size(); mask_line = line; } } if (next_chr == mask_chr && next_pos <= (mask_pos+mask_line.size())) { char mask_base = mask_line[next_pos-mask_pos]-48; bool keep = (mask_base <= min_kept_mask_value); if (invert_mask == true) keep = !keep; if (keep == false) passed_filters = false; } else passed_filters = false; } void entry::filter_sites_by_number_of_alleles(int min_alleles, int max_alleles) { // Filter sites by the number of alleles (e.g. 
2 for bi-allelic) if (passed_filters == false) return; if ((min_alleles <= 0) && (max_alleles == numeric_limits::max())) return; int N_alleles; parse_basic_entry(true); N_alleles = get_N_alleles(); if ((N_alleles < min_alleles) || (N_alleles > max_alleles)) passed_filters = false; } void entry::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_non_ref_af_any, double max_non_ref_af_any, double min_site_call_rate) { // Filter sites so that all allele frequencies are between limits if (passed_filters == false) return; if ((min_maf <= 0.0) && (max_maf >= 1.0) && (min_site_call_rate <= 0) && (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0) && (min_non_ref_af_any <= 0.0) && (max_non_ref_af_any >= 1.0)) return; unsigned int N_alleles; unsigned int N_non_missing_chr; parse_basic_entry(true); parse_genotype_entries(true); if (GT_idx == -1) LOG.error("Require Genotypes in variant file to filter by frequency and/or call rate"); N_alleles = get_N_alleles(); vector allele_counts; get_allele_counts(allele_counts, N_non_missing_chr); double freq, folded_freq; double maf=numeric_limits::max(); int N_failed = 0; for (unsigned int ui=0; ui 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af))) passed_filters = false; if ((ui > 0) && ((freq < min_non_ref_af_any) || (freq > max_non_ref_af_any))) N_failed++; } if (((min_non_ref_af > 0.0) || (max_non_ref_af < 1.0)) && (N_failed == (N_alleles-1))) passed_filters = false; if ((maf < min_maf) || (maf > max_maf)) passed_filters = false; double call_rate = N_non_missing_chr / double(get_N_chr()); if (call_rate < min_site_call_rate) passed_filters = false; } void entry::filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels) { if (passed_filters == false) return; if ((keep_only_indels == false) && (remove_indels == false)) return; if ((keep_only_indels == true) && (remove_indels == true)) LOG.error("Can't both keep and remove all indels!"); 
string allele; unsigned int ref_len, N_alleles; bool is_indel; parse_basic_entry(true); is_indel = false; allele = REF; ref_len = allele.size(); if (ref_len != 1) is_indel = true; N_alleles = get_N_alleles(); for (unsigned int ui=1; ui::max()) && (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits::max()) && (min_non_ref_ac_any <= 0) && (max_non_ref_ac_any == numeric_limits::max()) && (max_missing_call_count == numeric_limits::max())) return; unsigned int N_alleles, N_chr, N_non_missing_chr; parse_basic_entry(true); parse_genotype_entries(true); if (entry_header.has_genotypes == false) LOG.error("Require Genotypes in variant file to filter by allele counts and/or missing data"); N_alleles = get_N_alleles(); if (N_alleles <= 1 && min_mac > 0) passed_filters = false; vector allele_counts; get_allele_counts(allele_counts, N_non_missing_chr); N_chr = get_N_chr(); int mac = numeric_limits::max(); int N_failed = 0; for (unsigned int ui=0; ui 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac))) passed_filters = false; if ((ui > 0) && ((allele_counts[ui] < min_non_ref_ac_any) || (allele_counts[ui] > max_non_ref_ac_any))) N_failed++; } if (((min_non_ref_ac_any > 0) || (max_non_ref_ac_any < numeric_limits::max())) && (N_failed == (N_alleles-1))) passed_filters = false; if ((mac < min_mac) || (mac > max_mac)) passed_filters = false; if ((N_chr-N_non_missing_chr) > max_missing_call_count) passed_filters = false; } void entry::filter_sites_by_HWE_pvalue(double min_HWE_pvalue) { // Filter sites by HWE p-value // Note this assumes Biallelic SNPs. 
if(passed_filters == false) return; if (min_HWE_pvalue <= 0) return; unsigned int b11, b12, b22; double p_hwe, p_lo, p_hi; parse_basic_entry(true); parse_genotype_entries(true); if (entry_header.has_genotypes == false) LOG.error("Require Genotypes in variant file to filter sites by HWE."); get_genotype_counts(b11, b12, b22); entry::SNPHWE(b12, b11, b22, p_hwe, p_lo, p_hi); if (p_hwe < min_HWE_pvalue) passed_filters = false; } void entry::filter_sites_by_filter_status(const set &filter_flags_to_remove, const set &filter_flags_to_keep, bool remove_all) { // Filter sites by entries in the FILTER field. if (passed_filters == false) return; if ((remove_all == false) && (filter_flags_to_remove.empty()) && (filter_flags_to_keep.empty())) return; vector FILTERs; unsigned int N_to_remove = filter_flags_to_remove.size(); unsigned int N_to_keep = filter_flags_to_keep.size(); parse_basic_entry(false, true); get_FILTER_vector(FILTERs); if (N_to_keep > 0) { bool keep = false; for (unsigned int ui=0; ui= 1) && (FILTERs[0] == "PASS") ) return; else if ((remove_all == true) && (!FILTERs.empty())) passed_filters = false; else if (N_to_remove > 0) { for (unsigned int ui=0; ui 0) passed_filters = false; } void entry::filter_sites_by_thinning(int min_SNP_distance) { // Filter sites so that no two SNPs are within some minimum distance if (passed_filters == false) return; if (min_SNP_distance < 1) return; parse_basic_entry(); if (CHROM == thin_chrom) { int distance_from_last_SNP = POS - thin_pos; if (distance_from_last_SNP < min_SNP_distance) passed_filters = false; } if (passed_filters == true) thin_pos = POS; thin_chrom = CHROM; } void entry::filter_sites_by_INFO(const set &flags_to_remove, const set &flags_to_keep) { // Filter sites by entries in the INFO field. 
if (passed_filters == false) return; if ((flags_to_remove.empty()) && (flags_to_keep.empty())) return; string value; unsigned int N_to_remove = flags_to_remove.size(); unsigned int N_to_keep = flags_to_keep.size(); parse_basic_entry(false, false, true); if (N_to_keep > 0) { bool keep = false; for (set::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it) { if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag) LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly."); else { value = get_INFO_value(*it); if (value == "1") keep = true; } } passed_filters = keep; } if (passed_filters==false) return; if (N_to_remove > 0) { for (set::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it) { if (entry_header.INFO_map[ entry_header.INFO_reverse_map[*it] ].Type != Flag) LOG.error("Using INFO flag filtering on non flag type "+*it+" will not work correctly."); else { value = get_INFO_value(*it); if (value == "1") { passed_filters = false; continue; } } } } } vcftools-0.1.15/src/cpp/entry_getters.cpp000066400000000000000000000266371307140004000204130ustar00rootroot00000000000000/* entry_getters.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "entry.h" // Return the CHROMosome name string entry::get_CHROM() const { return CHROM; } // Return the CHROMosome name void entry::get_CHROM(string &out) const { out = CHROM; } int entry::get_POS() const { return POS; } string entry::get_ID() const { if (ID.size() == 0) return "."; return ID; } string entry::get_REF() const { if (REF == "") return "."; else return REF; } string entry::get_ALT() const { assert(parsed_ALT == true); string out; if (ALT.empty()) out = "."; else if (ALT.size() == 1 && ALT[0] == "") out = "."; else { out = ALT[0]; for (unsigned int ui=1; ui= ALT.size())) out = "."; else out = ALT[allele_num-1]; } string entry::get_allele(int allele_num) const { assert(parsed_ALT == true); if (allele_num 
== -2) return ""; else if (allele_num == 0) return REF; else if ((allele_num < 0) || (unsigned(allele_num - 1) >= ALT.size())) return "."; else return ALT[allele_num-1]; } string entry::get_ALT_allele(int allele_num) const { assert(parsed_ALT == true); if (allele_num == -2) return ""; else if ((allele_num == -1) || (unsigned(allele_num) >= ALT.size())) return "."; return ALT[allele_num]; } void entry::get_alleles_vector(vector &out) const { assert(parsed_ALT == true); out.resize(ALT.size()+1); out[0] = REF; copy(ALT.begin(), ALT.end(), out.begin()+1); } double entry::get_QUAL() const { return QUAL; } string entry::get_FILTER() const { assert(parsed_FILTER == true); ostringstream out; if (FILTER.empty()) out << "."; else { out << FILTER[0]; for (unsigned int ui=1; ui &out) const { assert(parsed_FILTER == true); out = FILTER; } string entry::get_INFO(const set &INFO_to_keep, bool keep_all_INFO) const { assert(parsed_INFO == true); ostringstream sout; sout.str(""); sout.clear(); bool first=true; if ( ( (!INFO.empty()) && (!INFO_to_keep.empty()) ) || keep_all_INFO ) { string key; for (unsigned int ui=0; ui > entry::get_INFO_vector(const set &INFO_to_keep, bool keep_all_INFO) { assert(parsed_INFO == true); vector > out_vector; if (keep_all_INFO == true) return INFO; if ( (!INFO.empty()) && (!INFO_to_keep.empty()) ) { string key; for (unsigned int ui=0; ui entry::get_INFO_values(const string &key) const { vector out; string tmp; tmp = get_INFO_value(key); if (tmp != "?") header::tokenize(tmp, ',', out); return out; } string entry::get_FORMAT() const { assert(parsed_FORMAT == true); string out; bool first = true; for (unsigned int ui=0; ui &out) const { assert(parsed_FORMAT_binary == true); out = FORMAT_binary; } // Return the alleles of a genotype as a pair of strings. 
void entry::get_indv_GENOTYPE_strings(unsigned int indv, pair &out) const { assert(parsed_GT[indv] == true); static string out_allele1, out_allele2; get_allele(GENOTYPE[indv].first, out_allele1); get_allele(GENOTYPE[indv].second, out_allele2); out = make_pair(out_allele1, out_allele2); } void entry::get_indv_GENOTYPE_ids(unsigned int indv, pair &out) const { assert(parsed_GT[indv] == true); out = GENOTYPE[indv]; } char entry::get_indv_PHASE(unsigned int indv) const { assert(parsed_GT[indv] == true); return PHASE[indv]; } int entry::get_indv_DEPTH(unsigned int indv) const { assert(parsed_DP[indv] == true); if (DEPTH.empty()) return -1; return DEPTH[indv]; } double entry::get_indv_GQUALITY(unsigned int indv) const { assert(parsed_GQ[indv] == true); if (GQUALITY.empty()) return -1; return GQUALITY[indv]; } void entry::get_indv_GFILTER_vector(unsigned int indv, vector &out) const { assert(parsed_FT[indv] == true); if (!GFILTER.empty()) out = GFILTER[indv]; else out.resize(0); } void entry::get_indv_GFILTER(unsigned int indv, string &out) const { assert(parsed_FT[indv] == true); if ((!GFILTER.empty()) && (GFILTER[indv].size()>0)) { out=""; for (unsigned int ui=0; ui &out, unsigned int &N_non_missing_chr_out) const { get_allele_counts(out, N_non_missing_chr_out, include_indv, include_genotype); } // Return the frequency (counts) of each allele. 
void entry::get_allele_counts(vector &out, unsigned int &N_non_missing_chr_out, const vector &include_indv, const vector &include_genotype) const { pair genotype; vector allele_counts(get_N_alleles(), 0); N_non_missing_chr_out = 0; for (unsigned int ui=0; ui -1) { allele_counts[genotype.first]++; N_non_missing_chr_out++; } if (genotype.second > -1) { allele_counts[genotype.second]++; N_non_missing_chr_out++; } } } out = allele_counts; } void entry::get_genotype_counts(const vector &include_indv, const vector &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const { out_N_hom1 = 0; out_N_hom2 = 0; out_N_het = 0; pair genotype; if (ALT.size() > 1) LOG.error("Tried to return the genotype counts of a non-biallelic SNP", 99); for (unsigned int ui=0; ui -1) && (genotype.second > -1)) { if (genotype.first != genotype.second) out_N_het++; else if (genotype.first == 0) out_N_hom1++; else if (genotype.first == 1) out_N_hom2++; else LOG.error("Unknown allele in genotype", 98); } } } } void entry::get_multiple_genotype_counts(const vector &include_indv, const vector &include_genotype, vector &out_N_hom, vector &out_N_het) const { out_N_hom.assign(ALT.size()+1, 0); out_N_het.assign(ALT.size()+1, 0); pair genotype; for (unsigned int ui=0; ui &out) const { out.resize(sizeof(uint32_t)); uint32_t pos = POS - 1; memcpy(&out[0], &pos, sizeof(pos)); } void entry::get_rlen(vector &out) const { out.resize(sizeof(int32_t)); int32_t rlen; if (REF != "" and REF != "." 
and REF != " ") rlen = (int32_t)REF.length(); else rlen = (int32_t)0; memcpy(&out[0], &rlen, sizeof(rlen)); } void entry::get_QUAL_binary(vector &out) const { out.resize(sizeof(float)); float qual = (float)QUAL; memcpy(&out[0], &qual, sizeof(qual)); } void entry::get_n_allele_info(vector &out) const { out.resize(sizeof(uint32_t)); uint32_t n_allele_info = (uint32_t)ALT.size() + 1; uint32_t n_info = (uint32_t)(INFO.size()-N_INFO_removed); n_allele_info = n_allele_info << 16; n_allele_info = n_allele_info | n_info; memcpy(&out[0], &n_allele_info, sizeof(n_allele_info)); } void entry::get_n_fmt_sample(vector &out) const { out.resize(sizeof(uint32_t)); uint32_t n_fmt_sample = (uint32_t)(FORMAT.size()-N_FORMAT_removed); uint32_t n_sample = (uint32_t)N_indv; n_fmt_sample = n_fmt_sample << 24; n_fmt_sample = n_fmt_sample | n_sample; memcpy(&out[0], &n_fmt_sample, sizeof(n_fmt_sample)); } void entry::get_ID_binary(vector &out) { make_typed_string(out, ID, true ); } void entry::get_ALLELES_binary(vector &out) { vector tmp; out.resize(0); make_typed_string(tmp, REF, true ); out.insert(out.end(), tmp.begin(), tmp.end()); for (unsigned int ui=0; ui::epsilon(); double FPMIN = numeric_limits::min() / numeric_limits::epsilon(); gln=gammln(a); b=x+1.0-a; c=1.0/FPMIN; d=1.0/b; h=d; for (i=1;;i++) { an = -i*(i-a); b += 2.0; d=an*d+b; if (fabs(d) < FPMIN) d=FPMIN; c=b+an/c; if (fabs(c) < FPMIN) c=FPMIN; d=1.0/d; del=d*c; h *= del; if (fabs(del-1.0) <= EPS) break; } return exp(-x+a*log(x)-gln)*h; } double gser(double a, double x, double &gln) { double sum,del,ap; gln=gammln(a); ap=a; del=sum=1.0/a; for (;;) { ++ap; del *= x/ap; sum += del; if (fabs(del) < fabs(sum)*numeric_limits::epsilon()) { return sum*exp(-x+a*log(x)-gln); } } return 0; } double gammp(double a, double x) { double gamser,gammcf,gln; if (x < 0.0 || a <= 0.0 || (x != x) || (a != a)) { return numeric_limits::quiet_NaN(); } if (x==0.0) return 0.0; if (x < (a+1.0)) { gamser=gser(a,x,gln); return gamser; } else { gammcf = 
gcf(a,x,gln); return 1.0-gammcf; } } double gammq(double a, double x) { double gamser,gammcf,gln; if (x < 0.0 || a <= 0.0 || (x != x) || (a != a)) { return numeric_limits::quiet_NaN(); } if (x == 0.0) return 1.0; if (x < (a+1.0)) { gamser=gser(a,x,gln); return 1.0-gamser; } else { gammcf = gcf(a,x,gln); return gammcf; } } vcftools-0.1.15/src/cpp/gamma.h000066400000000000000000000005741307140004000162340ustar00rootroot00000000000000#ifndef GAMMA_H #define GAMMA_H #include #include #include #include #include #include #include using namespace std; double gammln(double xx); double gcf(double a, double x, double &gln); double gser(double a, double x, double &gln); double gammp(double a, double x); double gammq(double a, double x); #endif vcftools-0.1.15/src/cpp/header.cpp000066400000000000000000000334361307140004000167400ustar00rootroot00000000000000/* * header.cpp * * Created on: Apr 29, 2013 * Author: amarcketta */ #include "header.h" header::header() { has_contigs = false; has_file_format = false; has_genotypes = false; has_header = false; has_idx = false; contig_index = 0; N_indv = 0; } void header::parse_meta(const string &line, unsigned int &line_index) { lines.push_back(line); if (line.compare(0,13,"##fileformat=")==0) { has_file_format = true; string version = line.substr(13); if ((version != "VCFv4.0") && (version != "VCFv4.1") && (version != "VCFv4.2")) LOG.error("VCF version must be v4.0, v4.1 or v4.2:\nYou are using version " + version); } else if (line.compare(0,7,"##INFO=")==0) { // Found an INFO descriptor line_index += add_INFO_descriptor(line.substr(8, line.size()-8), line_index); } else if (line.compare(0,9,"##FILTER=")==0) { // Found a FILTER descriptor line_index += add_FILTER_descriptor(line.substr(10, line.size()-8), line_index); } else if (line.compare(0,9,"##FORMAT=")==0) { // Found a genotype filter descriptor line_index += add_FORMAT_descriptor(line.substr(10, line.size()-8), line_index); } else if (line.compare(0,9,"##contig=")==0) { // Found 
a contig descriptor add_CONTIG_descriptor(line.substr(10, line.size()-8), contig_index); contig_index++; has_contigs = true; } else { Field_description I; size_t found = line.find_first_of("="); I.Field = line.substr(0,found); I.Other = line.substr(found+1); parsed_lines.push_back(I); } } void header::parse_header(const string &line) { // #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) if (has_header == true) LOG.warning("Multiple Header lines."); has_header = true; istringstream header(line); int count = 0; string tmp_str; unsigned int N_header_indv = 0; has_genotypes = false; while (!header.eof()) { getline(header, tmp_str, '\t'); switch (count) { case 0: if (tmp_str != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp_str); break; case 1: if (tmp_str != "POS") LOG.warning("Second Header entry should be POS: " + tmp_str); break; case 2: if (tmp_str != "ID") LOG.warning("Third Header entry should be ID: " + tmp_str); break; case 3: if (tmp_str != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp_str); break; case 4: if (tmp_str != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp_str); break; case 5: if (tmp_str != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp_str); break; case 6: if (tmp_str != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp_str); break; case 7: if (tmp_str != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp_str); break; case 8: if (tmp_str != "FORMAT") LOG.warning("Ninth Header entry should be FORMAT: " + tmp_str); else has_genotypes = true; break; default: { if (count <= 8) LOG.error("Incorrectly formatted header."); indv.push_back(tmp_str); N_header_indv++; } break; } count++; } N_indv = N_header_indv; if ((has_genotypes == true ) && (N_indv == 0)) LOG.warning("FORMAT field without genotypes?"); } int header::add_INFO_descriptor(const string &in, int index) { size_t found_end=in.find_last_of(">"); string details = 
in.substr(0, found_end); Field_description I; I.Field = "INFO"; vector tokens; tokenize(details, ',', tokens); if (tokens.size() < 4) LOG.error("Expected at least 4 parts in INFO definition: " + in); vector entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui entry; for (unsigned int ui=0; ui"; } lines.push_back(new_line.str()); } } void header::reparse() { unsigned int index = 0; has_idx = false; contig_index = 0; vector old_lines(lines.size(),""); copy(lines.begin(), lines.end(), old_lines.begin()); lines.resize(0); INFO_map.clear(); INFO_reverse_map.clear(); FILTER_map.clear(); FILTER_reverse_map.clear(); FORMAT_map.clear(); FORMAT_reverse_map.clear(); CONTIG_map.clear(); CONTIG_reverse_map.clear(); index += add_FILTER_descriptor("ID=PASS,Description=PASS", index); for (unsigned int ui=0; ui &out) { out.resize(0); istringstream ss(in); string tmp; while( getline(ss, tmp, token) ) { out.push_back(tmp); } } void header::split(const string &text, char sep, vector &tokens) { int start = 0, end = 0, idx = 0, max = tokens.size(); while ((end = text.find(sep, start)) != string::npos) { if (idx < max) tokens[idx] = text.substr(start, end - start); else tokens.push_back(text.substr(start, end - start)); start = end + 1; idx++; } if (idx < max) tokens[idx] = text.substr(start); else tokens.push_back(text.substr(start)); } string header::int2str(const int in, const int missing_value) { if (in == missing_value) return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } } int header::str2int(const string &in, const int missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atoi(in.c_str()); } double header::str2double(const string &in, const double missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atof(in.c_str()); } string header::double2str(const double in, const double missing_value) { if (in == missing_value) 
return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } } vcftools-0.1.15/src/cpp/header.h000066400000000000000000000043021307140004000163730ustar00rootroot00000000000000/* * header.h * * Created on: Apr 29, 2013 * Author: amarcketta */ #ifndef HEADER_H_ #define HEADER_H_ #include #include #include #include #include #include "output_log.h" using namespace std; extern output_log LOG; enum Type_enum {Integer=0, Float=1, Character=2, String=3, Flag=4}; class Field_description { public: string Field; string ID; int idx; int N_entries; string N_entries_str; string Type_str; Type_enum Type; string Description; string Length; string Assembly; string Source; string Version; string Other; Field_description() : Field(""), ID(""), idx(-1), N_entries(0), N_entries_str(""), Type_str(""), Type(Integer), Description(""), Length(""), Assembly(""), Source(""), Version(""), Other("") {}; ~Field_description() {}; }; class header { public: unsigned int contig_index; bool has_contigs; bool has_genotypes; bool has_header; bool has_file_format; bool has_idx; vector indv; vector lines; vector parsed_lines; unsigned int N_indv; map INFO_map; map FILTER_map; map FORMAT_map; map CONTIG_map; map CONTIG_reverse_map; map FILTER_reverse_map; map INFO_reverse_map; map FORMAT_reverse_map; header(); ~header() {}; void reprint(); void reparse(); void parse_meta(const string &line, unsigned int &line_index); void parse_header(const string &line); int add_INFO_descriptor(const string &in, int index); int add_FILTER_descriptor(const string &in, int index); int add_FORMAT_descriptor(const string &in, int index); void add_CONTIG_descriptor(const string &in, int index); static void tokenize(const string &in, char token, vector &out); static void split(const string &in, char token, vector &out); static int str2int(const string &in, const int missing_value=-1); static string int2str(const int in, const int missing_value=-1); static double str2double(const string 
&in, const double missing_value=-1.0); static string double2str(const double in, const double missing_value=-1.0); }; #endif /* HEADER_H_ */ vcftools-0.1.15/src/cpp/khash.h000066400000000000000000000445401307140004000162510ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. 
Reference: - http://code.google.com/p/ulib/ - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better performance for random input. Double hashing is still the default as it is more robust to certain non-random input. * Added Wang's integer hash function (not used by default). This hash function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. */ #define AC_VERSION_KHASH_H "0.2.6" #include #include #include /* compipler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER #define inline __inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else #define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) 
#endif #define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ extern kh_##name##_t *kh_init_##name(); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ __KHASH_TYPE(name, khkey_t, khval_t) \ SCOPE kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ free(h->vals); \ free(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ } else return 0; \ } \ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ kroundup32(new_n_buckets); \ if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ inc = __ac_inc(k, new_mask); \ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash 
table */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, 
__hash_equal) \ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! @function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) static inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. 
@param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) /*! @function @abstract Remove a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Iterator to the element to be deleted [khint_t] */ #define kh_del(name, h, k) kh_del_##name(h, k) /*! @function @abstract Test whether a bucket contains data. @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return 1 if containing data; 0 otherwise [int] */ #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) /*! 
@function @abstract Get key given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Key [type of keys] */ #define kh_key(h, x) ((h)->keys[x]) /*! @function @abstract Get value given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Value [type of values] @discussion For hash sets, calling this results in segfault. */ #define kh_val(h, x) ((h)->vals[x]) /*! @function @abstract Alias of kh_val() */ #define kh_value(h, x) ((h)->vals[x]) /*! @function @abstract Get the start iterator @param h Pointer to the hash table [khash_t(name)*] @return The start iterator [khint_t] */ #define kh_begin(h) (khint_t)(0) /*! @function @abstract Get the end iterator @param h Pointer to the hash table [khash_t(name)*] @return The end iterator [khint_t] */ #define kh_end(h) ((h)->n_buckets) /*! @function @abstract Get the number of elements in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of elements in the hash table [khint_t] */ #define kh_size(h) ((h)->size) /*! @function @abstract Get the number of buckets in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of buckets in the hash table [khint_t] */ #define kh_n_buckets(h) ((h)->n_buckets) /* More conenient interfaces */ /*! @function @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_STR(name) \ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */ vcftools-0.1.15/src/cpp/knetfile.c000066400000000000000000000434751307140004000167550ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 by Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Probably I will not do socket programming in the next few years and therefore I decide to heavily annotate this file, for Linux and Windows as well. -ac */ #include #include #include #include #include #include #include #include #ifndef _WIN32 #include #include #include #endif #include "knetfile.h" /* In winsock.h, the type of a socket is SOCKET, which is: "typedef * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed * integer -1. In knetfile.c, I use "int" for socket type * throughout. This should be improved to avoid confusion. * * In Linux/Mac, recv() and read() do almost the same thing. You can see * in the header file that netread() is simply an alias of read(). In * Windows, however, they are different and using recv() is mandatory. */ /* This function tests if the file handler is ready for reading (or * writing if is_read==0). */ static int socket_wait(int fd, int is_read) { fd_set fds, *fdr = 0, *fdw = 0; struct timeval tv; int ret; tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out FD_ZERO(&fds); FD_SET(fd, &fds); if (is_read) fdr = &fds; else fdw = &fds; ret = select(fd+1, fdr, fdw, 0, &tv); #ifndef _WIN32 if (ret == -1) perror("select"); #else if (ret == 0) fprintf(stderr, "select time-out\n"); else if (ret == SOCKET_ERROR) fprintf(stderr, "select: %d\n", WSAGetLastError()); #endif return ret; } #ifndef _WIN32 /* This function does not work with Windows due to the lack of * getaddrinfo() in winsock. 
It is addapted from an example in "Beej's * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ static int socket_connect(const char *host, const char *port) { #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) int on = 1, fd; struct linger lng = { 0, 0 }; struct addrinfo hints, *res = 0; memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; /* In Unix/Mac, getaddrinfo() is the most convenient way to get * server information. */ if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); /* The following two setsockopt() are used by ftplib * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they * necessary. */ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); freeaddrinfo(res); return fd; } #else /* MinGW's printf has problem with "%lld" */ char *int64tostr(char *buf, int64_t x) { int cnt; int i = 0; do { buf[i++] = '0' + x % 10; x /= 10; } while (x); buf[i] = 0; for (cnt = i, i = 0; i < cnt/2; ++i) { int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; } return buf; } int64_t strtoint64(const char *buf) { int64_t x; for (x = 0; *buf != '\0'; ++buf) x = x * 10 + ((int64_t) *buf - 48); return x; } /* In windows, the first thing is to establish the TCP connection. */ int knet_win32_init() { WSADATA wsaData; return WSAStartup(MAKEWORD(2, 2), &wsaData); } void knet_win32_destroy() { WSACleanup(); } /* A slightly modfied version of the following function also works on * Mac (and presummably Linux). However, this function is not stable on * my Mac. It sometimes works fine but sometimes does not. 
Therefore for * non-Windows OS, I do not use this one. */ static SOCKET socket_connect(const char *host, const char *port) { #define __err_connect(func) \ do { \ fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ return -1; \ } while (0) int on = 1; SOCKET fd; struct linger lng = { 0, 0 }; struct sockaddr_in server; struct hostent *hp = 0; // open socket if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); // get host info if (isalpha(host[0])) hp = gethostbyname(host); else { struct in_addr addr; addr.s_addr = inet_addr(host); hp = gethostbyaddr((char*)&addr, 4, AF_INET); } if (hp == 0) __err_connect("gethost"); // connect server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); server.sin_family= AF_INET; server.sin_port = htons(atoi(port)); if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) return fd; } #endif static off_t my_netread(int fd, void *buf, off_t len) { off_t rest = len, curr, l = 0; /* recv() and read() may not read the required length of data with * one call. They have to be called repeatedly. */ while (rest) { if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading curr = netread(fd, (void*)((char*)buf + l), rest); /* According to the glibc manual, section 13.2, a zero returned * value indicates end-of-file (EOF), which should mean that * read() will not return zero if EOF has not been met but data * are not immediately available. 
*/ if (curr == 0) break; l += curr; rest -= curr; } return l; } /************************* * FTP specific routines * *************************/ static int kftp_get_response(knetFile *ftp) { #ifndef _WIN32 unsigned char c; #else char c; #endif int n = 0; char *p; if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O //fputc(c, stderr); if (n >= ftp->max_response) { ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; ftp->response = (char*)realloc(ftp->response, ftp->max_response); } ftp->response[n++] = c; if (c == '\n') { if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) && ftp->response[3] != '-') break; n = 0; continue; } } if (n < 2) return -1; ftp->response[n-2] = 0; return strtol(ftp->response, &p, 0); } static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) { if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); return is_get? 
kftp_get_response(ftp) : 0; } static int kftp_pasv_prep(knetFile *ftp) { char *p; int v[6]; kftp_send_cmd(ftp, "PASV\r\n", 1); for (p = ftp->response; *p && *p != '('; ++p); if (*p != '(') return -1; ++p; sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; return 0; } static int kftp_pasv_connect(knetFile *ftp) { char host[80], port[10]; if (ftp->pasv_port == 0) { fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); return -1; } sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); sprintf(port, "%d", ftp->pasv_port); ftp->fd = socket_connect(host, port); if (ftp->fd == -1) return -1; return 0; } int kftp_connect(knetFile *ftp) { ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); if (ftp->ctrl_fd == -1) return -1; kftp_get_response(ftp); kftp_send_cmd(ftp, "USER anonymous\r\n", 1); kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); kftp_send_cmd(ftp, "TYPE I\r\n", 1); return 0; } int kftp_reconnect(knetFile *ftp) { if (ftp->ctrl_fd != -1) { netclose(ftp->ctrl_fd); ftp->ctrl_fd = -1; } netclose(ftp->fd); ftp->fd = -1; return kftp_connect(ftp); } // initialize ->type, ->host, ->retr and ->size knetFile *kftp_parse_url(const char *fn, const char *mode) { knetFile *fp; char *p; int l; if (strstr(fn, "ftp://") != fn) return 0; for (p = (char*)fn + 6; *p && *p != '/'; ++p); if (*p != '/') return 0; l = p - fn - 6; fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_FTP; fp->fd = -1; /* the Linux/Mac version of socket_connect() also recognizes a port * like "ftp", but the Windows version does not. 
*/ fp->port = strdup("21"); fp->host = (char*)calloc(l + 1, 1); if (strchr(mode, 'c')) fp->no_reconnect = 1; strncpy(fp->host, fn + 6, l); fp->retr = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->retr, "RETR %s\r\n", p); fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->size_cmd, "SIZE %s\r\n", p); fp->seek_offset = 0; return fp; } // place ->fd at offset off int kftp_connect_file(knetFile *fp) { int ret; long long file_size; if (fp->fd != -1) { netclose(fp->fd); if (fp->no_reconnect) kftp_get_response(fp); } kftp_pasv_prep(fp); kftp_send_cmd(fp, fp->size_cmd, 1); #ifndef _WIN32 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) { fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); return -1; } #else const char *p = fp->response; while (*p != ' ') ++p; while (*p < '0' || *p > '9') ++p; file_size = strtoint64(p); #endif fp->file_size = file_size; if (fp->offset>=0) { char tmp[32]; #ifndef _WIN32 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); #else strcpy(tmp, "REST "); int64tostr(tmp + 5, fp->offset); strcat(tmp, "\r\n"); #endif kftp_send_cmd(fp, tmp, 1); } kftp_send_cmd(fp, fp->retr, 0); kftp_pasv_connect(fp); ret = kftp_get_response(fp); if (ret != 150) { fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); netclose(fp->fd); fp->fd = -1; return -1; } fp->is_ready = 1; return 0; } /************************** * HTTP specific routines * **************************/ knetFile *khttp_parse_url(const char *fn, const char *mode) { knetFile *fp; char *p, *proxy, *q; int l; if (strstr(fn, "http://") != fn) return 0; // set ->http_host for (p = (char*)fn + 7; *p && *p != '/'; ++p); l = p - fn - 7; fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->http_host = (char*)calloc(l + 1, 1); strncpy(fp->http_host, fn + 7, l); fp->http_host[l] = 0; for (q = fp->http_host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; // get http_proxy proxy = getenv("http_proxy"); // set ->host, ->port and ->path if (proxy == 0) { fp->host = strdup(fp->http_host); // 
when there is no proxy, server name is identical to http_host name. fp->port = strdup(*q? q : "80"); fp->path = strdup(*p? p : "/"); } else { fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); for (q = fp->host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; fp->port = strdup(*q? q : "80"); fp->path = strdup(fn); } fp->type = KNF_TYPE_HTTP; fp->ctrl_fd = fp->fd = -1; fp->seek_offset = 0; return fp; } int khttp_connect_file(knetFile *fp) { int ret, l = 0; char *buf, *p; if (fp->fd != -1) netclose(fp->fd); fp->fd = socket_connect(fp->host, fp->port); buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); l += sprintf(buf + l, "\r\n"); netwrite(fp->fd, buf, l); l = 0; while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency if (buf[l] == '\n' && l >= 3) if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; ++l; } buf[l] = 0; if (l < 14) { // prematured header netclose(fp->fd); fp->fd = -1; return -1; } ret = strtol(buf + 8, &p, 0); // HTTP return code if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file off_t rest = fp->offset; while (rest) { off_t l = rest < 0x10000? 
rest : 0x10000; rest -= my_netread(fp->fd, buf, l); } } else if (ret != 206 && ret != 200) { free(buf); fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); netclose(fp->fd); fp->fd = -1; return -1; } free(buf); fp->is_ready = 1; return 0; } /******************** * Generic routines * ********************/ knetFile *knet_open(const char *fn, const char *mode) { knetFile *fp = 0; if (mode[0] != 'r') { fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); return 0; } if (strstr(fn, "ftp://") == fn) { fp = kftp_parse_url(fn, mode); if (fp == 0) return 0; if (kftp_connect(fp) == -1) { knet_close(fp); return 0; } kftp_connect_file(fp); } else if (strstr(fn, "http://") == fn) { fp = khttp_parse_url(fn, mode); if (fp == 0) return 0; khttp_connect_file(fp); } else { // local file #ifdef _WIN32 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may * be undefined on some systems, although it is defined on my * Mac and the Linux I have tested on. */ int fd = open(fn, O_RDONLY | O_BINARY); #else int fd = open(fn, O_RDONLY); #endif if (fd == -1) { perror("open"); return 0; } fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; fp->ctrl_fd = -1; } if (fp && fp->fd == -1) { knet_close(fp); return 0; } return fp; } knetFile *knet_dopen(int fd, const char *mode) { knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; return fp; } ssize_t knet_read(knetFile *fp, void *buf, size_t len) { off_t l = 0; if (fp->fd == -1) return 0; if (fp->type == KNF_TYPE_FTP) { if (fp->is_ready == 0) { if (!fp->no_reconnect) kftp_reconnect(fp); kftp_connect_file(fp); } } else if (fp->type == KNF_TYPE_HTTP) { if (fp->is_ready == 0) khttp_connect_file(fp); } if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX size_t rest = len; ssize_t curr; while (rest) { do { curr = read(fp->fd, (void*)((char*)buf + l), rest); } while (curr < 0 && EINTR == 
errno); if (curr < 0) return -1; if (curr == 0) break; l += curr; rest -= curr; } } else l = my_netread(fp->fd, buf, len); fp->offset += l; return l; } off_t knet_seek(knetFile *fp, off_t off, int whence) { if (whence == SEEK_SET && off == fp->offset) return 0; if (fp->type == KNF_TYPE_LOCAL) { /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */ off_t offset = lseek(fp->fd, off, whence); if (offset == -1) return -1; fp->offset = offset; return fp->offset; } else if (fp->type == KNF_TYPE_FTP) { if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else if (whence == SEEK_END) fp->offset = fp->file_size + off; else return -1; fp->is_ready = 0; return fp->offset; } else if (fp->type == KNF_TYPE_HTTP) { if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); errno = ESPIPE; return -1; } if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else return -1; fp->is_ready = 0; return fp->offset; } errno = EINVAL; fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); return -1; } int knet_close(knetFile *fp) { if (fp == 0) return 0; if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific if (fp->fd != -1) { /* On Linux/Mac, netclose() is an alias of close(), but on * Windows, it is an alias of closesocket(). 
*/ if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); else netclose(fp->fd); } free(fp->host); free(fp->port); free(fp->response); free(fp->retr); // FTP specific free(fp->path); free(fp->http_host); // HTTP specific free(fp); return 0; } #ifdef KNETFILE_MAIN int main(void) { char *buf; knetFile *fp; int type = 4, l; #ifdef _WIN32 knet_win32_init(); #endif buf = calloc(0x100000, 1); if (type == 0) { fp = knet_open("knetfile.c", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 1) { // NCBI FTP, large file fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET); l = knet_read(fp, buf, 255); } else if (type == 2) { fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 3) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 4) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); knet_read(fp, buf, 10000); knet_seek(fp, 20000, SEEK_SET); knet_seek(fp, 10000, SEEK_SET); l = knet_read(fp, buf+10000, 10000000) + 10000; } if (type != 4 && type != 1) { knet_read(fp, buf, 255); buf[255] = 0; printf("%s\n", buf); } else write(fileno(stdout), buf, l); knet_close(fp); free(buf); return 0; } #endif vcftools-0.1.15/src/cpp/knetfile.h000066400000000000000000000031141307140004000167440ustar00rootroot00000000000000#ifndef KNETFILE_H #define KNETFILE_H #include #include #ifndef _WIN32 #define netread(fd, ptr, len) read(fd, ptr, len) #define netwrite(fd, ptr, len) write(fd, ptr, len) #define netclose(fd) close(fd) #else #include #define netread(fd, ptr, len) recv(fd, ptr, len, 0) #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) #define netclose(fd) closesocket(fd) #endif // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 typedef struct knetFile_s { int type, 
fd; int64_t offset; char *host, *port; // the following are for FTP only int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; char *response, *retr, *size_cmd; int64_t seek_offset; // for lazy seek int64_t file_size; // the following are for HTTP only char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) #define knet_fileno(fp) ((fp)->fd) #ifdef __cplusplus extern "C" { #endif #ifdef _WIN32 int knet_win32_init(); void knet_win32_destroy(); #endif knetFile *knet_open(const char *fn, const char *mode); /* This only works with local files. */ knetFile *knet_dopen(int fd, const char *mode); /* If ->is_ready==0, this routine updates ->fd; otherwise, it simply reads from ->fd. */ ssize_t knet_read(knetFile *fp, void *buf, size_t len); /* This routine only sets ->offset and ->is_ready=0. It does not communicate with the FTP server. */ off_t knet_seek(knetFile *fp, off_t off, int whence); int knet_close(knetFile *fp); #ifdef __cplusplus } #endif #endif vcftools-0.1.15/src/cpp/output_log.cpp000066400000000000000000000036301307140004000177020ustar00rootroot00000000000000/* * log.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 66 $) */ #include "output_log.h" output_log::output_log() { output_to_screen = true; output_to_file = true; } void output_log::open(bool stout, bool sterr, const string &filename_prefix) { if (stout) { output_to_screen = false; output_to_file = true; } if (sterr) { output_to_screen = true; output_to_file = false; } if (output_to_file) LOG.open((filename_prefix + ".log").c_str(), ios::out); } void output_log::close() { LOG.close(); } void output_log::printLOG(string s) { if (output_to_file) LOG << s; LOG.flush(); if (output_to_screen) cerr << s; cerr.flush(); } void output_log::error(string err_msg, int error_code) { printLOG("Error: " + err_msg + "\n"); exit(error_code); } void output_log::error(string err_msg, double value1, double value2, int error_code) { printLOG("Error: " + err_msg + 
"\n"); stringstream ss; ss << "Value1=" << value1 << " Value2=" << value2 << endl; printLOG(ss.str()); exit(error_code); } void output_log::warning(string err_msg) { printLOG(err_msg + "\n"); } void output_log::one_off_warning(string err_msg) { static set previous_warnings; if (previous_warnings.find(err_msg) == previous_warnings.end()) { printLOG(err_msg + "\n"); previous_warnings.insert(err_msg); } } string output_log::int2str(int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } string output_log::longint2str(long int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } string output_log::dbl2str(double n, int prc) { std::ostringstream s2; if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } string output_log::dbl2str_fixed(double n, int prc) { std::ostringstream s2; s2 << setiosflags( ios::fixed ); if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } vcftools-0.1.15/src/cpp/output_log.h000066400000000000000000000016501307140004000173470ustar00rootroot00000000000000/* * log.h * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 91 $) */ #ifndef LOG_H_ #define LOG_H_ #include #include #include #include #include #include #include #include #include using namespace std; class output_log { public: output_log(); ~output_log() {}; void open(bool stout, bool sterr, const string &filename); void close(); void printLOG(string s); void error(string err_msg, int error_code=1); void error(string err_msg, double value1, double value2, int error_code=1); void warning(string err_msg); void one_off_warning(string err_msg); static string int2str(int n); static string longint2str(long int n); static string dbl2str(double n, int prc); static string dbl2str_fixed(double n, int prc); private: bool output_to_screen; bool output_to_file; ofstream LOG; }; #endif /* LOG_H_ */ vcftools-0.1.15/src/cpp/parameters.cpp000066400000000000000000001234071307140004000176510ustar00rootroot00000000000000/* * 
parameters.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 249 $) */ // Class for reading in, checking and storing user parameters #include "parameters.h" parameters::parameters(int argc, char *argv[]) { if (isatty(STDERR_FILENO)) stream_err = false; else stream_err = true; string tmp; for (int i=0; iargv.push_back(tmp); } bcf_format = false; BED_exclude = false; BED_file = ""; chrom_map_file = ""; contigs_file = ""; derived = false; diff_discordance_matrix = false; diff_file = ""; diff_file_bcf = false; diff_file_compressed = false; diff_indv = false; diff_indv_discordance = false; diff_indv_map_file = ""; diff_site = false; diff_site_discordance = false; diff_switch_error = false; end_pos = numeric_limits::max(); exclude_positions_file = ""; exclude_positions_overlap_file = ""; fst_window_size = -1; fst_window_step = -1; hapcount_BED = ""; invert_mask = false; keep_only_indels = false; recode_all_INFO = false; ld_bp_window_size = numeric_limits::max(); ld_snp_window_size = numeric_limits::max(); ld_bp_window_min = -1; ld_snp_window_min = -1; min_mac = -1; min_maf = -1.0; mask_file = ""; max_alleles = numeric_limits::max(); max_genotype_depth = numeric_limits::max(); max_mac = numeric_limits::max(); max_maf = numeric_limits::max(); max_mean_depth = numeric_limits::max(); max_missing_call_count = numeric_limits::max(); max_non_ref_ac = numeric_limits::max(); max_non_ref_af = numeric_limits::max(); max_non_ref_ac_any = numeric_limits::max(); max_non_ref_af_any = numeric_limits::max(); max_N_indv = -1; mendel_ped_file = ""; min_alleles = -1; min_genotype_depth = -1; min_genotype_quality = -1.0; min_HWE_pvalue = -1.0; min_interSNP_distance = -1; min_kept_mask_value = 0; min_mean_depth = -1.0; min_quality = -1.0; min_r2 = -1.0; min_site_call_rate = 0; min_non_ref_ac = -1; min_non_ref_af = -1.0; min_non_ref_ac_any = -1; min_non_ref_af_any = -1.0; num_outputs = 0; output_012_matrix = false; output_as_IMPUTE = false; output_as_ldhat_phased = false; 
output_as_ldhat_unphased = false; output_as_ldhelmet = false; output_BEAGLE_genotype_likelihoods_GL = false; output_BEAGLE_genotype_likelihoods_PL = false; output_counts = false; output_filter_summary = false; output_freq = false; output_geno_depth = false; output_geno_chisq = false; output_geno_rsq = false; output_hap_rsq = false; output_het = false; output_HWE = false; output_indel_hist = false; output_indv_burden = false; output_indv_depth = false; output_indv_freq_burden = false; output_indv_freq_burden2 = false; output_indv_missingness = false; output_interchromosomal_hap_rsq = false; output_interchromosomal_geno_rsq = false; output_kept_sites = false; output_LROH = false; output_N_PCA_SNP_loadings = -1; output_PCA = false; output_prefix="out"; output_relatedness_Yang = false; output_relatedness_Manichaikul = false; output_removed_sites = false; output_singletons = false; output_site_depth = false; output_site_mean_depth = false; output_site_missingness = false; output_site_pi=false; output_site_quality = false; output_SNP_density_bin_size = 0; output_Tajima_D_bin_size = 0; output_TsTv_bin_size = 0; output_TsTv_by_count = false; output_TsTv_by_qual = false; output_TsTv_summary = false; phased_only = false; PCA_no_normalisation = false; pi_window_size = 0; pi_window_step = 0; plink_output = false; plink_tped_output = false; positions_file = ""; positions_overlap_file = ""; recode = false; recode_bcf = false; remove_all_filtered_genotypes = false; remove_all_filtered_sites = false; remove_indels = false; snps_to_exclude_file = ""; snps_to_keep_file = ""; start_pos = -1; stream_in = false; stream_out = false; suppress_allele_output = false; const char* raw = getenv("TMPDIR"); // Get environment variable temp_dir = raw?raw:""; // Handle case where TMPDIR is NULL. 
if(temp_dir.empty()) temp_dir = "/tmp/"; vcf_filename=""; vcf_format = false; vcf_compressed = false; } void parameters::read_parameters() { unsigned int i=1; string in_str; while (i=argv.size()) error("Requested Missing Argument",76); return argv[i]; } void parameters::print_params() { parameters defaults(0, 0); LOG.printLOG("Parameters as interpreted:\n"); string tmp_name = vcf_filename; if (tmp_name == "-") tmp_name = "[stdin]"; if (bcf_format == true) LOG.printLOG("\t--bcf " + tmp_name + "\n"); else if (vcf_format == true && vcf_compressed == false) LOG.printLOG("\t--vcf " + tmp_name + "\n"); else if (vcf_format == true && vcf_compressed == true) LOG.printLOG("\t--gzvcf " + tmp_name + "\n"); if (chrs_to_keep.size() > 0) { for (set::iterator it=chrs_to_keep.begin(); it != chrs_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--chr " + tmp + "\n"); } } if (chrs_to_exclude.size() > 0) { for (set::iterator it=chrs_to_exclude.begin(); it != chrs_to_exclude.end(); ++it) { string tmp = *it; LOG.printLOG("\t--not-chr " + tmp + "\n"); } } if (chrom_map_file != defaults.chrom_map_file) LOG.printLOG("\t--chrom-map " + chrom_map_file + "\n"); if (contigs_file != defaults.contigs_file) LOG.printLOG("\t--contigs " + contigs_file + "\n"); if (derived != defaults.derived) LOG.printLOG("\t--derived\n"); if (end_pos != defaults.end_pos) LOG.printLOG("\t--to-bp " + output_log::int2str(end_pos) + "\n"); if (exclude_positions_file != defaults.exclude_positions_file) LOG.printLOG("\t--exclude-positions " + exclude_positions_file + "\n"); if (exclude_positions_overlap_file != defaults.exclude_positions_overlap_file) LOG.printLOG("\t--exclude-positions-overlap " + exclude_positions_overlap_file + "\n"); if (FORMAT_id_to_extract != defaults.FORMAT_id_to_extract) LOG.printLOG("\t--extract-FORMAT-info " + FORMAT_id_to_extract + "\n"); if (geno_rsq_position_list != defaults.geno_rsq_position_list) LOG.printLOG("\t--geno-r2-positions " + geno_rsq_position_list + "\n"); if 
(hap_rsq_position_list != defaults.hap_rsq_position_list) LOG.printLOG("\t--hap-r2-positions " + hap_rsq_position_list + "\n"); if (fst_window_size != defaults.fst_window_size) LOG.printLOG("\t--fst-window-size " + output_log::int2str(fst_window_size) + "\n"); if (fst_window_step != defaults.fst_window_step) LOG.printLOG("\t--fst-window-step " + output_log::int2str(fst_window_step) + "\n"); if (weir_fst_populations.size() != 0) { for (unsigned int ui=0; ui 0) for (set::iterator it=site_filter_flags_to_exclude.begin(); it != site_filter_flags_to_exclude.end(); ++it) { string tmp = *it; LOG.printLOG("\t--remove-filtered " + tmp + "\n"); } if (site_filter_flags_to_keep.size() > 0) for (set::iterator it=site_filter_flags_to_keep.begin(); it != site_filter_flags_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--keep-filtered " + tmp + "\n"); } if (geno_filter_flags_to_exclude.size() > 0) for (set::iterator it=geno_filter_flags_to_exclude.begin(); it != geno_filter_flags_to_exclude.end(); ++it) { string tmp = *it; LOG.printLOG("\t--remove-filtered-geno " + tmp + "\n"); } if (INFO_to_extract.size() > 0) for (unsigned int ui=0; ui 0) for (set::iterator it=recode_INFO_to_keep.begin(); it != recode_INFO_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--recode-INFO " + tmp + "\n"); } if (site_INFO_flags_to_remove.size() > 0) for (set::iterator it=site_INFO_flags_to_remove.begin(); it != site_INFO_flags_to_remove.end(); ++it) { string tmp = *it; LOG.printLOG("\t--remove-INFO " + tmp + "\n"); } if (site_INFO_flags_to_keep.size() > 0) for (set::iterator it=site_INFO_flags_to_keep.begin(); it != site_INFO_flags_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--keep-INFO " + tmp + "\n"); } if (BED_file != defaults.BED_file) { if (BED_exclude == false) LOG.printLOG("\t--bed " + BED_file + "\n"); else LOG.printLOG("\t--exclude-bed " + BED_file + "\n"); } if (mask_file != defaults.mask_file) { if (invert_mask == false) LOG.printLOG("\t--mask " + mask_file + 
"\n"); else LOG.printLOG("\t--invert-mask " + mask_file + "\n"); } if (snps_to_keep.size() > 0) for (set::iterator it=snps_to_keep.begin(); it != snps_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--snp " + tmp + "\n"); } if (indv_to_keep.size() > 0) for (set::iterator it=indv_to_keep.begin(); it != indv_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--indv " + tmp + "\n"); } if (indv_to_exclude.size() > 0) for (set::iterator it=indv_to_exclude.begin(); it != indv_to_exclude.end(); ++it) { string tmp = *it; LOG.printLOG("\t--remove-indv " + tmp + "\n"); } LOG.printLOG("\n"); } void parameters::print_help() { unsigned int i; string in_str; if (argv.size() <= 1) { // If there are no user parameters, display help. argv.push_back("--?"); print_help(); } for(i = 0; i < argv.size(); i++) { in_str = argv[i]; if ((in_str == "--version")) { cout << "VCFtools (" << VCFTOOLS_VERSION << ")" << endl; exit(0); } if ((in_str == "-h") || (in_str == "-?") || (in_str == "-help") || (in_str == "--?") || (in_str == "--help") || (in_str == "--h")) { cout << endl << "VCFtools (" << VCFTOOLS_VERSION << ")" << endl; cout << "\u00A9 Adam Auton and Anthony Marcketta 2009" << endl << endl; cout << "Process Variant Call Format files" << endl; cout << endl; cout << "For a list of options, please go to:" << endl; cout << "\thttps://vcftools.github.io/man_latest.html" << endl; cout << endl; cout << "Alternatively, a man page is available, type:" << endl; cout << "\tman vcftools" << endl; cout << endl; cout << "Questions, comments, and suggestions should be emailed to:" << endl; cout << "\tvcftools-help@lists.sourceforge.net" << endl; cout << endl; exit(0); } } } void parameters::check_parameters() { parameters defaults(0, 0); if (vcf_filename == "-") stream_in = true; if (isatty(STDIN_FILENO) && stream_in) LOG.error("No input detected via stream."); if (!weir_fst_populations.empty()) num_outputs++; if (num_outputs > 1) error("Only one output function may be called.",0); if 
(vcf_filename == "" && !stream_in) error("Input file required.", 0); if (vcf_format == false && bcf_format == false) error("Must specify input file type",0); if (chrs_to_keep.size() > 0 && chrs_to_exclude.size() > 0) error("Cannot specify chromosomes to keep and to exclude", 1); if (end_pos < start_pos) error("End position must be greater than Start position.", 1); if (((end_pos != numeric_limits::max()) || (start_pos != -1)) && (chrs_to_keep.size() != 1)) error("Require a single chromosome when specifying a range.", 2); if (max_maf < min_maf) error("Maximum MAF must be not be less than Minimum MAF.", 4); if (max_mac < min_mac) error("Maximum MAC must be not be less than Minimum MAC.", 4); if (min_maf != defaults.min_maf) { if ((min_maf < 0.0) || (min_maf > 1.0)) error("MAF must be between 0 and 1.", 4); } if (max_maf != defaults.max_maf) { if ((max_maf < 0.0) || (max_maf > 1.0)) error("Maximum MAF must be between 0 and 1.", 4); } if (min_non_ref_af != defaults.min_non_ref_af) { if ((min_non_ref_af < 0.0) || (min_non_ref_af > 1.0)) error("Non-Ref Allele Frequency must be between 0 and 1.", 4); } if (min_non_ref_af_any != defaults.min_non_ref_af_any) { if ((min_non_ref_af_any < 0.0) || (min_non_ref_af_any > 1.0)) error("Non-Ref Allele Frequency must be between 0 and 1.", 4); } if (max_non_ref_af < min_non_ref_af) error("Maximum Non-Ref Allele Frequency must not be less that Minimum Non-Ref AF.", 4); if (max_non_ref_ac < min_non_ref_ac) error("Maximum Non-Ref Allele Count must not be less that Minimum Non-Ref AC.", 4); if (max_non_ref_af_any < min_non_ref_af_any) error("Maximum Non-Ref Allele Frequency must not be less that Minimum Non-Ref AF.", 4); if (max_non_ref_ac_any < min_non_ref_ac_any) error("Maximum Non-Ref Allele Count must not be less that Minimum Non-Ref AC.", 4); if (min_site_call_rate > 1) error("Minimum Call rate cannot be greater than 1.", 5); if (max_alleles < min_alleles) error("Max Number of Alleles must be greater than Min Number of Alleles.", 6); 
if (max_mean_depth < min_mean_depth) error("Max Mean Depth must be greater the Min Mean Depth.", 7); if (max_genotype_depth < min_genotype_depth) error("Max Genotype Depth must be greater than Min Genotype Depth.", 9); if (((output_as_ldhat_phased == true) || (output_as_ldhat_unphased) || (output_as_ldhelmet)) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting LDhat format.", 11); if ((output_BEAGLE_genotype_likelihoods_GL == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting Beagle likelihoods.", 11); if ((output_BEAGLE_genotype_likelihoods_PL == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting Beagle likelihoods.", 11); if (min_kept_mask_value > 9) error("Min Mask value must be between 0 and 9.", 14); if ((output_LROH == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting LROH.", 11); if (output_TsTv_bin_size < 0) error("TsTv bin size must be > 0",16); if (output_Tajima_D_bin_size < 0) error("Tajima D bin size must be > 0", 17); if (pi_window_size < 0) error("Pi Window size must be > 0", 18); if (output_SNP_density_bin_size < 0) error("SNP density bin size must be > 0", 18); if (stream_out) { if (output_012_matrix) error("Cannot output 012 matrix files to stream",19); if (plink_output || plink_tped_output) error("Cannot output Plink files to stream",19); if (output_as_ldhat_phased || output_as_ldhat_unphased) error("Cannot output LDhat files to stream",19); if (output_as_IMPUTE) error("Cannot output IMPUTE files to stream",19); } } void parameters::error(string err_msg, int code) { LOG.printLOG("\n\nError: " + err_msg + "\n\n"); exit(code); } vcftools-0.1.15/src/cpp/parameters.h000066400000000000000000000113111307140004000173040ustar00rootroot00000000000000/* * parameters.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 249 $) */ // Class for reading in, checking and storing user parameters #ifndef 
PARAMETERS_H_ #define PARAMETERS_H_ #if HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "output_log.h" extern output_log LOG; using namespace std; const string VCFTOOLS_VERSION=PACKAGE_VERSION; static const uint8_t bgzf_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; //just compare the first 16 chars? though static const uint8_t gzip_magic[2] = {0x1f,0x8b}; class parameters { public: bool stream_in; bool bcf_format; bool BED_exclude; string BED_file; set chrs_to_exclude; set chrs_to_keep; string chrom_map_file; string contigs_file; bool derived; bool diff_discordance_matrix; string diff_file; bool diff_file_bcf; bool diff_file_compressed; bool diff_indv; bool diff_indv_discordance; string diff_indv_map_file; bool diff_site; bool diff_site_discordance; bool diff_switch_error; int end_pos; string exclude_positions_file; string exclude_positions_overlap_file; string FORMAT_id_to_extract; set geno_filter_flags_to_exclude; string geno_rsq_position_list; string hap_rsq_position_list; string hapcount_BED; vector weir_fst_populations; int fst_window_size; int fst_window_step; vector indv_exclude_files; vector indv_keep_files; set indv_to_exclude; set indv_to_keep; vector INFO_to_extract; bool invert_mask; bool keep_only_indels; int ld_bp_window_size; int ld_snp_window_size; int ld_bp_window_min; int ld_snp_window_min; int min_mac; double min_maf; string mask_file; int max_alleles; int max_genotype_depth; int max_mac; double max_maf; double max_mean_depth; int max_missing_call_count; int max_non_ref_ac; double max_non_ref_af; int max_non_ref_ac_any; double max_non_ref_af_any; int max_N_indv; string mendel_ped_file; int min_alleles; int min_genotype_depth; double min_genotype_quality; double min_HWE_pvalue; int min_interSNP_distance; int min_kept_mask_value; double min_mean_depth; int min_non_ref_ac; double min_non_ref_af; int min_non_ref_ac_any; double 
min_non_ref_af_any; double min_quality; double min_r2; double min_site_call_rate; int num_outputs; bool output_012_matrix; bool output_as_IMPUTE; bool output_as_ldhat_phased; bool output_as_ldhat_unphased; bool output_as_ldhelmet; bool output_BEAGLE_genotype_likelihoods_GL; bool output_BEAGLE_genotype_likelihoods_PL; bool output_counts; bool output_filter_summary; bool output_freq; bool output_geno_depth; bool output_geno_chisq; bool output_geno_rsq; bool output_hap_rsq; bool output_het; bool output_HWE; bool output_indel_hist; bool output_indv_burden; bool output_indv_depth; bool output_indv_freq_burden; bool output_indv_freq_burden2; bool output_indv_missingness; bool output_interchromosomal_hap_rsq; bool output_interchromosomal_geno_rsq; bool output_kept_sites; bool output_LROH; int output_N_PCA_SNP_loadings; bool output_PCA; string output_prefix; bool output_relatedness_Yang; bool output_relatedness_Manichaikul; bool output_removed_sites; bool output_singletons; bool output_site_depth; bool output_site_mean_depth; bool output_site_missingness; bool output_site_pi; bool output_site_quality; int output_SNP_density_bin_size; int output_Tajima_D_bin_size; int output_TsTv_bin_size; bool output_TsTv_by_count; bool output_TsTv_by_qual; bool output_TsTv_summary; bool phased_only; bool PCA_no_normalisation; int pi_window_size; int pi_window_step; bool plink_output; bool plink_tped_output; string positions_file; string positions_overlap_file; bool recode; bool recode_bcf; set recode_INFO_to_keep; bool recode_all_INFO; bool remove_all_filtered_genotypes; bool remove_all_filtered_sites; bool remove_indels; set site_filter_flags_to_exclude; set site_filter_flags_to_keep; set site_INFO_flags_to_keep; set site_INFO_flags_to_remove; string snps_to_exclude_file; string snps_to_keep_file; set snps_to_keep; int start_pos; bool stream_err; bool stream_out; bool suppress_allele_output; string temp_dir; string vcf_filename; bool vcf_format; bool vcf_compressed; parameters(int argc, 
char *argv[]); ~parameters(){}; void read_parameters(); void print_help(); void print_params(); private: void check_parameters(); static void error(string err_msg, int code); vector argv; string get_arg(unsigned int i); }; #endif /* PARAMETERS_H_ */ vcftools-0.1.15/src/cpp/variant_file.cpp000066400000000000000000000057661307140004000201600ustar00rootroot00000000000000/* * variant_file.cpp * * Created on: Dec 11, 2012 * Author: amarcketta */ #include "variant_file.h" variant_file::~variant_file() {} // Return the number of individuals that have not been filtered out int variant_file::N_kept_individuals() const { int N_kept = 0; for (unsigned int ui=0; ui &contig_vector) { if (contigs_file == "") LOG.error("Contig declarations in header are necessary for BCF conversion. Use --contigs to add contigs to the header."); ifstream contigs(contigs_file.c_str()); if (!contigs.is_open()) LOG.error("Could not open contigs file: " + contigs_file); string line; int contig_lines = 0; contig_vector.resize(0); while (getline(contigs, line)) { if (line.find("##contig=")==string::npos) LOG.error("Contigs file must contain only contig header lines."); contig_vector.push_back(line); contig_lines++; } contigs.close(); LOG.printLOG("Including "+header::int2str(contig_lines)+" header lines from the contig file.\n"); } void variant_file::read_temp_site(ifstream &tmp_file, string &CHROM, int &POS, vector< pair > >s) { stringstream chr; char tmp_char; while(true) { tmp_file.read(&tmp_char,sizeof(char)); if (tmp_char == '\n') break; chr << tmp_char; } CHROM = chr.str(); tmp_file.read((char*)&POS,sizeof(POS)); char in_byte, tmp_gt; for(unsigned int ui=0; ui> 4; tmp_gt = in_byte & 0x03; if (tmp_gt == 0x02) GTs[ui].first = -1; else GTs[ui].first = (int)tmp_gt; } } void variant_file::read_big_temp_site(ifstream &tmp_file, string &CHROM, int &POS, int &alleles, vector< pair > >s) { stringstream chr; char tmp_char; while(true) { tmp_file.read(&tmp_char,sizeof(char)); if (tmp_char == '\n') break; 
chr << tmp_char; } CHROM = chr.str(); tmp_file.read((char*)&POS,sizeof(POS)); int8_t tmp_alleles; tmp_file.read((char*)&tmp_alleles,sizeof(tmp_alleles)); alleles = (int)tmp_alleles; char in_byte = 0xFF; for(unsigned int ui=0; ui #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "parameters.h" #include "entry.h" #include "gamma.h" #include "vcf_entry.h" #include "bcf_entry.h" #include "header.h" #if HAVE_LIBLAPACK # include "dgeev.h" #endif extern output_log LOG; using namespace std; class variant_file { public: string filename; bool compressed; istream *file_in; ifstream file_tmp; unsigned int gzMAX_LINE_LEN; gzFile gzfile_in; header meta_data; vector include_indv; unsigned int N_entries; unsigned int N_kept_entries; int N_kept_individuals() const; int N_kept_sites() const; int N_total_sites() const; virtual void open() = 0; virtual void open_gz() = 0; virtual void close() = 0; virtual bool eof() = 0; virtual void get_entry(vector &out) = 0; virtual entry* get_entry_object() = 0; void ByteSwap(unsigned char *b, int n) const; static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); }; void apply_filters(const parameters ¶ms); void filter_individuals(const set &indv_to_keep, const set &indv_to_exclude, const vector &indv_to_keep_filename, const vector &indv_to_exclude_filename, bool keep_then_exclude=true); void filter_individuals_by_keep_list(const set &indv_to_keep, const vector &indv_to_keep_filenames); void filter_individuals_by_exclude_list(const set &indv_to_exclude, const vector &indv_to_exclude_filenames); void filter_individuals_randomly(int max_N_indv); void output_frequency(const parameters ¶ms, bool output_counts=false); void output_individuals_by_mean_depth(const parameters ¶ms); void output_site_depth(const parameters ¶ms, bool output_mean=true); void output_genotype_depth(const 
parameters ¶ms); void output_het(const parameters ¶ms); void output_hwe(const parameters ¶ms); void output_SNP_density(const parameters ¶ms); void output_indv_missingness(const parameters ¶ms); void output_indv_burden(const parameters ¶ms); void output_indv_freq_burden(const parameters ¶ms, int double_count_hom_alt=0); void output_site_missingness(const parameters ¶ms); void output_haplotype_r2(const parameters ¶ms); void output_genotype_r2(const parameters ¶ms); void output_genotype_chisq(const parameters ¶ms, double min_pval); void output_interchromosomal_genotype_r2(const parameters ¶ms); void output_interchromosomal_haplotype_r2(const parameters & params); void output_haplotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms); void output_haplotype_count(const parameters ¶ms); void output_genotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms); void output_singletons(const parameters ¶ms); void output_TsTv(const parameters ¶ms); void output_TsTv_by_count(const parameters ¶ms); void output_TsTv_by_quality(const parameters ¶ms); void output_TsTv_summary(const parameters ¶ms); void output_per_site_nucleotide_diversity(const parameters ¶ms); void output_windowed_nucleotide_diversity(const parameters ¶ms); void output_Tajima_D(const parameters ¶ms); void output_site_quality(const parameters ¶ms); void output_FILTER_summary(const parameters ¶ms); void output_kept_sites(const parameters ¶ms); void output_removed_sites(const parameters ¶ms); void output_LROH(const parameters ¶ms); void output_indv_relatedness_Yang(const parameters ¶ms); void output_indv_relatedness_Manichaikul(const parameters ¶ms); void output_PCA(const parameters ¶ms); void output_PCA_SNP_loadings(const parameters ¶ms); void output_indel_hist(const parameters ¶ms); void output_as_012_matrix(const parameters ¶ms); void output_as_plink(const parameters ¶ms); void output_as_plink_tped(const parameters ¶ms); void output_BEAGLE_genotype_likelihoods(const parameters ¶ms, int GL_or_PL=0); void 
output_as_IMPUTE(const parameters ¶ms); void output_as_LDhat_phased(const parameters ¶ms); void output_as_LDhat_unphased(const parameters ¶ms); void output_as_LDhelmet(const parameters ¶ms); void output_FORMAT_information(const parameters ¶ms); void output_weir_and_cockerham_fst(const parameters ¶ms); void output_windowed_weir_and_cockerham_fst(const parameters ¶ms); void output_sites_in_files(const parameters ¶ms, variant_file &diff_vcf_file); void output_indv_in_files(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_by_site(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_matrix(const parameters ¶ms, variant_file &diff_vcf_file); void output_discordance_by_indv(const parameters ¶ms, variant_file &diff_vcf_file); void output_switch_error(const parameters ¶ms, variant_file &diff_vcf_file); void output_INFO_for_each_site(const parameters ¶ms); void output_mendel_inconsistencies(const parameters ¶ms); void write_stats(const parameters ¶ms); virtual void print(const parameters ¶ms) = 0; virtual void print_bcf(const parameters ¶ms) = 0; void calc_hap_r2(vector > >1, vector > >2, double &r2, double &D, double &Dprime, int &chr_count); void calc_geno_r2(vector > >1, vector > >2, double &r2, int &indv_count); void calc_r2_em(entry *e, entry *e2, double &r2, int &indv_count); void calc_geno_chisq(vector > >1, vector > >2, int &N0, int &N1, double &chisq, double &dof, double &pval, int &indv_count); void read_temp_site(ifstream &tmp_file, string &CHROM, int &POS, vector< pair > >s); void read_big_temp_site(ifstream &tmp_file, string &CHROM, int &POS, int &alleles, vector< pair > >s); void return_indv_union(variant_file &file2, map > &combined_individuals, const string &indv_ID_map_file=""); void get_contigs(const std::string &contigs_file, vector &contig_vector); virtual ~variant_file(); }; #endif /* VARIANT_FILE_H_ */ 
vcftools-0.1.15/src/cpp/variant_file_diff.cpp000066400000000000000000001264131307140004000211410ustar00rootroot00000000000000/* * variant_file_diff.cpp * * Created on: Oct 30, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "variant_file.h" void variant_file::return_indv_union(variant_file &file2, map > &combined_individuals, const string &indv_ID_map_file) { map indv_map; bool use_map = false; if (indv_ID_map_file != "") { LOG.printLOG("Reading individual mapping file. "); ifstream map(indv_ID_map_file.c_str()); if (!map.is_open()) LOG.error("Could not open map file: " + indv_ID_map_file); while (!map.eof()) { string indv1, indv2; map >> indv1 >> indv2; map.ignore(numeric_limits::max(), '\n'); if ((indv1 != "") && (indv1.substr(0,1) != "#")) { indv_map[indv1] = indv2; } } map.close(); use_map = true; LOG.printLOG("Read " + LOG.int2str(indv_map.size()) + " entries.\n"); } for (unsigned int ui=0; ui((int)ui, -1); } for (unsigned int ui=0; ui(-1, (int)ui); } } void variant_file::output_sites_in_files(const parameters ¶ms, variant_file &diff_variant_file) { string CHROM; vector variant_line; entry *e1 = get_entry_object(); entry *e2 = diff_variant_file.get_entry_object(); bool new_e1 = true; bool new_e2 = true; string CHROM1 = ""; string CHROM2 = ""; string curr_CHROM = ""; vector all_CHROM; int POS1 = -1; int POS2 = -1; string REF1 = ""; string REF2 = ""; string ALT1 = ""; string ALT2 = ""; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0, N_overlap_SNPs = 0; string output_file = params.output_prefix + ".diff.sites_in_files"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream sites_in_files(buf); sites_in_files << "CHROM\tPOS1\tPOS2\tIN_FILE\tREF1\tREF2\tALT1\tALT2" << endl; LOG.printLOG("Comparing sites in VCF files...\n"); 
while(true) { if(new_e1) { while(!eof()) { get_entry(variant_line); e1->reset(variant_line); N_entries += e1->apply_filters(params); if(!e1->passed_filters) continue; N_kept_entries++; e1->parse_basic_entry(true); CHROM1 = e1->get_CHROM(); POS1 = e1->get_POS(); REF1 = e1->get_REF(); ALT1 = e1->get_ALT(); break; } new_e1 = false; } if(new_e2) { while(!diff_variant_file.eof()) { diff_variant_file.get_entry(variant_line); e2->reset(variant_line); diff_variant_file.N_entries += e2->apply_filters(params); if(!e2->passed_filters) continue; diff_variant_file.N_kept_entries++; e2->parse_basic_entry(true); CHROM2 = e2->get_CHROM(); POS2 = e2->get_POS(); REF2 = e2->get_REF(); ALT2 = e2->get_ALT(); break; } new_e2 = false; } if(eof() && diff_variant_file.eof()) break; else if(diff_variant_file.eof()) { if(CHROM1 == curr_CHROM) { sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl; N_SNPs_file1_only++; new_e1 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); else { curr_CHROM = CHROM1; all_CHROM.push_back(CHROM1); sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." 
<< endl; N_SNPs_file1_only++; new_e1 = true; } } } else if(eof()) { if(CHROM2 == curr_CHROM) { sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl; N_SNPs_file2_only++; new_e2 = true; } else { if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); else { curr_CHROM = CHROM2; all_CHROM.push_back(CHROM2); sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl; N_SNPs_file2_only++; new_e2 = true; } } } else if(CHROM1 == CHROM2) { if (CHROM1 != curr_CHROM) { curr_CHROM = CHROM1; all_CHROM.push_back(curr_CHROM); } if(POS1 == POS2) { if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") ) REF1 = REF2; if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") ) REF2 = REF1; new_e1 = true; new_e2 = true; if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != "")) { sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl; N_overlap_SNPs++; } else { sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tB\t" << REF1 << "\t" << REF2 << "\t" << ALT1 << "\t" << ALT2 << endl; N_common_SNPs++; } } else if(POS1 < POS2) { if (POS2 < (POS1+REF1.size())) { sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl; N_overlap_SNPs++; new_e1 = true; new_e2 = true; } else { sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." 
<< endl; N_SNPs_file1_only++; new_e1 = true; } } else { if (POS1 < (POS2+REF2.size())) { sites_in_files << CHROM1 << "\t" << POS1 << "\t" << POS2 << "\tO\t" << REF1 << "\t" << REF2 <<"\t" << ALT1 << "\t" << ALT2 << endl; N_overlap_SNPs++; new_e1 = true; new_e2 = true; } else { sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl; N_SNPs_file2_only++; new_e2 = true; } } } else { if (CHROM1 == curr_CHROM) { sites_in_files << CHROM1 << "\t" << POS1 << "\t.\t1\t" << REF1 << "\t.\t" << ALT1 << "\t." << endl; N_SNPs_file1_only++; new_e1 = true; } else if (CHROM2 == curr_CHROM) { sites_in_files << CHROM2 << "\t.\t" << POS2 << "\t2\t.\t" << REF2 << "\t.\t" << ALT2 << endl; N_SNPs_file2_only++; new_e2 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); } } } LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n"); LOG.printLOG("Found " + output_log::int2str(N_overlap_SNPs) + " non-matching overlapping sites.\n"); delete e1; delete e2; } void variant_file::output_indv_in_files(const parameters ¶ms, variant_file &diff_variant_file) { LOG.printLOG("Comparing individuals in VCF files...\n"); string 
output_file = params.output_prefix + ".diff.indv_in_files"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "INDV\tFILES" << endl; // Build a list of individuals contained in each file map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file); unsigned int N_combined_indv = combined_individuals.size(); unsigned int N[3]={0,0,0}; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1)) { N[0]++; out << combined_individuals_it->first << "\tB" << endl; } else if (combined_individuals_it->second.first != -1) { N[1]++; out << combined_individuals_it->first << "\t1" << endl; } else if (combined_individuals_it->second.second != -1) { N[2]++; out << combined_individuals_it->first << "\t2" << endl; } else LOG.error("Unhandled case"); } LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n"); LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n"); LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n"); LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n"); } void variant_file::output_discordance_by_indv(const parameters ¶ms, variant_file &diff_variant_file) { map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file); LOG.printLOG("Outputting Discordance By Individual...\n"); map > indv_sums; vector variant_line; int indv1, indv2; entry * e1 = 
get_entry_object(); entry * e2 = diff_variant_file.get_entry_object(); string CHROM; bool new_e1 = true; bool new_e2 = true; string CHROM1 = ""; string CHROM2 = ""; string curr_CHROM = ""; vector all_CHROM; int POS1 = -1; int POS2 = -1; string REF1 = ""; string REF2 = ""; string ALT1 = ""; string ALT2 = ""; bool alleles_match = false; pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; while(true) { if(new_e1) { while(!eof()) { get_entry(variant_line); e1->reset(variant_line); N_entries += e1->apply_filters(params); if(!e1->passed_filters) continue; N_kept_entries++; e1->parse_basic_entry(true); CHROM1 = e1->get_CHROM(); POS1 = e1->get_POS(); REF1 = e1->get_REF(); ALT1 = e1->get_ALT(); break; } new_e1 = false; } if(new_e2) { while(!diff_variant_file.eof()) { diff_variant_file.get_entry(variant_line); e2->reset(variant_line); diff_variant_file.N_entries += e2->apply_filters(params); if(!e2->passed_filters) continue; diff_variant_file.N_kept_entries++; e2->parse_basic_entry(true); CHROM2 = e2->get_CHROM(); POS2 = e2->get_POS(); REF2 = e2->get_REF(); ALT2 = e2->get_ALT(); break; } new_e2 = false; } if(eof() && diff_variant_file.eof()) break; else if(diff_variant_file.eof()) { if(CHROM1 == curr_CHROM) { N_SNPs_file1_only++; new_e1 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); else { curr_CHROM = CHROM1; all_CHROM.push_back(CHROM1); N_SNPs_file1_only++; new_e1 = true; } } } else if(eof()) { if(CHROM2 == curr_CHROM) { N_SNPs_file2_only++; new_e2 = true; } else { if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); else { curr_CHROM = CHROM2; 
all_CHROM.push_back(CHROM2); N_SNPs_file2_only++; new_e2 = true; } } } else if(CHROM1 == CHROM2) { if (CHROM1 != curr_CHROM) { curr_CHROM = CHROM1; all_CHROM.push_back(curr_CHROM); } if(POS1 == POS2) { new_e1 = true; new_e2 = true; N_common_SNPs++; } else if(POS1 < POS2) { new_e1 = true; N_SNPs_file1_only++; } else { new_e2 = true; N_SNPs_file2_only++; } } else { if (CHROM1 == curr_CHROM) { new_e1 = true; N_SNPs_file1_only++; } else if (CHROM2 == curr_CHROM) { new_e2 = true; N_SNPs_file2_only++; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); } } if(new_e1 && new_e2) { if (REF1 == "N") REF1 = REF2; if (REF2 == "N") REF2 = REF1; if ((REF1.size() != REF2.size()) || ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N"))) { LOG.one_off_warning("Non-matching REF. 
Skipping all such sites."); continue; } alleles_match = (ALT1 == ALT2) && (REF1 == REF2); e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { indv_sums[combined_individuals_it->first].first++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing // Don't do anything. } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything. } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. // Don't do anything. 
} else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data indv_sums[combined_individuals_it->first].first++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing // Don't do anything } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. // Don't do anything } else LOG.error("Unknown condition"); } } } } string output_file = params.output_prefix + ".diff.indv"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; int N, N_discord; double discordance; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { out << combined_individuals_it->first; N = indv_sums[combined_individuals_it->first].first; N_discord = indv_sums[combined_individuals_it->first].second; discordance = N_discord / double(N); out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl; } LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n"); LOG.printLOG("Found " + 
output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n"); delete e1; delete e2; } void variant_file::output_discordance_by_site(const parameters ¶ms, variant_file &diff_variant_file) { map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file); LOG.printLOG("Outputting Discordance By Site...\n"); string CHROM; vector variant_line; int indv1, indv2; entry *e1 = get_entry_object(); entry *e2 = diff_variant_file.get_entry_object(); bool new_e1 = true; bool new_e2 = true; string CHROM1 = ""; string CHROM2 = ""; string curr_CHROM = ""; vector all_CHROM; int POS1 = -1; int POS2 = -1; string REF1 = ""; string REF2 = ""; string ALT1 = ""; string ALT2 = ""; bool alleles_match = false; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; string output_file = params.output_prefix + ".diff.sites"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream diffsites(buf); diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; while(true) { if(new_e1) { while(!eof()) { get_entry(variant_line); e1->reset(variant_line); N_entries += e1->apply_filters(params); if(!e1->passed_filters) continue; N_kept_entries++; e1->parse_basic_entry(true); CHROM1 = e1->get_CHROM(); POS1 = e1->get_POS(); REF1 = e1->get_REF(); ALT1 = e1->get_ALT(); break; } new_e1 = false; } if(new_e2) { while(!diff_variant_file.eof()) { diff_variant_file.get_entry(variant_line); e2->reset(variant_line); diff_variant_file.N_entries += e2->apply_filters(params); if(!e2->passed_filters) continue; diff_variant_file.N_kept_entries++; 
e2->parse_basic_entry(true); CHROM2 = e2->get_CHROM(); POS2 = e2->get_POS(); REF2 = e2->get_REF(); ALT2 = e2->get_ALT(); break; } new_e2 = false; } if(eof() && diff_variant_file.eof()) break; else if(diff_variant_file.eof()) { if(CHROM1 == curr_CHROM) { diffsites << CHROM1 << "\t" << POS1 << "\t1\t"; N_SNPs_file1_only++; new_e1 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); else { curr_CHROM = CHROM1; all_CHROM.push_back(CHROM1); diffsites << CHROM1 << "\t" << POS1 << "\t1\t"; N_SNPs_file1_only++; new_e1 = true; } } } else if(eof()) { if(CHROM2 == curr_CHROM) { diffsites << CHROM2 << "\t" << POS2 << "\t2\t"; N_SNPs_file2_only++; new_e2 = true; } else { if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); else { curr_CHROM = CHROM2; all_CHROM.push_back(CHROM2); diffsites << CHROM2 << "\t" << POS2 << "\t2\t"; N_SNPs_file2_only++; new_e2 = true; } } } else if(CHROM1 == CHROM2) { if (CHROM1 != curr_CHROM) { curr_CHROM = CHROM1; all_CHROM.push_back(curr_CHROM); } if(POS1 == POS2) { if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") ) REF1 = REF2; if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") ) REF2 = REF1; new_e1 = true; new_e2 = true; if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != "")) { LOG.one_off_warning("Non-matching REF. 
Skipping all such sites."); continue; } diffsites << CHROM1 << "\t" << POS1 << "\tB\t"; N_common_SNPs++; } else if(POS1 < POS2) { diffsites << CHROM1 << "\t" << POS1 << "\t1\t"; N_SNPs_file1_only++; new_e1 = true; } else { diffsites << CHROM2 << "\t" << POS2 << "\t2\t"; N_SNPs_file2_only++; new_e2 = true; } } else { if (CHROM1 == curr_CHROM) { diffsites << CHROM1 << "\t" << POS1 << "\t1\t"; N_SNPs_file1_only++; new_e1 = true; } else if (CHROM2 == curr_CHROM) { diffsites << CHROM2 << "\t" << POS2 << "\t2\t"; N_SNPs_file2_only++; new_e2 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); } } pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int N_missing_1=0, N_missing_2=0; unsigned int N_discord=0; unsigned int N_concord_non_missing=0; if(new_e1 && new_e2) { alleles_match = (ALT1 == ALT2) && (REF1 == REF2); diffsites << alleles_match; e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) 
continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { N_common_called++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing N_missing_1++; N_missing_2++; } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. N_missing_1++; } else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing N_missing_1++; N_missing_2++; } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. 
N_missing_1++; } else LOG.error("Unknown condition"); } } } else diffsites << "0"; double discordance = N_discord / double(N_common_called); diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance; diffsites << endl; } LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n"); delete e1; delete e2; } void variant_file::output_discordance_matrix(const parameters ¶ms, variant_file &diff_variant_file) { map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file); LOG.printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n"); string CHROM; vector variant_line; int indv1, indv2; entry *e1 = get_entry_object(); entry *e2 = diff_variant_file.get_entry_object(); bool new_e1 = true; bool new_e2 = true; string CHROM1 = ""; string CHROM2 = ""; string curr_CHROM = ""; vector all_CHROM; int POS1 = -1; int POS2 = -1; string REF1 = ""; string REF2 = ""; string ALT1 = ""; string ALT2 = ""; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; vector > discordance_matrix(4, vector(4, 0)); if (combined_individuals.size() <= 0) LOG.error("No overlapping individuals can be found."); while(true) { if(new_e1) { while(!eof()) { get_entry(variant_line); e1->reset(variant_line); N_entries += e1->apply_filters(params); if(!e1->passed_filters) continue; N_kept_entries++; e1->parse_basic_entry(true); CHROM1 = e1->get_CHROM(); POS1 = e1->get_POS(); REF1 = e1->get_REF(); ALT1 = e1->get_ALT(); break; } new_e1 = false; } if(new_e2) { while(!diff_variant_file.eof()) { diff_variant_file.get_entry(variant_line); e2->reset(variant_line); diff_variant_file.N_entries += 
e2->apply_filters(params); if(!e2->passed_filters) continue; diff_variant_file.N_kept_entries++; e2->parse_basic_entry(true); CHROM2 = e2->get_CHROM(); POS2 = e2->get_POS(); REF2 = e2->get_REF(); ALT2 = e2->get_ALT(); break; } new_e2 = false; } if(eof() && diff_variant_file.eof()) break; else if(diff_variant_file.eof()) { if(CHROM1 == curr_CHROM) { N_SNPs_file1_only++; new_e1 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); else { curr_CHROM = CHROM1; all_CHROM.push_back(CHROM1); N_SNPs_file1_only++; new_e1 = true; } } } else if(eof()) { if(CHROM2 == curr_CHROM) { N_SNPs_file2_only++; new_e2 = true; } else { if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); else { curr_CHROM = CHROM2; all_CHROM.push_back(CHROM2); N_SNPs_file2_only++; new_e2 = true; } } } else if(CHROM1 == CHROM2) { if (CHROM1 != curr_CHROM) { curr_CHROM = CHROM1; all_CHROM.push_back(curr_CHROM); } if(POS1 == POS2) { if ((REF1 == "N") || (REF1 == ".") || (REF1 == "") ) REF1 = REF2; if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") ) REF2 = REF1; new_e1 = true; new_e2 = true; if ((REF1 != REF2) && (REF2 != "N") && (REF1 != "N") && (REF1 != ".") && (REF2 != ".") && (REF1 != "") && (REF2 != "")) { LOG.one_off_warning("Non-matching REF. 
Skipping all such sites."); continue; } N_common_SNPs++; } else if(POS1 < POS2) { N_SNPs_file1_only++; new_e1 = true; } else { N_SNPs_file2_only++; new_e2 = true; } } else { if (CHROM1 == curr_CHROM) { N_SNPs_file1_only++; new_e1 = true; } else if (CHROM2 == curr_CHROM) { N_SNPs_file2_only++; new_e2 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); } } if(new_e1 && new_e2) { if (e1->get_N_alleles() != 2 || e2->get_N_alleles() != 2) continue; if (ALT1 != ALT2) { LOG.one_off_warning("Non-matching ALT. Skipping all such sites."); continue; } e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair geno_ids1, geno_ids2; int N1, N2; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) { LOG.one_off_warning("Non-matching individual found. 
Skipping all such combinations."); continue; // Individual not found in one of the files } // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) || ((geno_ids2.first != -1) && (geno_ids2.second == -1))) { // Haploid LOG.one_off_warning("***Warning: Haploid chromosomes not counted!***"); continue; } N1 = geno_ids1.first + geno_ids1.second; N2 = geno_ids2.first + geno_ids2.second; if ((N1 == -1) || (N1 < -2) || (N1 > 2)) LOG.error("Unhandled case"); if ((N2 == -1) || (N2 < -2) || (N2 > 2)) LOG.error("Unhandled case"); if (N1 == -2) N1 = 3; if (N2 == -2) N2 = 3; discordance_matrix[N1][N2]++; } } } string output_file = params.output_prefix + ".diff.discordance_matrix"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl; out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl; out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl; out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl; out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl; LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) 
+ " sites only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n"); delete e1; delete e2; } void variant_file::output_switch_error(const parameters ¶ms, variant_file &diff_variant_file) { map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, params.diff_indv_map_file); LOG.printLOG("Outputting Phase Switch Errors...\n"); vector variant_line; int indv1, indv2; entry *e1 = get_entry_object(); entry *e2 = diff_variant_file.get_entry_object(); bool new_e1 = true; bool new_e2 = true; string CHROM1 = ""; string CHROM2 = ""; string curr_CHROM = ""; vector all_CHROM; int POS1 = -1; int POS2 = -1; string REF1 = ""; string REF2 = ""; string ALT1 = ""; string ALT2 = ""; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; string output_file = params.output_prefix + ".diff.switch"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream switcherror(buf); switcherror << "CHROM\tPOS_START\tPOS_END\tINDV" << endl; unsigned int N_combined_indv = combined_individuals.size(); vector N_phased_het_sites(N_combined_indv, 0); vector N_switch_errors(N_combined_indv, 0); pair missing_genotype(".","."); pair missing_loc(".",-1); vector > prev_geno_file1(N_combined_indv, missing_genotype); vector > prev_geno_file2(N_combined_indv, missing_genotype); vector > prev_pos_file1(N_combined_indv, missing_loc); vector > prev_pos_file2(N_combined_indv, missing_loc); pair file1_hap1, file1_hap2, file2_hap1; if (N_combined_indv <= 0) LOG.error("No overlapping individuals can be found."); while(true) { if(new_e1) { while(!eof()) { get_entry(variant_line); e1->reset(variant_line); N_entries += e1->apply_filters(params); if(!e1->passed_filters) 
continue; N_kept_entries++; e1->parse_basic_entry(true); CHROM1 = e1->get_CHROM(); POS1 = e1->get_POS(); REF1 = e1->get_REF(); ALT1 = e1->get_ALT(); break; } new_e1 = false; } if(new_e2) { while(!diff_variant_file.eof()) { diff_variant_file.get_entry(variant_line); e2->reset(variant_line); diff_variant_file.N_entries += e2->apply_filters(params); if(!e2->passed_filters) continue; diff_variant_file.N_kept_entries++; e2->parse_basic_entry(true); CHROM2 = e2->get_CHROM(); POS2 = e2->get_POS(); REF2 = e2->get_REF(); ALT2 = e2->get_ALT(); break; } new_e2 = false; } if(eof() && diff_variant_file.eof()) break; else if(diff_variant_file.eof()) { if(CHROM1 == curr_CHROM) { N_SNPs_file1_only++; new_e1 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 appears to be out of order."); else { curr_CHROM = CHROM1; all_CHROM.push_back(CHROM1); N_SNPs_file1_only++; new_e1 = true; } } } else if(eof()) { if(CHROM2 == curr_CHROM) { N_SNPs_file2_only++; new_e2 = true; } else { if (find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); else { curr_CHROM = CHROM2; all_CHROM.push_back(CHROM2); N_SNPs_file2_only++; new_e2 = true; } } } else if(CHROM1 == CHROM2) { if (CHROM1 != curr_CHROM) { curr_CHROM = CHROM1; all_CHROM.push_back(curr_CHROM); } if(POS1 == POS2) { N_common_SNPs++; new_e1 = true; new_e2 = true; } else if(POS1 < POS2) { N_SNPs_file1_only++; new_e1 = true; } else { N_SNPs_file2_only++; new_e2 = true; } } else { if (CHROM1 == curr_CHROM) { N_SNPs_file1_only++; new_e1 = true; } else if (CHROM2 == curr_CHROM) { N_SNPs_file2_only++; new_e2 = true; } else { if(find(all_CHROM.begin(), all_CHROM.end(), CHROM1) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM1+" in file 1 
appears to be out of order."); if(find(all_CHROM.begin(), all_CHROM.end(), CHROM2) != all_CHROM.end()) LOG.error("Both files must be sorted in the same chromosomal order.\n"+CHROM2+" in file 2 appears to be out of order."); LOG.error("Cannot determine chromosomal ordering of files, both files must contain the same chromosomes to use the diff functions.\nFound "+CHROM1+" in file 1 and "+CHROM2+" in file 2.\nUse option --not-chr to filter out chromosomes only found in one file."); } } if(new_e1 && new_e2) { e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair genotype1, genotype2; pair missing_genotype(".","."); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int indv_count=0; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it, indv_count++) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) { LOG.one_off_warning("Non-matching individual found. 
Skipping all such combinations."); continue; // Individual not found in one of the files } e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Have a matching genotypes in files 1 and 2 if (genotype1.first != genotype1.second) { // It's a heterozgote char phase1, phase2; phase1 = e1->get_indv_PHASE(indv1); phase2 = e2->get_indv_PHASE(indv2); if ((phase1 == '|') && (phase2 == '|')) { // Calculate Phasing error (switch error) N_phased_het_sites[indv_count]++; file1_hap1 = make_pair((string)prev_geno_file1[indv_count].first, (string)genotype1.first); file1_hap2 = make_pair((string)prev_geno_file1[indv_count].second, (string)genotype1.second); file2_hap1 = make_pair((string)prev_geno_file2[indv_count].first, (string)genotype2.first); if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2)) { // Must be a switch error string indv_id; N_switch_errors[indv_count]++; if (indv1 != -1) indv_id = meta_data.indv[indv1]; else indv_id = diff_variant_file.meta_data.indv[indv2]; if (prev_pos_file1[indv_count].first == prev_pos_file2[indv_count].first) { if (prev_pos_file1[indv_count].second <= prev_pos_file2[indv_count].second) switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file1[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl; else switcherror << prev_pos_file1[indv_count].first << "\t" << prev_pos_file2[indv_count].second << "\t" << POS1 << "\t" << indv_id << endl; } } prev_geno_file1[indv_count] = genotype1; prev_geno_file2[indv_count] = genotype2; prev_pos_file1[indv_count] = std::pair(CHROM1,POS1); prev_pos_file2[indv_count] = std::pair(CHROM2,POS2); } } } } } } } delete e1; delete e2; output_file = 
params.output_prefix + ".diff.indv.switch"; ofstream idiscord(output_file.c_str()); if (!idiscord.is_open()) LOG.error("Could not open Individual Discordance File: " + output_file, 3); idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl; unsigned int indv_count=0; double switch_error; string indv_id; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if (indv1 != -1) indv_id = meta_data.indv[indv1]; else indv_id = diff_variant_file.meta_data.indv[indv2]; if (N_phased_het_sites[indv_count] > 0) switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count]; else switch_error = 0; idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << switch_error << endl; indv_count++; } idiscord.close(); LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " sites common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " sites only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " sites only in second file.\n"); } vcftools-0.1.15/src/cpp/variant_file_filters.cpp000066400000000000000000000102241307140004000216710ustar00rootroot00000000000000/* * variant_file_filters.cpp * * Author: amarcketta */ #include "variant_file.h" void variant_file::apply_filters(const parameters ¶ms) { filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_files, params.indv_exclude_files); filter_individuals_randomly(params.max_N_indv); } void variant_file::filter_individuals(const set &indv_to_keep, const set &indv_to_exclude, const vector &indv_to_keep_filenames, const vector &indv_to_exclude_filenames, bool keep_then_exclude) { // Filter individuals by user provided lists if (keep_then_exclude) { 
filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames); filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames); } else { filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames); filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames); } } void variant_file::filter_individuals_by_keep_list(const set &indv_to_keep, const vector &indv_to_keep_filenames) { // Filter individuals by user provided list if ((indv_to_keep_filenames.size() == 0) && (indv_to_keep.size() == 0)) return; LOG.printLOG("Keeping individuals in 'keep' list\n"); set indv_to_keep_copy = indv_to_keep; if (indv_to_keep_filenames.size() != 0) { for (unsigned int ui=0; ui> tmp_indv; indv_to_keep_copy.insert(tmp_indv); ss.clear(); } infile.close(); } } for (unsigned int ui=0; ui &indv_to_exclude, const vector &indv_to_exclude_filenames) { // Filter individuals by user provided list if ((indv_to_exclude_filenames.size() == 0) && (indv_to_exclude.size() == 0)) return; LOG.printLOG("Excluding individuals in 'exclude' list\n"); set indv_to_exclude_copy = indv_to_exclude; if (indv_to_exclude_filenames.size() != 0) { for (unsigned int ui=0; ui> tmp_indv; indv_to_exclude_copy.insert(tmp_indv); ss.clear(); } infile.close(); } } for (unsigned int ui=0; ui keep_index(N_kept_indv); int count = 0; for (unsigned int ui=0; ui tmp_files(meta_data.N_indv); vector tmp_filenames(meta_data.N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); (*tmp_file) << meta_data.indv[ui] << "\t" << meta_data.indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0; tmp_files[ui] = tmp_file; tmp_filenames[ui] = tmpname; } ofstream MAP(map_file.c_str()); if (!MAP.is_open()) LOG.error("Could not open output 
file: " + map_file, 12); int POS; string ID, CHROM, CHROM2; map CHROM_to_PLINK; if (params.chrom_map_file != "") { ifstream chrom_map(params.chrom_map_file.c_str()); if (!chrom_map.is_open()) LOG.error("Could not open chromosome mapping file: " + params.chrom_map_file); string chr, plink; unsigned int N_chrom_entries=0; while (!chrom_map.eof()) { chrom_map >> chr >> plink; CHROM_to_PLINK[chr] = plink; N_chrom_entries++; } chrom_map.close(); LOG.printLOG("\tRead " + output_log::int2str(N_chrom_entries) + " chromosome mapping file entries.\n"); } else { for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["chrM"] = "M"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; CHROM_to_PLINK["M"] = "M"; } vector alleles; char phase; pair genotype; vector variant_line; entry *e = get_entry_object(); ofstream *tmp_file; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() > 2) { LOG.one_off_warning("\tPLINK: Only outputting biallelic loci."); continue; } POS = e->get_POS(); ID = e->get_ID(); CHROM = e->get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { string tmp = ""; if (CHROM.compare(0,3,"chr") == 0) tmp = CHROM.substr(3, string::npos); else tmp = CHROM; bool isNumber = true; for(unsigned int ui=0; uiget_alleles_vector(alleles); for (unsigned int ui=0; uiinclude_genotype[ui] == true) { e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); phase = e->get_indv_PHASE(ui); } if (genotype.first < 0) (*tmp_file) << "\t0"; else if (genotype.first > 1) LOG.error("File contains 
entries with nonexistent genotypes at " + CHROM + ":" + output_log::int2str(e->get_POS()) ); else (*tmp_file) << "\t" << alleles[genotype.first]; if (genotype.second < 0) { if (phase == '/') (*tmp_file) << "\t0"; else if (genotype.first > -1) (*tmp_file) << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc else (*tmp_file) << "\t0"; } else if (genotype.second > 1) LOG.error("File contains entries with nonexistent genotypes at " + CHROM + ":" + output_log::int2str(e->get_POS()) ); else (*tmp_file) << "\t" << alleles[genotype.second]; } } MAP.close(); ofstream PED(ped_file.c_str()); if (!PED.is_open()) LOG.error("Could not open output file: " + ped_file, 12); string tmp_line; for (unsigned int ui=0; uiclose(); delete tmp_file; ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); getline(read_file, tmp_line); PED << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } PED.close(); delete e; LOG.printLOG("Done.\n"); } // Output as Plink Transposed file void variant_file::output_as_plink_tped(const parameters ¶ms) { // Output as PLINK formatted PED/MAP files. if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output as PLINK TPED."); LOG.printLOG("Writing PLINK TPED file ... 
"); string tped_file = params.output_prefix + ".tped"; string tfam_file = params.output_prefix + ".tfam"; ofstream TPED(tped_file.c_str()); if (!TPED.is_open()) LOG.error("Could not open output file: " + tped_file, 12); string CHROM, CHROM2; map CHROM_to_PLINK; if (params.chrom_map_file != "") { ifstream chrom_map(params.chrom_map_file.c_str()); if (!chrom_map.is_open()) LOG.error("Could not open chromosome mapping file: " + params.chrom_map_file); string chr, plink; unsigned int N_chrom_entries=0; while (!chrom_map.eof()) { chrom_map >> chr >> plink; CHROM_to_PLINK[chr] = plink; N_chrom_entries++; } chrom_map.close(); LOG.printLOG("\n\tRead " + output_log::int2str(N_chrom_entries) + " chromosome mapping file entries.\n"); } else { for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["chrM"] = "M"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; CHROM_to_PLINK["M"] = "M"; } vector alleles; char phase; pair genotype; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() > 2) // Only output sites with at most one alternative allele { LOG.one_off_warning("\tPLINK-TPED: Only outputting biallelic loci."); continue; } CHROM = e->get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { string tmp = ""; if (CHROM.compare(0,3,"chr") == 0) tmp = CHROM.substr(3, string::npos); else tmp = CHROM; bool isNumber = true; for(unsigned int ui=0; uiget_ID() == ".") TPED << CHROM2 << "\t" << e->get_CHROM() << ":" << e->get_POS() << "\t0\t" << e->get_POS(); 
else TPED << CHROM2 << "\t" << e->get_ID() << "\t0\t" << e->get_POS(); e->get_alleles_vector(alleles); for (unsigned int ui=0; uiinclude_genotype[ui] == true) { e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); phase = e->get_indv_PHASE(ui); } if (genotype.first < 0) TPED << "\t0"; else if (genotype.first > 1) LOG.error("File contains entries with nonexistent genotypes at " + CHROM + ":" + output_log::int2str(e->get_POS()) ); else TPED << "\t" << alleles[genotype.first]; if (genotype.second < 0) { if (phase == '/') TPED << "\t0"; else if (genotype.first > -1) TPED << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc else TPED << "\t0"; } else if (genotype.second > 1) LOG.error("File contains entries with nonexistent genotypes at " + CHROM + ":" + output_log::int2str(e->get_POS()) ); else TPED << "\t" << alleles[genotype.second]; } TPED << endl; } TPED.close(); LOG.printLOG("Writing PLINK TFAM file ... "); ofstream TFAM(tfam_file.c_str()); if (!TFAM.is_open()) LOG.error("Could not open output file: " + tfam_file, 12); for (unsigned int ui=0; ui tmp_files(meta_data.N_indv); vector tmp_filenames(meta_data.N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); (*tmp_file) << ui; tmp_files[ui] = tmp_file; tmp_filenames[ui] = tmpname; } FAM.close(); vector alleles; pair genotype; vector variant_line; entry *e = get_entry_object(); ofstream *tmp_file; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() > 2) { LOG.one_off_warning("\t012: Only outputting biallelic loci."); continue; } MAP << e->get_CHROM() << "\t" << e->get_POS() << endl; e->get_alleles_vector(alleles); for (unsigned 
int ui=0; uiinclude_genotype[ui] == true) { e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); } if ((genotype.first < 0) && (genotype.second < 0)) (*tmp_file) << "\t-1"; // Missing data else if ((genotype.first == 0) && (genotype.second == 0)) (*tmp_file) << "\t0"; // No copies of the alternative allele else { if ((genotype.first == 1) && (genotype.second == 1)) (*tmp_file) << "\t2"; // Two copies of the alternative allele else (*tmp_file) << "\t1"; // Must be one copy of the alternative allele. } } } ofstream PED(ped_file.c_str()); if (!PED.is_open()) LOG.error("Could not open output file: " + ped_file, 12); string tmp_line; for (unsigned int ui=0; uiclose(); delete tmp_file; ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); PED << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } delete e; MAP.close(); PED.close(); LOG.printLOG("Done.\n"); } // Output as IMPUTE format void variant_file::output_as_IMPUTE(const parameters ¶ms) { if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output IMPUTE format."); LOG.printLOG("Outputting in IMPUTE format (bi-allelic, completely phased SNPs only)\n"); unsigned int ui; string legend_file = params.output_prefix + ".impute.legend"; string haplotype_file = params.output_prefix + ".impute.hap"; string indv_file = params.output_prefix + ".impute.hap.indv"; ofstream legend(legend_file.c_str()); if (!legend.is_open()) LOG.error("Could not open IMPUTE Legend Output File: " + legend_file, 2); legend << "ID pos allele0 allele1" << endl; ofstream hap(haplotype_file.c_str()); if (!hap.is_open()) LOG.error("Could not open IMPUTE Haplotype Output File: " + haplotype_file, 2); ofstream 
indv_out(indv_file.c_str()); if (!indv_out.is_open()) LOG.error("Could not open IMPUTE Individual Output File: " + indv_file, 2); for (ui=0; ui alleles; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tIMPUTE: Only outputting biallelic loci."); continue; } // Exclude entries with missing data and/or unphased bool missing = false; for (ui=0; uiinclude_genotype[ui] == false) { missing = true; break; } e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if ((alleles.first < 0) || (alleles.second < 0)) { missing = true; break; } if (e->get_indv_PHASE(ui) != '|') { missing = true; break; } } if (missing == true) continue; if (e->get_ID() == ".") { legend << e->get_CHROM() << "-" << e->get_POS() << " " << e->get_POS() << " " << e->get_REF() << " " << e->get_ALT_allele(0) << endl; } else legend << e->get_ID() << " " << e->get_POS() << " " << e->get_REF() << " " << e->get_ALT_allele(0) << endl; bool first = true; for (ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (first == true) { hap << alleles.first << " " << alleles.second; first = false; } else hap << " " << alleles.first << " " << alleles.second; } hap << endl; } delete e; hap.close(); legend.close(); } void variant_file::output_as_LDhat_phased(const parameters ¶ms) { if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhat format."); LOG.printLOG("Outputting in phased LDhat format\n"); unsigned int n_sites = 0; int max_pos = -1; int ret = -1; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); string locs_tmp_filename(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 
12); ::close(ret); ofstream locs_tmp_file(tmpname, std::ios::out | std::ios::binary); string sites_file = params.output_prefix + ".ldhat.sites"; string locs_file = params.output_prefix + ".ldhat.locs"; ofstream sites(sites_file.c_str()); if (!sites.is_open()) LOG.error("Could not open LDhat sites Output File: " + sites_file, 2); ofstream locs(locs_file.c_str()); if (!locs.is_open()) LOG.error("Could not open LDhat locs Output File: " + locs_file, 2); unsigned int n_indv = N_kept_individuals(); pair alleles; vector tmp_files(2*meta_data.N_indv); vector tmp_filenames(2*meta_data.N_indv); for (unsigned int ui=0; uiclose(); remove(tmpname); locs_tmp_file.close(); remove(locs_tmp_filename.c_str()); for (unsigned int uj=0; ujclose(); remove(tmp_filenames[2*uj].c_str()); (tmp_files[2*uj+1])->close(); remove(tmp_filenames[2*uj+1].c_str()); } LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); } ::close(ret); tmp_files[2*ui] = tmp_file; tmp_filenames[2*ui] = tmpname; char tmpname2[new_tmp.size()]; strcpy(tmpname2, new_tmp.c_str()); ret = mkstemp(tmpname2); ofstream *tmp_file2 = new ofstream(tmpname2); if (ret == -1) { // Clean up temp files. 
tmp_file2->close(); remove(tmpname2); locs_tmp_file.close(); remove(locs_tmp_filename.c_str()); for (unsigned int uj=0; ujclose(); remove(tmp_filenames[2*uj].c_str()); (tmp_files[2*uj+1])->close(); remove(tmp_filenames[2*uj+1].c_str()); } (tmp_files[2*ui])->close(); remove(tmp_filenames[2*ui].c_str()); LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); } ::close(ret); tmp_files[2*ui+1] = tmp_file2; tmp_filenames[2*ui+1] = tmpname2; } vector variant_line; entry *e = get_entry_object(); ofstream *tmp_file; int POS; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLDhat: Only outputting biallelic loci."); continue; } POS = e->get_POS(); max_pos = max(POS, max_pos); locs_tmp_file << POS << endl; for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); for (unsigned int k=0; k<2; k++) { tmp_file = tmp_files[(2*ui)+k]; int geno; if (k == 0) geno = alleles.first; else geno = alleles.second; if ((geno >= 0) && (e->include_genotype[ui]==true)) (*tmp_file) << geno; else (*tmp_file) << "?"; } } n_sites++; } locs << n_sites; locs.setf(ios::fixed,ios::floatfield); locs.precision(4); locs << "\t" << max_pos / 1000.0 << "\tL" << endl; ifstream locs_read_file(locs_tmp_filename.c_str()); string tmp_line; for (unsigned int ui=0; uiclose(); delete tmp_file; ifstream read_file(tmp_filenames[2*ui+k].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << 
meta_data.indv[ui] << "-" << k << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[2*ui+k].c_str()); } } delete e; remove(locs_tmp_filename.c_str()); sites.close(); } void variant_file::output_as_LDhat_unphased(const parameters ¶ms) { if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhat format."); LOG.printLOG("Outputting in unphased LDhat format\n"); unsigned int n_sites = 0; int max_pos = -1; int ret = -1; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); string locs_tmp_filename(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream locs_tmp_file(tmpname, std::ios::out | std::ios::binary); string sites_file = params.output_prefix + ".ldhat.sites"; string locs_file = params.output_prefix + ".ldhat.locs"; ofstream sites(sites_file.c_str()); if (!sites.is_open()) LOG.error("Could not open LDhat sites Output File: " + sites_file, 2); ofstream locs(locs_file.c_str()); if (!locs.is_open()) LOG.error("Could not open LDhat locs Output File: " + locs_file, 2); unsigned int n_indv = N_kept_individuals(); pair alleles; vector tmp_files(meta_data.N_indv); vector tmp_filenames(meta_data.N_indv); for (unsigned int ui=0; uiclose(); remove(tmpname); locs_tmp_file.close(); remove(locs_tmp_filename.c_str()); for (unsigned int uj=0; ujclose(); remove(tmp_filenames[uj].c_str()); } LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); } ::close(ret); tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } vector variant_line; entry *e = get_entry_object(); ofstream *tmp_file; int POS; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) 
continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLDhat: Only outputting biallelic loci."); continue; } POS = e->get_POS(); max_pos = max(POS, max_pos); locs_tmp_file << POS << endl; for (unsigned int ui=0; uiinclude_genotype[ui] == false) (*tmp_file) << "?"; else { e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); switch (alleles.first) { case -2: (*tmp_file) << "?"; break; case -1: (*tmp_file) << "?"; break; case 0: if (alleles.second == 0) (*tmp_file) << 0; else if (alleles.second == 1) (*tmp_file) << 2; else if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) (*tmp_file) << 0; // Haploid case else if (alleles.second == -2) (*tmp_file) << 0; // Haploid case else (*tmp_file) << '?'; break; case 1: if (alleles.second == 0) (*tmp_file) << 2; else if (alleles.second == 1) (*tmp_file) << 1; else if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) (*tmp_file) << 1; // Haploid case else if (alleles.second == -2) (*tmp_file) << 1; // Haploid case else (*tmp_file) << '?'; break; default: (*tmp_file) << '?'; break; } } } n_sites++; } locs << n_sites; locs.setf(ios::fixed,ios::floatfield); locs.precision(4); locs << "\t" << max_pos / 1000.0 << "\tL" << endl; ifstream locs_read_file(locs_tmp_filename.c_str()); string tmp_line; for (unsigned int ui=0; uiclose(); delete tmp_file; ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << meta_data.indv[ui] << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } delete e; remove(locs_tmp_filename.c_str()); sites.close(); } // Output LDhelmet format void variant_file::output_as_LDhelmet(const parameters ¶ms) { if 
(meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhelmet format."); LOG.printLOG("Outputting in LDhelmet format\n"); unsigned int n_snps = 0; int max_pos = -1; int ret = -1; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); string pos_tmp_filename(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream pos_tmp_file(tmpname, std::ios::out | std::ios::binary); string snps_file = params.output_prefix + ".ldhelmet.snps"; string pos_file = params.output_prefix + ".ldhelmet.pos"; ofstream snps(snps_file.c_str()); if (!snps.is_open()) LOG.error("Could not open LDhelmet snps Output File: " + snps_file, 2); ofstream pos(pos_file.c_str()); if (!pos.is_open()) LOG.error("Could not open LDhelmet pos Output File: " + pos_file, 2); unsigned int n_indv = N_kept_individuals(); pair genotypes; vector alleles; vector tmp_files(2*meta_data.N_indv); vector tmp_filenames(2*meta_data.N_indv); for (unsigned int ui=0; uiclose(); remove(tmpname); pos_tmp_file.close(); remove(pos_tmp_filename.c_str()); for (unsigned int uj=0; ujclose(); remove(tmp_filenames[2*uj].c_str()); (tmp_files[2*uj+1])->close(); remove(tmp_filenames[2*uj+1].c_str()); } LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); } ::close(ret); tmp_files[2*ui] = tmp_file; tmp_filenames[2*ui] = tmpname; char tmpname2[new_tmp.size()]; strcpy(tmpname2, new_tmp.c_str()); ret = mkstemp(tmpname2); ofstream *tmp_file2 = new ofstream(tmpname2); if (ret == -1) { // Clean up temp files. 
tmp_file2->close(); remove(tmpname2); pos_tmp_file.close(); remove(pos_tmp_filename.c_str()); for (unsigned int uj=0; ujclose(); remove(tmp_filenames[2*uj].c_str()); (tmp_files[2*uj+1])->close(); remove(tmp_filenames[2*uj+1].c_str()); } (tmp_files[2*ui])->close(); remove(tmp_filenames[2*ui].c_str()); LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); } ::close(ret); tmp_files[2*ui+1] = tmp_file2; tmp_filenames[2*ui+1] = tmpname2; } vector variant_line; entry *e = get_entry_object(); ofstream *tmp_file; int POS; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); POS = e->get_POS(); max_pos = max(POS, max_pos); pos_tmp_file << POS << endl; for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotypes); e->get_alleles_vector(alleles); for (unsigned int k=0; k<2; k++) { tmp_file = tmp_files[(2*ui)+k]; int geno; if (k == 0) geno = genotypes.first; else geno = genotypes.second; if ((geno >= 0) && (e->include_genotype[ui]==true)) (*tmp_file) << alleles[geno]; else (*tmp_file) << "N"; } } n_snps++; } pos.setf(ios::fixed,ios::floatfield); pos.precision(4); ifstream pos_read_file(pos_tmp_filename.c_str()); string tmp_line; for (unsigned int ui=0; uiclose(); delete tmp_file; ifstream read_file(tmp_filenames[2*ui+k].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); snps << ">" << meta_data.indv[ui] << "-" << k << endl; snps << tmp_line << endl; read_file.close(); remove(tmp_filenames[2*ui+k].c_str()); } } delete e; 
remove(pos_tmp_filename.c_str()); snps.close(); } // Output INFO fields in tab-delimited format void variant_file::output_INFO_for_each_site(const parameters ¶ms) { LOG.printLOG("Outputting INFO for each site\n"); string output_file = params.output_prefix + ".INFO"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open INFO output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\tREF\tALT"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, false, true); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e->get_REF() << "\t" << e->get_ALT(); for (unsigned int ui=0; uiget_INFO_value(params.INFO_to_extract[ui]); out << endl; } delete e; } // Output FORMAT information in tab-delimited format. void variant_file::output_FORMAT_information(const parameters ¶ms) { string FORMAT_id = params.FORMAT_id_to_extract; if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output FORMAT information."); LOG.printLOG("Outputting FORMAT information for " + FORMAT_id + "\n"); string output_file = params.output_prefix + "." 
+ FORMAT_id + ".FORMAT"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open FORMAT Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); e->parse_full_entry(true); if (e->FORMAT_id_exists(FORMAT_id) == false) continue; out << e->get_CHROM() << "\t" << e->get_POS(); for (unsigned int ui=0; uiread_indv_generic_entry(ui, FORMAT_id, FORMAT_out); out << "\t" << FORMAT_out; } out << endl; } delete e; } // Output genotype likelihoods from GL or PL FORMAT tag, ready for input into BEAGLE // using the Genotype likelihoods file format. void variant_file::output_BEAGLE_genotype_likelihoods(const parameters ¶ms, int GL_or_PL) { if (meta_data.has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output BEAGLE genotype likelihoods."); if (GL_or_PL == 0) LOG.printLOG("Outputting GLs in BEAGLE Genotype Likelihood format (bi-allelic SNPs with GL tags only)\n"); else if (GL_or_PL == 1) LOG.printLOG("Outputting PLs in BEAGLE Genotype Likelihood format (bi-allelic SNPs with PL tags only)\n"); else LOG.error("Unknown GL or PL option."); string output_file = params.output_prefix + ".BEAGLE.GL"; if (GL_or_PL == 1) output_file = params.output_prefix + ".BEAGLE.PL"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Beagle GL/PL Output file: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "marker\talleleA\talleleB"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); 
double lk1, lk2, lk3; bool found_GL=false; istringstream ss; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tBEAGLE: Only outputting biallelic loci."); continue; } e->parse_full_entry(true); if (GL_or_PL == 0) if (e->FORMAT_id_exists("GL") == false) continue; if (GL_or_PL == 1) if (e->FORMAT_id_exists("PL") == false) continue; found_GL = true; out << e->get_CHROM() << ":" << e->get_POS() << "\t" << e->get_REF() << "\t" << e->get_ALT(); for (unsigned int ui=0; uiinclude_genotype[ui] == true) { if (GL_or_PL == 0) e->read_indv_generic_entry(ui, "GL", GL_entry); else e->read_indv_generic_entry(ui, "PL", GL_entry); ss.clear(); ss.str(GL_entry); getline(ss, tmp_string, ','); lk1 = atof(tmp_string.c_str()); getline(ss, tmp_string, ','); lk2 = atof(tmp_string.c_str()); getline(ss, tmp_string); lk3 = atof(tmp_string.c_str()); if (GL_or_PL == 0) out << "\t" << pow(10,lk1) << "\t" << pow(10,lk2) << "\t" << pow(10,lk3); else out << "\t" << pow(10,-lk1*0.1) << "\t" << pow(10,-lk2*0.1) << "\t" << pow(10,-lk3*0.1); } else { out << "\t1\t1\t1"; // Mark as unknown } } out << endl; } delete e; if (found_GL == false) LOG.error("Require GL or PL FORMAT tags in VCF file to output BEAGLE input."); } vcftools-0.1.15/src/cpp/variant_file_output.cpp000066400000000000000000004345431307140004000215770ustar00rootroot00000000000000/* * variant_file_output.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 249 $) */ #include "variant_file.h" void variant_file::output_frequency(const parameters ¶ms, bool output_counts) { // Output statistics of frequency at each site if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Frequency Statistics."); LOG.printLOG("Outputting Frequency Statistics...\n"); string output_file = 
params.output_prefix + ".frq"; if (output_counts) output_file += ".count"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Frequency output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); if (params.suppress_allele_output == false) { out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:"; if (output_counts) out << "COUNT}\n"; else out << "FREQ}\n"; } else { if (output_counts) out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}\n"; else out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}\n"; } vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; vector variant_line; entry *e = get_entry_object(); unsigned int aa_idx = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; if (params.derived) e->parse_basic_entry(true, false, true); else e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); if (params.derived) { aa_idx = 0; string AA = e->get_INFO_value("AA"); std::transform(AA.begin(), AA.end(), AA.begin(), ::toupper); // Comment this out if only want high quality sites. 
if ((AA == "?") || (AA == ".")) { LOG.one_off_warning("\tWarning: Cannot output derived allele frequencies without Ancestral Alleles (AA)"); continue; } else { bool found = false; for (unsigned int ui=0; uiget_allele(ui)) { aa_idx = ui; found = true; break; } } if (found == false) { LOG.one_off_warning("\tWarning: Ancestral allele does not match any SNP allele."); continue; } } } e->get_allele_counts(allele_counts, N_non_missing_chr); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << N_alleles << "\t" << N_non_missing_chr; if (output_counts) { if (params.suppress_allele_output == false) { out << "\t" << e->get_allele(aa_idx) << ":" << allele_counts[aa_idx]; for (unsigned int ui=0; uiget_allele(ui) << ":" << allele_counts[ui]; } out << "\n"; } else { out << "\t" << allele_counts[aa_idx]; for (unsigned ui=0; uiget_allele(aa_idx) << ":" << freq; for (unsigned int ui=0; uiget_allele(ui) << ":" << freq; } } out << "\n"; } else { freq = allele_counts[aa_idx] / (double)N_non_missing_chr; out << "\t" << freq; for (unsigned int ui=0; ui allele_counts; unsigned int N_non_missing_chr; vector N_sites_included(meta_data.N_indv, 0); vector N_obs_hom(meta_data.N_indv, 0); vector N_expected_hom(meta_data.N_indv, 0.0); pair alleles; vector variant_line; entry *e = get_entry_object(); while (!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); e->parse_basic_entry(true); if(!e->passed_filters) continue; N_kept_entries++; if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tIndividual Heterozygosity: Only using biallelic SNPs."); continue; } e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tIndividual Heterozygosity: Only using fully diploid SNPs."); continue; } // Frequency of non-reference allele e->get_allele_counts(allele_counts, N_non_missing_chr); if (N_non_missing_chr > 0) freq = allele_counts[1] / double(N_non_missing_chr); else freq = -1; if ((freq <= numeric_limits::epsilon()) || (1.0 
- freq <= numeric_limits::epsilon())) continue; for (unsigned int ui=0; uiinclude_genotype[ui] == true) { e->get_indv_GENOTYPE_ids(ui, alleles); if ((alleles.first > -1) && (alleles.second > -1)) { N_sites_included[ui]++; if (alleles.first == alleles.second) N_obs_hom[ui]++; N_expected_hom[ui] += 1.0 - (2.0 * freq * (1.0 - freq) * (N_non_missing_chr / (N_non_missing_chr - 1.0))); } } } } out.setf(ios::fixed,ios::floatfield); for (unsigned int ui=0; ui 0) { double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]); out << meta_data.indv[ui] << "\t" << N_obs_hom[ui] << "\t"; out.precision(1); out << N_expected_hom[ui] << "\t"; out.precision(5); out << N_sites_included[ui] << "\t" << F << endl; } } delete e; } void variant_file::output_hwe(const parameters ¶ms) { // Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005) if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output HWE Statistics."); // Note this assumes Biallelic SNPs. 
LOG.printLOG("Outputting HWE statistics (but only for biallelic loci)\n"); string output_file = params.output_prefix + ".hwe"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq_HWE\tP_HWE\tP_HET_DEFICIT\tP_HET_EXCESS" << endl; /* PLINK code: // b11 = Nhom1, b12 = Nhet, b22 = Nhom2 double tot = b11 + b12 + b22; double exp_11 = freq * freq * tot; double exp_12 = 2 * freq * (1-freq) * tot; double exp_22 = (1-freq) * (1-freq) * tot; double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ; p = chiprobP(chisq,1); */ double freq; unsigned int b11, b12, b22; double exp_11, exp_12, exp_22; double chisq; double tot; double p_hwe, p_lo, p_hi; unsigned int precision = out.precision(); vector allele_counts; unsigned int N_non_missing_chr; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tHWE: Only using biallelic SNPs."); continue; // Isn't biallelic } e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tHWE: Only using fully diploid SNPs."); continue; // Isn't diploid } e->get_allele_counts(allele_counts, N_non_missing_chr); freq = allele_counts[0] / (double)N_non_missing_chr; e->get_genotype_counts(b11, b12, b22); tot = b11 + b12 + b22; exp_11 = freq * freq * tot; exp_12 = 2.0 * freq * (1.0-freq) * tot; exp_22 = (1.0-freq) * (1.0-freq) * tot; chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) 
) / exp_22; entry::SNPHWE(b12, b11, b22, p_hwe, p_lo, p_hi); out << e->get_CHROM() << "\t" << e->get_POS(); out << "\t" << b11 << "/" << b12 << "/" << b22; out.precision(2); out << fixed << "\t" << exp_11 << "/" << exp_12 << "/" << exp_22; out.precision(precision); out << scientific; out << "\t" << chisq << "\t" << p_hwe << "\t" << p_lo << "\t" << p_hi << endl; } delete e; } void variant_file::output_indv_burden(const parameters ¶ms) { // Output the burden within each individual of variants at each frequency. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Burden Statistics."); LOG.printLOG("Outputting variant burden by individual\n"); string output_file = params.output_prefix + ".iburden"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Burden Output File: " + output_file, 2); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); vector< int > hom_ref_burden(meta_data.N_indv, 0); vector< int > het_burden(meta_data.N_indv, 0); vector< int > hom_alt_burden(meta_data.N_indv, 0); vector< int > missing_burden(meta_data.N_indv, 0); if (params.derived) out << "INDV\tN_HOM_ANC\tN_HET\tN_HOM_DER\tN_MISS" << endl; else out << "INDV\tN_HOM_REF\tN_HET\tN_HOM_ALT\tN_MISS" << endl; unsigned int N_alleles; vector variant_line; entry *e = get_entry_object(); int aa_idx = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; if (params.derived) e->parse_basic_entry(true, false, true); else e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); if (e->is_diploid() == false) { LOG.one_off_warning("\tWarning: Only using fully diploid sites."); continue; } if (params.derived) { aa_idx = 0; string AA = e->get_INFO_value("AA"); 
std::transform(AA.begin(), AA.end(), AA.begin(), ::toupper); // Comment this out if only want high quality sites. if ((AA == "?") || (AA == ".")) { LOG.one_off_warning("\tWarning: Cannot find Ancestral Alleles (AA)"); continue; } else { bool found = false; for (unsigned int ui=0; uiget_allele(ui)) { aa_idx = ui; found = true; break; } } if (found == false) { LOG.one_off_warning("\tWarning: Ancestral allele does not match any SNP allele."); continue; } } } pair geno; for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; if (e->include_genotype[ui] == true) { e->get_indv_GENOTYPE_ids(ui, geno); if ((geno.first == aa_idx) && (geno.second == aa_idx)) hom_ref_burden[ui]++; else if ((geno.first >= 0) && (geno.second >= 0) && (geno.first != geno.second)) het_burden[ui]++; else if ((geno.first >= 0) && (geno.second >= 0) && (geno.first == geno.second)) hom_alt_burden[ui]++; else missing_burden[ui]++; } } } delete e; for (unsigned int ui=0; ui > burden_matrix(N, vector(max_chr_count+1, 0)); out << "INDV"; for (int i=0; i<=max_chr_count; i++) out << "\t" << LOG.int2str(i); out << endl; vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; vector variant_line; entry *e = get_entry_object(); int aa_idx = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; if (params.derived) e->parse_basic_entry(true, false, true); else e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); if (e->is_diploid() == false) { LOG.one_off_warning("\tWarning: Only using fully diploid sites."); continue; } if (params.derived) { aa_idx = 0; string AA = e->get_INFO_value("AA"); std::transform(AA.begin(), AA.end(), AA.begin(), ::toupper); // Comment this out if only want high quality sites. 
if ((AA == "?") || (AA == ".")) { LOG.one_off_warning("\tWarning: Cannot find Ancestral Alleles (AA)"); continue; } else { bool found = false; for (unsigned int ui=0; uiget_allele(ui)) { aa_idx = ui; found = true; break; } } if (found == false) { LOG.one_off_warning("\tWarning: Ancestral allele does not match any SNP allele."); continue; } } } e->get_allele_counts(allele_counts, N_non_missing_chr); pair geno; int indv_count = 0; for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; if (e->include_genotype[ui] == true) { e->get_indv_GENOTYPE_ids(ui, geno); if ((geno.first != aa_idx) && (geno.first >= 0)) burden_matrix[indv_count][allele_counts[geno.first]]++; if ((double_count_hom_alt == 0) || (geno.first != geno.second)) { // Count the second allele if required if ((geno.second != aa_idx) && (geno.second >= 0)) burden_matrix[indv_count][allele_counts[geno.second]]++; } } indv_count++; } } delete e; int indv_count = 0; for (unsigned int ui=0; ui depth_sum(meta_data.N_indv, 0.0); vector count(meta_data.N_indv, 0); int depth; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; if (e->include_genotype[ui] == true) { e->parse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { depth_sum[ui] += depth; count[ui]++; } } } } for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); map > bins; vector chrs; unsigned int idx; double C = 1.0 / double(bin_size); int prev_pos = -1; string prev_chrom = ""; string alt; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); CHROM = e->get_CHROM(); POS = e->get_POS(); alt = e->get_ALT(); if (alt != "." 
&& (POS != prev_pos || CHROM != prev_chrom)) { idx = (unsigned int)(POS * C); if (idx>=bins[CHROM].size()) bins[CHROM].resize(idx+1,0); bins[CHROM][idx]++; } if (CHROM != prev_chrom) chrs.push_back(CHROM); prev_pos = POS; prev_chrom = CHROM; } out << "CHROM\tBIN_START\tSNP_COUNT\tVARIANTS/KB" << endl; int bin_tot; C = 1000.0 / bin_size; for (unsigned int ui=0; ui 0) output = true; if (output == true) out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl; } } delete e; } void variant_file::output_indv_missingness(const parameters ¶ms) { // Output missingness by individual if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Missingness Statistics."); LOG.printLOG("Outputting Individual Missingness\n"); string output_file = params.output_prefix + ".imiss"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Individual Missingness Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl; unsigned int ui; vector indv_N_missing(meta_data.N_indv, 0), indv_N_tot(meta_data.N_indv, 0); vector indv_N_geno_filtered(meta_data.N_indv, 0); pair alleles; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); for (ui=0; uiinclude_genotype[ui] == false) { indv_N_geno_filtered[ui]++; continue; } e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (alleles.first == -1) indv_N_missing[ui]++; indv_N_tot[ui]++; } } for (ui=0; ui alleles; vector variant_line; entry *e = get_entry_object(); out << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl; 
while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); site_N_missing = 0; site_N_tot = 0; site_N_geno_filtered = 0; for (ui=0; uiinclude_genotype[ui] == false) { site_N_geno_filtered++; continue; } e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (alleles.first == -1) site_N_missing++; if (alleles.second == -1) site_N_missing++; site_N_tot+=2; if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) { // Phased missing genotypes indicate haploid genome site_N_tot--; site_N_missing--; } else if (alleles.second == -2) { site_N_tot--; site_N_missing--; } } out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << site_N_tot << "\t" << site_N_geno_filtered << "\t"; out << site_N_missing << "\t" << double(site_N_missing) / double(site_N_tot) << endl; } delete e; } void variant_file::calc_hap_r2(vector > >1, vector > >2, double &r2, double &D, double &Dprime, int &chr_count) { double x11=0, x12=0, x21=0, x22=0; double X=0, X2=0, Y=0, Y2=0, XY=0; double sx, sy; double rel_x11, p1, p2, q1, q2, Dmax; double var1, var2, cov12; chr_count = 0; int allele1, allele2; for (unsigned int ui=0; ui geno1, geno2; for (unsigned int ui=0; uiinclude_genotype[ui] == false) || (e2->include_genotype[ui] == false)) continue; e->get_indv_GENOTYPE_ids(ui, geno1); e2->parse_genotype_entry(ui, true); e2->get_indv_GENOTYPE_ids(ui, geno2); // TODO... not yet implemented...! 
LOG.error("Not yet implmented!\n"); } } void variant_file::calc_geno_r2(vector > >1, vector > >2, double &r2, int &indv_count) { double X=0, X2=0, Y=0, Y2=0, XY=0; double sx, sy; indv_count = 0; pair geno1, geno2; for (unsigned int ui=0; ui > >1, vector > >2, int &N0, int &N1, double &chisq, double &dof, double &pval, int &indv_count) { int N_genotypes0 = N0 * (N0+1) / 2; int N_genotypes1 = N1 * (N1+1) / 2; vector > observed(N_genotypes0, vector(N_genotypes1,0)); indv_count = 0; pair geno1, geno2; for (unsigned int ui=0; ui, int> idx_lookup1; int count = 0; for (int uj=0; uj, int> idx_lookup2; count = 0; for (int uj=0; uj > expected(N_genotypes0, vector(N_genotypes1,0)); vector row_tot(N_genotypes0, 0); vector col_tot(N_genotypes1, 0); double tot=0; for (int ui=0; ui 0) && (col_tot[uj] > 0)) // Don't use incomplete cases chisq += pow(observed[ui][uj] - expected[ui][uj], 2) / expected[ui][uj]; } } int n_col=0, n_row=0; for (int ui=0; ui 0) n_row++; for (int ui=0; ui 0) n_col++; dof = (n_row-1) * (n_col-1); pval = 1.0-gammp(dof/2, chisq/2); } // Count the number of haplotypes within user-defined bins void variant_file::output_haplotype_count(const parameters ¶ms) { if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Haplotype Counts."); LOG.printLOG("Outputting Haplotype Counts\n"); ifstream BED(params.hapcount_BED.c_str()); if (!BED.is_open()) LOG.error("Could not open BED file: " + params.hapcount_BED); string line; stringstream ss; string CHROM; int POS1, POS2; int idx; unsigned int N_chr=0; BED.ignore(numeric_limits::max(), '\n');; vector< vector< pair > > bin_positions; map chr_to_idx; while (!BED.eof()) { getline(BED, line); if ((line[0] == '#') || (line.size() == 0)) continue; ss.clear(); ss.str(line); ss >> CHROM >> POS1 >> POS2; if (chr_to_idx.find(CHROM) == chr_to_idx.end()) { N_chr++; chr_to_idx[CHROM] = (N_chr-1); bin_positions.resize(N_chr); } idx = chr_to_idx[CHROM]; 
bin_positions[idx].push_back(make_pair(POS1, POS2)); } BED.close(); for (unsigned int ui=0; ui bin_positions[ui][uj].first) LOG.error("BED file must be non-overlapping.\n", 33); } } vector< vector > haplotypes(2*meta_data.N_indv); vector variant_line; entry *e = get_entry_object(); string haplotype; pair geno; vector min_ui(N_chr, 0); bool have_data=false; string output_file = params.output_prefix + ".hapcount"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Haplotype Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "#CHROM\tBIN_START\tBIN_END\tN_SNP\tN_UNIQ_HAPS\tN_GROUPS\t{MULTIPLICITY:FREQ}" << endl; int bin_idx=0, prev_bin_idx=-1; int prev_idx = -1; vector haplotype_count; vector SNP_count; vector< map > haplotype_frequencies; string prev_CHROM=""; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); CHROM = e->get_CHROM(); POS1 = e->get_POS(); if (chr_to_idx.find(CHROM) == chr_to_idx.end()) continue; idx = chr_to_idx[CHROM]; if (idx != prev_idx) { // Moved to a new chromosome, so output last chromosome map, int > haplotype_set; if (have_data == true) { // Process any remaining data for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; haplotype_set[haplotypes[(2*ui)]]++; haplotype_set[haplotypes[(2*ui)+1]]++; SNP_count[prev_bin_idx] = haplotypes[2*ui].size(); haplotypes[(2*ui)].resize(0); haplotypes[(2*ui)+1].resize(0); } haplotype_count[prev_bin_idx] = haplotype_set.size(); for (map, int >::iterator it=haplotype_set.begin(); it != haplotype_set.end(); ++it) haplotype_frequencies[prev_bin_idx][it->second]++; } have_data = false; for (unsigned int ui=0; ui::iterator it = haplotype_frequencies[ui].begin(); it != haplotype_frequencies[ui].end(); ++it) out 
<< "\t" << it->second << ":" << it->first; out << endl; } // Set up for new chromosome unsigned int N_bins = bin_positions[idx].size(); haplotype_count.clear(); haplotype_count.resize(N_bins, 0); SNP_count.clear(); SNP_count.resize(N_bins, 0); haplotype_frequencies.clear(); haplotype_frequencies.resize(N_bins); bin_idx=0, prev_bin_idx=-1; prev_idx = idx; prev_CHROM = CHROM; } bool found=false; unsigned int max_ui = bin_positions[idx].size(); for (unsigned int ui=min_ui[idx]; ui bin_positions[idx][ui].first) && (POS1 <= bin_positions[idx][ui].second)) { // We're in a BED bin, so add to haplotypes found=true; prev_bin_idx = bin_idx; bin_idx = ui; break; } else if (POS1 > bin_positions[idx][ui].second) min_ui[idx] = ui+1; } if ((found == false) || (prev_bin_idx != bin_idx)) { // Changed bin, so update haplotype count in previous bin, and reset for next bin if (have_data == true) { map, int > haplotype_set; for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; haplotype_set[haplotypes[(2*ui)]]++; haplotype_set[haplotypes[(2*ui)+1]]++; SNP_count[prev_bin_idx] = haplotypes[2*ui].size(); haplotypes[(2*ui)].resize(0); haplotypes[(2*ui)+1].resize(0); } haplotype_count[prev_bin_idx] = haplotype_set.size(); for (map, int >::iterator it=haplotype_set.begin(); it != haplotype_set.end(); ++it) haplotype_frequencies[prev_bin_idx][it->second]++; } have_data = false; } if (found == true) { // Inside a bin, so append to haplotypes have_data = true; e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tWarning: Only using fully diploid sites."); continue; } for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; geno.first = -1; geno.second = -1; if (e->include_genotype[ui] == true) e->get_indv_GENOTYPE_ids(ui, geno); haplotypes[(2*ui)].push_back(geno.first); haplotypes[(2*ui)+1].push_back(geno.second); } } } delete e; if (idx == prev_idx) { // Output any remaining data from last chromosome if (have_data == true) { // Process any 
remaining data map, int > haplotype_set; for (unsigned int ui=0; uiinclude_indv[ui] == false) continue; haplotype_set[haplotypes[(2*ui)]]++; haplotype_set[haplotypes[(2*ui)+1]]++; SNP_count[prev_bin_idx] = haplotypes[2*ui].size(); } haplotype_count[prev_bin_idx] = haplotype_set.size(); for (map, int >::iterator it=haplotype_set.begin(); it != haplotype_set.end(); ++it) haplotype_frequencies[prev_bin_idx][it->second]++; } for (unsigned int ui=0; ui::iterator it = haplotype_frequencies[ui].begin(); it != haplotype_frequencies[ui].end(); ++it) out << "\t" << it->second << ":" << it->first; out << endl; } } } void variant_file::output_haplotype_r2(const parameters ¶ms) { // Output pairwise LD statistics, using traditional r^2. Requires phased haplotypes. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); int snp_window_size = params.ld_snp_window_size; int snp_window_min = params.ld_snp_window_min; int bp_window_size = params.ld_bp_window_size; int bp_window_min = params.ld_bp_window_min; double min_r2 = params.min_r2; LOG.printLOG("Outputting Pairwise LD (phased bi-allelic only)\n"); string output_file = params.output_prefix + ".hap.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR\tPOS1\tPOS2\tN_CHR\tR^2\tD\tDprime" << endl; double r2, D, Dprime; int chr_count, site_count = 0; unsigned int skip = (unsigned int)max((int)1, snp_window_min); pair geno; vector variant_line; string CHROM,CHROM2; int POS,POS2,ret = -1; vector out_line, tmp_int; entry *e = get_entry_object(); string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not 
open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); out_line.reserve(meta_data.N_indv+10); int indv_miss = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); site_count++; string chrom_str = CHROM+"\n"; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiget_indv_PHASE(ui) != '|') { remove(tmpname); LOG.error("Require phased haplotypes for r^2 calculation (use --phased)\n"); } if (include_indv[ui] == false) { indv_miss++; continue; } if (e->include_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) > 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Cannot use polyploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } fd.write(&out_line[0],out_line.size()); } fd.close(); if (N_kept_entries <= 1) { remove(tmpname); LOG.error("Insufficient sites remained after filtering"); } ifstream tmp_file(tmpname, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; unsigned int uj = 0; for(unsigned int ui=0; ui snp_window_size) break; GTs2.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); read_temp_site(tmp_file, CHROM2, POS2, GTs2); if(uj < (ui+skip)) continue; if (CHROM != CHROM2) continue; if (POS2 < POS) 
LOG.one_off_warning("Warning: Input is unsorted, results may not be complete."); if ((POS2 - POS) < bp_window_min) continue; if ((POS2 - POS) > bp_window_size) break; calc_hap_r2(GTs, GTs2, r2, D, Dprime, chr_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS << "\t" << POS2 << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << endl; } } tmp_file.close(); remove(tmpname); delete e; } void variant_file::output_genotype_r2(const parameters ¶ms) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); int snp_window_size = params.ld_snp_window_size; int snp_window_min = params.ld_snp_window_min; int bp_window_size = params.ld_bp_window_size; int bp_window_min = params.ld_bp_window_min; double min_r2 = params.min_r2; LOG.printLOG("Outputting Pairwise LD (bi-allelic only)\n"); string output_file = params.output_prefix + ".geno.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl; double r2; int indv_count; unsigned int skip = (unsigned int)max((int)1, snp_window_min); vector variant_line; entry *e = get_entry_object(); int count = 0; string CHROM, CHROM2; int POS, POS2, ret = -1; pair geno; vector out_line, tmp_int; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | 
std::ios::binary); out_line.reserve(meta_data.N_indv+10); int indv_miss = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tgenoLD: Only using biallelic variants."); continue; // Isn't biallelic } count++; e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); string chrom_str = CHROM+"\n"; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiinclude_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) != 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Only using diploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } fd.write(&out_line[0],out_line.size()); } fd.close(); if (N_kept_entries <= 1) { remove(tmpname); LOG.error("Insufficient sites remained after filtering"); } ifstream tmp_file(tmpname, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; unsigned int uj = 0; for(unsigned int ui=0; ui snp_window_size) break; GTs2.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); read_temp_site(tmp_file, CHROM2, POS2, GTs2); if(uj < (ui+skip)) continue; if (CHROM != CHROM2) continue; if (POS2 < POS) LOG.one_off_warning("Warning: Input is unsorted, results may not be complete."); if ((POS2 - POS) < bp_window_min) continue; if ((POS2 - POS) > bp_window_size) break; calc_geno_r2(GTs, GTs2, r2, indv_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS 
<< "\t" << POS2 << "\t" << indv_count << "\t" << r2 << endl; } } tmp_file.close(); remove(tmpname); delete e; } void variant_file::output_genotype_chisq(const parameters ¶ms, double min_pval) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); int snp_window_size = params.ld_snp_window_size; int snp_window_min = params.ld_snp_window_min; int bp_window_size = params.ld_bp_window_size; int bp_window_min = params.ld_bp_window_min; LOG.printLOG("Outputting Pairwise LD\n"); string output_file = params.output_prefix + ".geno.chisq"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR\tPOS1\tPOS2\tN_INDV\tCHI^2\tDOF\tPVAL" << endl; double chisq, dof, pval; int indv_count; unsigned int skip = (unsigned int)max((int)1, snp_window_min); vector variant_line; entry *e = get_entry_object(); string CHROM, CHROM2; int POS, POS2; pair geno; int8_t tmp_alleles; int alleles, alleles2, ret = -1; vector out_line, tmp_int; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); out_line.reserve(2*meta_data.N_indv+11); int indv_miss = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_genotype_entries(true); CHROM = 
e->get_CHROM(); POS = e->get_POS(); tmp_alleles = (int8_t)e->get_N_alleles(); string chrom_str = CHROM+"\n"; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); out_line.push_back(tmp_alleles); int8_t out_byte = 0x00; indv_miss = 0; for (unsigned int ui=0; uiinclude_genotype[ui] == false) { out_byte = 0xFF; out_line.push_back(out_byte); out_line.push_back(out_byte); continue; } if (e->get_indv_ploidy(ui) != 2) { out_byte = 0xFF; out_line.push_back(out_byte); out_line.push_back(out_byte); LOG.one_off_warning("\tgenoLD: Only using diploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte = 0xFF; else out_byte = (int8_t)geno.first; out_line.push_back(out_byte); if (geno.second == -1) out_byte = 0xFF; else out_byte = (int8_t)geno.second; out_line.push_back(out_byte); } fd.write(&out_line[0],out_line.size()); } fd.close(); if (N_kept_entries <= 1) { remove(tmpname); LOG.error("Insufficient sites remained after filtering"); } ifstream tmp_file(tmpname, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; unsigned int uj = 0; for(unsigned int ui=0; ui snp_window_size) break; GTs2.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); read_big_temp_site(tmp_file, CHROM2, POS2, alleles2, GTs2); if(uj < (ui+skip)) continue; if (CHROM != CHROM2) continue; if (POS2 < POS) LOG.one_off_warning("Warning: Input is unsorted, results may not be complete."); if ((POS2 - POS) < bp_window_min) continue; if ((POS2 - POS) > bp_window_size) break; calc_geno_chisq(GTs, GTs2, alleles, alleles2, chisq, dof, pval, indv_count); if (min_pval > 0) if ((pval < min_pval) | (pval != pval)) continue; out << CHROM << "\t" << POS << "\t" << POS2 << "\t" << indv_count << "\t" << chisq << "\t" << dof << "\t" << pval << endl; } } tmp_file.close(); remove(tmpname); delete e; } void 
variant_file::output_interchromosomal_genotype_r2(const parameters ¶ms) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); double min_r2 = params.min_r2; LOG.printLOG("Outputting Interchromosomal Pairwise Genotype LD (bi-allelic only)\n"); string output_file = params.output_prefix + ".interchrom.geno.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl; int indv_count; double r2; vector variant_line; entry *e = get_entry_object(); int count = 0; string CHROM, CHROM2; int POS, POS2, ret = -1; pair geno; vector out_line, tmp_int; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); out_line.reserve(meta_data.N_indv+10); int indv_miss = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } count++; e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); string chrom_str = CHROM+"\n"; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); 
copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiinclude_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) != 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Only using diploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } fd.write(&out_line[0],out_line.size()); } fd.close(); if (N_kept_entries <= 1) { remove(tmpname); LOG.error("Insufficient sites remained after filtering"); } ifstream tmp_file(tmpname, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; unsigned int uj=0; for(unsigned int ui=0; ui 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS << "\t" << CHROM2 << "\t" << POS2 << "\t" << indv_count << "\t" << r2 << endl; } } tmp_file.close(); remove(tmpname); delete e; } void variant_file::output_interchromosomal_haplotype_r2(const parameters ¶ms) { double min_r2 = params.min_r2; // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. 
if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); LOG.printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n"); string output_file = params.output_prefix + ".interchrom.hap.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_CHR\tR^2" << endl; double D, Dprime; int chr_count, site_count = 0; double r2; entry *e; e = get_entry_object(); pair geno; vector variant_line; string CHROM,CHROM2; int POS,POS2,ret=-1; vector out_line, tmp_int; string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); out_line.reserve(meta_data.N_indv+10); int indv_miss = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); site_count++; string chrom_str = CHROM+"\n"; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiget_indv_PHASE(ui) != '|') { remove(tmpname); LOG.error("Require phased haplotypes for r^2 calculation (use --phased)\n"); } if (include_indv[ui] == false) { 
indv_miss++; continue; } if (e->include_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) > 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Cannot use polyploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } fd.write(&out_line[0],out_line.size()); } fd.close(); if (N_kept_entries <= 1) { remove(tmpname); LOG.error("Insufficient sites remained after filtering"); } ifstream tmp_file(tmpname, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; unsigned int uj = 0; for(unsigned int ui=0; ui 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS << "\t" << CHROM2 << "\t" << POS2 << "\t" << chr_count << "\t" << r2 << endl; } } tmp_file.close(); remove(tmpname); delete e; } void variant_file::output_haplotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms) { if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); LOG.printLOG("Outputting haplotype pairwise LD (bi-allelic only) for a set of SNPs versus all others.\n"); int snp_window_size = params.ld_snp_window_size; int snp_window_min = params.ld_snp_window_min; int bp_window_size = params.ld_bp_window_size; int bp_window_min = params.ld_bp_window_min; string positions_file = params.hap_rsq_position_list; double min_r2 = params.min_r2; vector< set > keep_positions; vector list_positions; map chr_to_idx; string line; stringstream ss; pair geno; string CHROM, CHROM2; int POS, POS2, idx, ret = -1; unsigned int N_chr=0; vector out_line, tmp_int; ifstream BED(positions_file.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + positions_file); BED.ignore(numeric_limits::max(), '\n'); int nlist = 0; while (!BED.eof()) 
{ getline(BED, line); if (line[0] == '#' || line == "") continue; ss.clear(); ss.str(line); ss >> CHROM >> POS; if (chr_to_idx.find(CHROM) == chr_to_idx.end()) { N_chr++; chr_to_idx[CHROM] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[CHROM]; keep_positions[idx].insert(POS); nlist += 1; } BED.close(); if (nlist == 0) LOG.error("No sites found in positions file.\n",0); LOG.printLOG("\tRead in "+header::int2str(nlist)+" site(s) for LD analysis.\n"); string output_file = params.output_prefix + ".list.hap.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_CHR\tR^2" << endl; double D, Dprime; int chr_count, site_count = 0; double r2; vector variant_line; entry *e = get_entry_object(); string new_tmp = params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); char tmpname2[new_tmp.size()]; strcpy(tmpname2, new_tmp.c_str()); ret = mkstemp(tmpname2); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd_POS(tmpname2, std::ios::out | std::ios::binary); nlist = 0; int indv_miss = 0; out_line.reserve(meta_data.N_indv+10); while (!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); bool check_pos = false; if ( chr_to_idx.find(CHROM) != chr_to_idx.end() ) if 
( keep_positions[ chr_to_idx[CHROM] ].find(POS) != keep_positions[ chr_to_idx[CHROM] ].end() ) check_pos = true; string chrom_str = CHROM+"\n"; if (check_pos) { nlist++; list_positions.push_back(site_count); } else site_count++; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiget_indv_PHASE(ui) != '|') { remove(tmpname); remove(tmpname2); LOG.error("Require phased haplotypes for r^2 calculation (use --phased)\n"); } if (include_indv[ui] == false) { indv_miss++; continue; } if (e->include_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) > 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Cannot use polyploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } if (check_pos) fd_POS.write(&out_line[0],out_line.size()); else fd.write(&out_line[0],out_line.size()); } fd.close(); fd_POS.close(); ifstream tmp_file(tmpname, ios::binary); ifstream tmp_file2(tmpname2, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; GTs.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); GTs2.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); for(unsigned int ui=0; ui bp_window_size) continue; int list_pos = list_positions[ui]; if ( abs(list_pos - uj) < snp_window_min) continue; if ( abs(list_pos - uj) > snp_window_size) continue; calc_hap_r2(GTs, GTs2, r2, D, Dprime, chr_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS << "\t" << CHROM2 << "\t" << POS2 << "\t" << chr_count << "\t" << r2 << endl; } } tmp_file.close(); tmp_file2.close(); remove(tmpname); 
remove(tmpname2); delete e; } void variant_file::output_genotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms) { if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); LOG.printLOG("Outputting genotype pairwise LD (bi-allelic only) for a set of SNPs versus all others.\n"); int snp_window_size = params.ld_snp_window_size; int snp_window_min = params.ld_snp_window_min; int bp_window_size = params.ld_bp_window_size; int bp_window_min = params.ld_bp_window_min; vector< set > keep_positions; vector list_positions; map chr_to_idx; string line; stringstream ss; string CHROM, CHROM2; int POS, POS2, idx, ret = -1; pair geno; unsigned int N_chr=0; double min_r2 = params.min_r2; vector out_line, tmp_int; ifstream BED(params.geno_rsq_position_list.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + params.geno_rsq_position_list); BED.ignore(numeric_limits::max(), '\n'); int nlist = 0; while (!BED.eof()) { getline(BED, line); if (line[0] == '#' || line == "") continue; ss.clear(); ss.str(line); ss >> CHROM >> POS; if (chr_to_idx.find(CHROM) == chr_to_idx.end()) { N_chr++; chr_to_idx[CHROM] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[CHROM]; keep_positions[idx].insert(POS); nlist++; } BED.close(); if (nlist == 0) LOG.error("No sites found in positions file.\n",0); LOG.printLOG("\tRead in "+header::int2str(nlist)+" site(s) for LD analysis.\n"); string output_file = params.output_prefix + ".list.geno.ld"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LD Output File: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl; int indv_count, site_count = 0; double r2; vector variant_line; entry *e = get_entry_object(); string new_tmp = 
params.temp_dir+"/vcftools.XXXXXX"; char tmpname[new_tmp.size()]; strcpy(tmpname, new_tmp.c_str()); ret = mkstemp(tmpname); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd(tmpname, std::ios::out | std::ios::binary); char tmpname2[new_tmp.size()]; strcpy(tmpname2, new_tmp.c_str()); ret = mkstemp(tmpname2); if (ret == -1) LOG.error(" Could not open temporary file.\n", 12); ::close(ret); ofstream fd_POS(tmpname2, std::ios::out | std::ios::binary); nlist = 0; int indv_miss = 0; out_line.reserve(meta_data.N_indv+10); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); bool check_pos = false; if ( chr_to_idx.find(CHROM) != chr_to_idx.end() ) if ( keep_positions[ chr_to_idx[CHROM] ].find(POS) != keep_positions[ chr_to_idx[CHROM] ].end() ) check_pos = true; string chrom_str = CHROM+"\n"; if (check_pos) { nlist++; list_positions.push_back(site_count); } else site_count++; out_line.resize(0); copy(chrom_str.begin(), chrom_str.end(), back_inserter(out_line)); tmp_int.resize(0); e->make_int(tmp_int, POS, 3); copy(tmp_int.begin(), tmp_int.end(), back_inserter(out_line)); char out_byte; indv_miss = 0; for (unsigned int ui=0; uiinclude_genotype[ui] == false) { out_line.push_back(0x22); continue; } if (e->get_indv_ploidy(ui) != 2) { out_line.push_back(0x22); LOG.one_off_warning("\tLD: Only using diploid individuals."); continue; } e->get_indv_GENOTYPE_ids(ui, geno); if (geno.first == -1) out_byte |= 0x02; else out_byte |= (char)geno.first; out_byte = out_byte << 4; if (geno.second == -1) out_byte |= 0x02; else out_byte |= (char)geno.second; out_line.push_back(out_byte); } if (check_pos) 
fd_POS.write(&out_line[0],out_line.size()); else fd.write(&out_line[0],out_line.size()); } fd.close(); fd_POS.close(); ifstream tmp_file(tmpname, ios::binary); ifstream tmp_file2(tmpname2, ios::binary); vector > GTs, GTs2; streampos file_pos = 0; GTs.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); GTs2.resize(meta_data.N_indv-indv_miss, make_pair(-1,-1)); for(unsigned int ui=0; ui bp_window_size) continue; int list_pos = list_positions[ui]; if ( abs(list_pos - uj) < snp_window_min) continue; if ( abs(list_pos - uj) > snp_window_size) continue; calc_geno_r2(GTs, GTs2, r2, indv_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << CHROM << "\t" << POS << "\t" << CHROM2 << "\t" << POS2 << "\t" << indv_count << "\t" << r2 << endl; } } tmp_file.close(); tmp_file2.close(); remove(tmpname); remove(tmpname2); delete e; } void variant_file::output_singletons(const parameters ¶ms) { // Locate and output singletons (and private doubletons) if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Singletons."); LOG.printLOG("Outputting Singleton Locations\n"); string output_file = params.output_prefix + ".singletons"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Singleton output file: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl; int a; vector allele_counts; unsigned int N_non_missing_chr, N_alleles, ui; pair geno; string allele; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_genotype_entries(true); e->get_allele_counts(allele_counts, N_non_missing_chr); N_alleles = 
e->get_N_alleles(); for (a=0; a<(signed)N_alleles; a++) { if (allele_counts[a] == 1) { // Singleton for (ui=0; uiget_indv_GENOTYPE_ids(ui, geno); if ((geno.first == a) || (geno.second == a)) { e->get_allele(a, allele); out << e->get_CHROM() << "\t" << e->get_POS() << "\tS\t" << allele << "\t" << meta_data.indv[ui] << endl; ui=meta_data.N_indv; break; } } } else if (allele_counts[a] == 2) { // Possible doubleton for (ui=0; uiget_indv_GENOTYPE_ids(ui, geno); if ((geno.first == a) && (geno.second == a)) { e->get_allele(a, allele); out << e->get_CHROM() << "\t" << e->get_POS() << "\tD\t" << allele << "\t" << meta_data.indv[ui] << endl; ui=meta_data.N_indv; break; } } } } } delete e; } void variant_file::output_genotype_depth(const parameters ¶ms) { // Output genotype depth in tab-delimited format. if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Genotype Depth Statistics."); LOG.printLOG("Outputting Depth for Each Genotype\n"); string output_file = params.output_prefix + ".gdepth"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Genotype Depth Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS(); for (unsigned int ui=0; uiinclude_genotype[ui] == true) { e->parse_genotype_entry(ui, false, false, true); out << "\t" << e->get_indv_DEPTH(ui); } else out << "\t-1"; } out << endl; } delete e; } void variant_file::output_FILTER_summary(const parameters ¶ms) { // Output a summary of sites in various FILTER categories. 
LOG.printLOG("Outputting Filter Summary (for bi-allelic loci only)\n"); map model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; string FILTER; vector variant_line; entry *e = get_entry_object(); map > FILTER_to_TsTv; map FILTER_to_Nsites; map::iterator FILTER_to_Nsites_it; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true); string model = e->get_REF() + e->get_ALT_allele(0); sort(model.begin(), model.end()); FILTER = e->get_FILTER(); FILTER_to_Nsites[FILTER]++; if (model_to_idx.find(model) != model_to_idx.end()) { switch (model_to_idx[model]) { case 1: case 4: FILTER_to_TsTv[FILTER].first++; break; case 0: case 2: case 3: case 5: FILTER_to_TsTv[FILTER].second++; break; default: // Don't count this snp towards Ts/Tv break; } } } vector > count_to_FILTER; for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it ) { FILTER = (*FILTER_to_Nsites_it).first; int Nsites = (*FILTER_to_Nsites_it).second; count_to_FILTER.push_back(make_pair(Nsites, FILTER)); } sort(count_to_FILTER.begin(), count_to_FILTER.end()); string output_file = params.output_prefix + ".FILTER.summary"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Filter Summary Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "FILTER\tN_VARIANTS\tN_Ts\tN_Tv\tTs/Tv" << endl; for (int i=count_to_FILTER.size()-1; i > -1; i--) { FILTER = count_to_FILTER[i].second; int Ts = FILTER_to_TsTv[FILTER].first; int Tv = FILTER_to_TsTv[FILTER].second; int Nsites = FILTER_to_Nsites[FILTER]; out << FILTER << "\t" << Nsites << "\t"; out << Ts << "\t" << Tv << 
"\t" << double(Ts)/Tv << endl; } delete e; } void variant_file::output_TsTv(const parameters ¶ms) { // Output Ts/Tv ratios in bins of a given size. int bin_size = params.output_TsTv_bin_size; LOG.printLOG("Outputting Ts/Tv in bins of " + header::int2str(bin_size) + "bp\n"); map model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; map max_pos; string CHROM; vector variant_line; entry *e = get_entry_object(); map > Ts_counts; map > Tv_counts; vector chrs; string prev_chr = ""; vector model_counts(6,0); double C = 1.0 / double(bin_size); unsigned int idx; string model; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; model = e->get_REF() + e->get_ALT_allele(0); sort(model.begin(), model.end()); CHROM = e->get_CHROM(); idx = (unsigned int)(e->get_POS() * C); if(idx>=Ts_counts[CHROM].size()) Ts_counts[CHROM].resize(idx+1,0); if(idx>=Tv_counts[CHROM].size()) Tv_counts[CHROM].resize(idx+1,0); if(CHROM != prev_chr) { chrs.push_back(CHROM); prev_chr = CHROM; } if (model_to_idx.find(model) != model_to_idx.end()) { model_counts[model_to_idx[model]]++; switch (model_to_idx[model]) { case 1: case 4: Ts_counts[CHROM][idx]++; break; case 0: case 2: case 3: case 5: Tv_counts[CHROM][idx]++; break; default: LOG.error("Unknown idx\n"); break; } } else LOG.warning("Unknown model type. Not a SNP? 
" + CHROM + ":" + header::int2str(e->get_POS()) +"\n"); } string output_file = params.output_prefix + ".TsTv"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open TsTv Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tBinStart\tSNP_count\tTs/Tv" << endl; double ratio; for(unsigned int ui=0; ui model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; vector variant_line; entry *e = get_entry_object(); vector model_counts(6,0); string model; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; model = e->get_REF() + e->get_ALT_allele(0); sort(model.begin(), model.end()); if (model_to_idx.find(model) != model_to_idx.end()) model_counts[model_to_idx[model]]++; else LOG.warning("Unknown model type. Not a SNP? 
" + e->get_CHROM() + ":" + header::int2str(e->get_POS()) +"\n"); } string output_file = params.output_prefix + ".TsTv.summary"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open TsTv Summary Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "MODEL\tCOUNT" << endl; out << "AC\t" << model_counts[0] << endl; out << "AG\t" << model_counts[1] << endl; out << "AT\t" << model_counts[2] << endl; out << "CG\t" << model_counts[3] << endl; out << "CT\t" << model_counts[4] << endl; out << "GT\t" << model_counts[5] << endl; unsigned int Ts = model_counts[1] + model_counts[4]; unsigned int Tv = model_counts[0] + model_counts[2] + model_counts[3] + model_counts[5]; out << "Ts\t" << Ts << endl; out << "Tv\t" << Tv << endl; LOG.printLOG("Ts/Tv ratio: " + output_log::dbl2str(double(Ts)/Tv, 4) + "\n"); delete e; } void variant_file::output_TsTv_by_count(const parameters ¶ms) { // Output Ts/Tv ratios in bins of a given size. 
LOG.printLOG("Outputting Ts/Tv by Alternative Allele Count\n"); vector Ts_counts, Tv_counts; unsigned int N_kept_indv = N_kept_individuals(); Ts_counts.resize(2*N_kept_indv); Tv_counts.resize(2*N_kept_indv); string model; vector variant_line; entry *e = get_entry_object(); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; vector allele_counts; unsigned int allele_count; unsigned int N_included_indv; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; e->parse_genotype_entries(true); e->get_allele_counts(allele_counts, N_included_indv); allele_count = allele_counts[1]; model = e->get_REF() + e->get_ALT_allele(0); if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end()) { idx = model_to_Ts_or_Tv[model]; if (idx == 0) // Ts Ts_counts[allele_count]++; else if (idx == 1) // Tv; Tv_counts[allele_count]++; else LOG.error("Unknown model type\n"); } else LOG.warning("Unknown model type. Not a SNP? 
" + e->get_CHROM() + ":" + output_log::int2str(e->get_POS()) +"\n"); } string output_file = params.output_prefix + ".TsTv.count"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open TsTv by Count Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); double ratio; out << "ALT_ALLELE_COUNT\tN_Ts\tN_Tv\tTs/Tv" << endl; for (unsigned int ui=0; ui<2*N_kept_indv; ui++) { ratio = double(Ts_counts[ui]) / Tv_counts[ui]; out << ui << "\t" << Ts_counts[ui] << "\t" << Tv_counts[ui] << "\t" << ratio << endl; } delete e; } void variant_file::output_TsTv_by_quality(const parameters ¶ms) { // Output Ts/Tv ratios in bins of a given size. LOG.printLOG("Outputting Ts/Tv By Quality\n"); map > TsTv_counts; double max_qual = -numeric_limits::max(), min_qual=numeric_limits::max(); string model; vector variant_line; entry *e = get_entry_object(); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; double QUAL; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; QUAL = e->get_QUAL(); if (QUAL > max_qual) max_qual = QUAL; if (QUAL < min_qual) min_qual = QUAL; model = e->get_REF() + e->get_ALT_allele(0);; if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end()) { idx = model_to_Ts_or_Tv[model]; if (idx == 0) // Ts TsTv_counts[QUAL].first++; else if (idx == 1) // Tv; TsTv_counts[QUAL].second++; else 
LOG.error("Unknown model type\n"); } else LOG.warning("Unknown model type. Not a SNP? " + e->get_CHROM() + ":" + output_log::int2str(e->get_POS()) +"\n"); } string output_file = params.output_prefix + ".TsTv.qual"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open TsTv by Quality Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "QUAL_THRESHOLD"; out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD"; out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl; unsigned int N_TsTv = TsTv_counts.size(); vector Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0); vector QUAL_vector(N_TsTv+1, 0.0); QUAL_vector[0] = min_qual; QUAL_vector[N_TsTv] = max_qual; idx = 1; for (map >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts; Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv; QUAL_vector[idx-1] = QUAL; idx++; } QUAL_vector[N_TsTv] = max_qual; vector Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0); idx = N_TsTv; for (map >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts; Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv; idx--; } double Ts_sum, Tv_sum, ratio; for (unsigned int ui=1; ui<(N_TsTv+1); ui++) { QUAL = QUAL_vector[ui-1]; out << QUAL; Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; out << endl; } delete e; } 
void variant_file::output_site_quality(const parameters ¶ms) { // Output per-site quality information. LOG.printLOG("Outputting Quality for Each Site\n"); string output_file = params.output_prefix + ".lqual"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open TsTv by Count Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\tQUAL" << endl; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e->get_QUAL() << endl; } delete e; } void variant_file::output_site_depth(const parameters ¶ms, bool output_mean) { // Output per-site depth information if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Site Depth Statistics."); LOG.printLOG("Outputting Depth for Each Site\n"); string output_file = params.output_prefix + ".ldepth"; if (output_mean) output_file += ".mean"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Site Depth Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\t"; if (output_mean) out << "MEAN_DEPTH\tVAR_DEPTH" << endl; else out << "SUM_DEPTH\tSUMSQ_DEPTH" << endl; int depth; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS() << "\t"; unsigned int sum=0; unsigned int 
sumsq=0; unsigned int n=0; for (unsigned int ui=0; uiinclude_genotype[ui] == false) continue; e->parse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { sum += depth; sumsq += (depth*depth); n++; } } if (output_mean) { double mean = double(sum) / n; double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1); out << mean << "\t" << var << endl; } else out << sum << "\t" << sumsq << endl; } delete e; } void variant_file::output_weir_and_cockerham_fst(const parameters ¶ms) { // Implements the bi-allelic version of Weir and Cockerham's Fst if (params.weir_fst_populations.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Weir and Cockerham Fst estimates.\n"); // First, read in the relevant files. vector< vector > indvs_in_pops; unsigned int N_pops = params.weir_fst_populations.size(); indvs_in_pops.resize(N_pops, vector(meta_data.N_indv, false)); vector all_indv(meta_data.N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } string output_file = params.output_prefix + ".weir.fst"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Fst Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\tWEIR_AND_COCKERHAM_FST" << endl; entry *e = get_entry_object(); vector variant_line; double sum1=0.0, sum2 = 0.0; double sum3=0.0, count = 0.0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); 
if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_full_entry(true); e->parse_genotype_entries(true); unsigned int N_alleles = e->get_N_alleles(); if (e->is_diploid() == false) { LOG.one_off_warning("\tFst: Only using diploid sites."); continue; } vector N_hom, N_het; vector n(N_pops, 0.0); vector > p(N_pops, vector(N_alleles,0.0)); double nbar = 0.0; vector pbar(N_alleles, 0.0); vector hbar(N_alleles, 0.0); vector ssqr(N_alleles, 0.0); double sum_nsqr = 0.0; double n_sum = 0.0; for (unsigned int i=0; iget_multiple_genotype_counts(indvs_in_pops[i], e->include_genotype, N_hom, N_het); for (unsigned int j=0; j snp_Fst(N_alleles, 0.0); vector a(N_alleles, 0.0); vector b(N_alleles, 0.0); vector c(N_alleles, 0.0); double r = double(N_pops); double sum_a = 0.0; double sum_all = 0.0; for(unsigned int j=0; jget_CHROM() << "\t" << e->get_POS() << "\t" << fst << endl; } double weighted_Fst = sum1 / sum2; double mean_Fst = sum3 / count; LOG.printLOG("Weir and Cockerham mean Fst estimate: " + output_log::dbl2str(mean_Fst, 5) + "\n"); LOG.printLOG("Weir and Cockerham weighted Fst estimate: " + output_log::dbl2str(weighted_Fst, 5) + "\n"); delete e; } void variant_file::output_windowed_weir_and_cockerham_fst(const parameters ¶ms) { int fst_window_size = params.fst_window_size; int fst_window_step = params.fst_window_step; vector indv_files = params.weir_fst_populations; if ((fst_window_step <= 0) || (fst_window_step > fst_window_size)) fst_window_step = fst_window_size; if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Windowed Weir and Cockerham Fst estimates.\n"); // First, read in the relevant files. 
vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(meta_data.N_indv, false)); vector all_indv(meta_data.N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } string CHROM; string last_chr = ""; vector chrs; vector variant_line; entry *e = get_entry_object(); // Calculate number of bins for each chromosome and allocate memory for them. // Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele const vector< double > empty_vector(4, 0); // sum1, sum2, sum3, count map > > bins; double sum1=0.0, sum2 = 0.0; double sum3=0.0, count = 0.0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_full_entry(true); e->parse_genotype_entries(true); unsigned int N_alleles = e->get_N_alleles(); if (e->is_diploid() == false) { LOG.one_off_warning("\tFst: Only using diploid sites."); continue; } vector N_hom, N_het; vector n(N_pops, 0.0); vector > p(N_pops, vector(N_alleles,0.0)); double nbar = 0.0; vector pbar(N_alleles, 0.0); vector hbar(N_alleles, 0.0); vector ssqr(N_alleles, 0.0); double sum_nsqr = 0.0; double n_sum = 0.0; for (unsigned int i=0; iget_multiple_genotype_counts(indvs_in_pops[i], e->include_genotype, N_hom, N_het); for (unsigned int j=0; j snp_Fst(N_alleles, 0.0); vector a(N_alleles, 0.0); vector 
b(N_alleles, 0.0); vector c(N_alleles, 0.0); double r = double(N_pops); double sum_a = 0.0; double sum_all = 0.0; for(unsigned int j=0; jget_POS(); CHROM = e->get_CHROM(); if (CHROM != last_chr) { chrs.push_back(CHROM); last_chr = CHROM; } int first = (int) ceil((pos - fst_window_size)/double(fst_window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(fst_window_step)); for(int idx = first; idx < last; idx++) { if (idx >= (int)bins[CHROM].size()) bins[CHROM].resize(idx+1, empty_vector); bins[CHROM][idx][0] += sum_a; bins[CHROM][idx][1] += sum_all; bins[CHROM][idx][2] += fst; bins[CHROM][idx][3]++; } sum1 += sum_a; sum2 += sum_all; sum3 += fst; count++; } } double weighted_Fst = sum1 / sum2; double mean_Fst = sum3 / count; LOG.printLOG("Weir and Cockerham mean Fst estimate: " + output_log::dbl2str(mean_Fst, 5) + "\n"); LOG.printLOG("Weir and Cockerham weighted Fst estimate: " + output_log::dbl2str(weighted_Fst, 5) + "\n"); string output_file = params.output_prefix + ".windowed.weir.fst"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Fst Output file: " + output_file, 7); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tWEIGHTED_FST\tMEAN_FST" << endl; for (unsigned int ui=0; ui 0)) { double weighted_Fst = bins[CHROM][s][0] / bins[CHROM][s][1]; double mean_Fst = bins[CHROM][s][2] / bins[CHROM][s][3]; out << CHROM << "\t" << s*fst_window_step + 1 << "\t" << (s*fst_window_step + fst_window_size) << "\t" << bins[CHROM][s][3] << "\t" << weighted_Fst << "\t" << mean_Fst << endl; } } } delete e; } void variant_file::output_per_site_nucleotide_diversity(const parameters ¶ms) { // Output nucleotide diversity, calculated on a per-site basis. 
// Pi = average number of pairwise differences // Assumes a constant distance of 1 between all possible mutations if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics."); LOG.printLOG("Outputting Per-Site Nucleotide Diversity Statistics...\n"); string output_file = params.output_prefix + ".sites.pi"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Nucleotide Diversity Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS\tPI" << endl; vector variant_line; entry *e = get_entry_object(); vector allele_counts; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_full_entry(true); e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tsitePi: Only using fully diploid sites."); continue; } unsigned int N_non_missing_chr; e->get_allele_counts(allele_counts, N_non_missing_chr); unsigned int total_alleles = std::accumulate(allele_counts.begin(), allele_counts.end(), 0); unsigned int N_alleles = e->get_N_alleles(); int mismatches = 0; for(unsigned int allele = 0; allele < N_alleles; allele++) { int other_alleles_count = (total_alleles - allele_counts[allele]); mismatches += (allele_counts[allele] * other_alleles_count); } int pairs = (total_alleles * (total_alleles - 1)); double pi = (mismatches/static_cast(pairs)); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << pi << endl; } delete e; } //Output Tajima's D //Carlson et al. 
Genome Res (2005) void variant_file::output_Tajima_D(const parameters ¶ms) { int window_size = params.output_Tajima_D_bin_size; if (window_size <= 0) return; if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Tajima's D Statistic."); LOG.printLOG("Outputting Tajima's D Statistic...\n"); string output_file = params.output_prefix + ".Tajima.D"; double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2; unsigned int n = N_kept_individuals()*2; if (n < 2) LOG.error("Require at least two chromosomes!"); for (unsigned int ui=1; ui variant_line; entry *e = get_entry_object(); map > > bins; unsigned int idx; double C = 1.0 / double(window_size); vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; string prev_chr = ""; vector chrs; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) { LOG.one_off_warning("\tTajimaD: Only using bialleleic sites."); continue; } CHROM = e->get_CHROM(); idx = (unsigned int)(e->get_POS() * C); e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tTajimaD: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr); double p = double(allele_counts[0]) / N_non_missing_chr; if(idx>=bins[CHROM].size()) bins[CHROM].resize(idx+1, make_pair(0,0)); if(CHROM != prev_chr) { chrs.push_back(CHROM); prev_chr = CHROM; } if ((p > 0.0) && (p < 1.0)) { bins[CHROM][idx].first++; bins[CHROM][idx].second += p * (1.0-p); } } streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Tajima D Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << 
"CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl; for (unsigned int ui=0; ui::quiet_NaN(); if (S > 0) { double pi = 2.0*bins[CHROM][s].second*n/double(n-1); double tw = double(S) / a1; double var = (e1*S) + e2*S*(S-1); D = (pi - tw) / sqrt(var); output = true; } if (output == true) out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl; } } delete e; } void variant_file::output_windowed_nucleotide_diversity(const parameters ¶ms) { // Output nucleotide diversity, as calculated in windows. // Average number of pairwise differences in windows. int window_size = params.pi_window_size; int window_step = params.pi_window_step; if (window_size <= 0) return; if ((window_step <= 0) || (window_step > window_size)) window_step = window_size; if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics."); LOG.printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n"); string output_file = params.output_prefix + ".windowed.pi"; string CHROM; vector variant_line; entry *e = get_entry_object(); // Calculate number of bins for each chromosome and allocate memory for them. 
// Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele const unsigned int N_variant_sites = 0; const unsigned int N_variant_site_pairs = 1; const unsigned int N_mismatches = 2; const unsigned int N_polymorphic_sites = 3; const vector< unsigned long > empty_vector(4, 0); map > > bins; vector chrs; string prev_chr; // Count polymorphic sites and pairwise mismatches vector allele_counts; unsigned int N_non_missing_chr; unsigned long N_comparisons; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_genotype_entries(true); CHROM = e->get_CHROM(); if (e->is_diploid() == false) { LOG.one_off_warning("\twindowPi: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr); unsigned int N_site_mismatches = 0; for (vector::iterator ac = allele_counts.begin(); ac != allele_counts.end(); ++ac) { N_site_mismatches += (*ac * (N_non_missing_chr - *ac)); } if (N_site_mismatches == 0) continue; // Site is actually fixed. 
// Place the counts into bins int pos = (int)e->get_POS(); int first = (int) ceil((pos - window_size)/double(window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(window_step)); N_comparisons = N_non_missing_chr * (N_non_missing_chr - 1); if(CHROM != prev_chr) { chrs.push_back(CHROM); prev_chr = CHROM; bins[CHROM].resize(1,empty_vector); } if(last>= (int)bins[CHROM].size()) bins[CHROM].resize(last+1,empty_vector); for(int idx = first; idx < last; idx++) { bins[CHROM][idx][N_variant_sites]++; bins[CHROM][idx][N_variant_site_pairs] += N_comparisons; bins[CHROM][idx][N_mismatches] += N_site_mismatches; if(allele_counts[0] < (signed)N_non_missing_chr) bins[CHROM][idx][N_polymorphic_sites]++; } } // Calculate and print nucleotide diversity statistics streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Windowed Nucleotide Diversity Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tPI" << endl; unsigned long N_monomorphic_sites = 0; int N_kept_chr = 2*N_kept_individuals(); N_comparisons = (N_kept_chr * (N_kept_chr - 1)); // Number of pairwise comparisons at a monomorphic site unsigned long N_pairs = 0; // Number of pairwise comparisons within a window double pi = 0; for (unsigned int ui=0; ui 0) || (bins[CHROM][s][N_mismatches] > 0) ) { // This number can be slightly off for the last bin since the // window size can go off the end of the chromosome. N_monomorphic_sites = window_size - bins[CHROM][s][N_variant_sites]; // The total number of possible pairwise comparisons is the sum of // pairwise comparisons at polymorphic sites and pairwise // comparisons at monomorphic sites. 
N_pairs = bins[CHROM][s][N_variant_site_pairs] + (N_monomorphic_sites * N_comparisons); pi = bins[CHROM][s][N_mismatches] / double(N_pairs); out << CHROM << "\t" << s*window_step + 1 << "\t" << (s*window_step + window_size) << "\t" << bins[CHROM][s][N_polymorphic_sites] << "\t" << pi << endl; } } } delete e; } void variant_file::output_kept_sites(const parameters ¶ms) { // Output lists of sites that have been filtered (or not). LOG.printLOG("Outputting Kept Sites...\n"); string output_file = params.output_prefix + ".kept.sites"; string CHROM; vector variant_line; int POS; entry *e = get_entry_object(); streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Kept Site Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS" << endl; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); POS = e->get_POS(); CHROM = e->get_CHROM(); out << CHROM << "\t" << POS << endl; } delete e; } void variant_file::output_removed_sites(const parameters ¶ms) { // Output lists of sites that have been filtered (or not). 
LOG.printLOG("Outputting Removed Sites...\n"); string output_file = params.output_prefix + ".removed.sites"; string CHROM; vector variant_line; int POS; entry *e = get_entry_object(); streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Removed Site Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tPOS" << endl; while (!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(); POS = e->get_POS(); CHROM = e->get_CHROM(); if(!eof()) out << CHROM << "\t" << POS << endl; } delete e; } void variant_file::output_LROH(const parameters ¶ms) { // Detect and output Long Runs of Homozygosity, following the method // developed by Adam Boyko, and described in Auton et al., Genome Research, 2009 // (Although using Forward-backwards algorithm in place of Viterbi). if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LROH."); LOG.printLOG("Outputting Long Runs of Homozygosity (Experimental)... 
\n"); string output_file = params.output_prefix + ".LROH"; unsigned int nGen=4; // Number of generations since common ancestry double genotype_error_rate = 0.01; // Assumed genotype error rate double p_auto_prior = 0.05; // Prior probability of being in autozygous state double p_auto_threshold = 0.99; // Threshold for reporting autozygous region int min_SNPs=0; // Threshold for reporting autozygous region string CHROM; vector variant_line; int POS; entry *e = get_entry_object(); pair alleles; vector< vector< int > > s_vector; vector< vector > > p_emission; vector< vector< vector > > p_trans; vector last_POS; vector > is_het; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open LROH Output file: " + output_file, 12); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "CHROM\tAUTO_START\tAUTO_END\tMIN_START\tMAX_END\tN_VARIANTS_BETWEEN_MAX_BOUNDARIES\tN_MISMATCHES\tINDV" << endl; s_vector.resize(meta_data.N_indv); p_emission.resize(meta_data.N_indv); p_trans.resize(meta_data.N_indv); last_POS.resize(meta_data.N_indv,-1); is_het.resize(meta_data.N_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); //if (e->get_N_alleles() != 2) //{ // LOG.one_off_warning("\tLROH: Only using bialleleic sites."); // continue; // TODO: Probably could do without this... 
//} CHROM = e->get_CHROM(); POS = e->get_POS(); double r = 0; unsigned int N_genotypes = 0; unsigned int N_hets = 0; vector indv_alleles(meta_data.N_indv, -1); bool has_non_ref = false; for (unsigned int ui=0; uiinclude_genotype[ui] == false)) continue; e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (e->get_indv_ploidy(ui) != 2) { LOG.one_off_warning("\tLROH: Only using diploid sites."); continue; } if ((alleles.first < 0) || (alleles.second < 0)) continue; // Skip missing genotypes if ((alleles.first > 0) || (alleles.second > 0)) has_non_ref = true; N_genotypes++; bool is_het = (alleles.first != alleles.second); if (is_het == true) N_hets++; indv_alleles[ui] = (int)is_het; } if (has_non_ref == false) continue; double h = N_hets / double(N_genotypes); // Heterozygosity for (unsigned int ui=0; uiinclude_genotype[ui] == false)) continue; double p_emission_given_nonauto; double p_emission_given_auto; if (indv_alleles[ui] < 0) continue; else if (indv_alleles[ui] == 1) { // Heterozygote p_emission_given_nonauto = h; p_emission_given_auto = genotype_error_rate; p_emission[ui].push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto)); is_het[ui].push_back(true); } else { // Homozygote p_emission_given_nonauto = 1.0-h; p_emission_given_auto = 1.0-genotype_error_rate; p_emission[ui].push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto)); is_het[ui].push_back(false); } if (last_POS[ui] > 0) { // Assume 1cM/Mb. 
r = (POS - last_POS[ui]) / 1000000.0 / 100.0; // Morgans } double e = (1.0 - exp(-2.0*nGen*r)); double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1] double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2] double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0] double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3] vector A(4); A[0] = p_trans_auto_to_auto; A[1] = p_trans_auto_to_nonauto; A[2] = p_trans_nonauto_to_auto; A[3] = p_trans_nonauto_to_nonauto; p_trans[ui].push_back(A); s_vector[ui].push_back(POS); last_POS[ui] = POS; } } delete e; for (unsigned int ui=0; ui > alpha(N_obs, vector(2,0)); vector > beta(N_obs, vector(2,0)); alpha[0][0] = p_emission[ui][0].first; alpha[0][1] = p_emission[ui][0].second; for (int i=1; i=0; i--) { beta[i][0] = beta[i+1][0] * p_trans[ui][i][0] * p_emission[ui][i].first; beta[i][0] += beta[i+1][1] * p_trans[ui][i][2] * p_emission[ui][i].first; beta[i][1] = beta[i+1][1] * p_trans[ui][i][3] * p_emission[ui][i].second; beta[i][1] += beta[i+1][0] * p_trans[ui][i][1] * p_emission[ui][i].second; while (beta[i][0] + beta[i][1] < 1e-20) { // Renormalise to prevent underflow beta[i][0] *= 1e20; beta[i][1] *= 1e20; } } // Calculate probability of each site being autozygous vector p_auto(N_obs); for (int i=0; i p_auto_threshold) { if (in_auto == false) { // Start of autozygous region start_pos = s_vector[ui][i]; } N_SNPs++; N_SNPs_between_hets++; if (is_het[ui][i] == true) N_hets_in_region++; in_auto = true; } else { if (in_auto == true) { // end of autozygous region // Find next_het position next_het_pos = s_vector[ui][N_obs-1]; for (int j=i; j= min_SNPs) { out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << (last_het_pos+1) << "\t" << (next_het_pos-1) << "\t" << N_SNPs_between_hets << "\t" << N_hets_in_region << "\t" << meta_data.indv[ui] << endl; } } in_auto = false; N_SNPs = 0; N_hets_in_region = 0; if (is_het[ui][i] == true) { last_het_pos = s_vector[ui][i]; N_SNPs_between_hets = 
0; } } } if (in_auto == true) { // Report final region if needed end_pos = s_vector[ui][N_obs-1]; next_het_pos = s_vector[ui][N_obs-1]; if (N_SNPs >= min_SNPs) out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << (last_het_pos+1) << "\t" << next_het_pos << "\t" << N_SNPs_between_hets << "\t" << N_hets_in_region << "\t" << meta_data.indv[ui] << endl; } } } void variant_file::output_indv_relatedness_Manichaikul(const parameters ¶ms) { // Calculate and output a relatedness statistic based on the method of // Manichaikul et al., BIOINFORMATICS 2010 // doi:10.1093/bioinformatics/btq559 if ((meta_data.has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Individual Relatedness."); LOG.printLOG("Outputting Individual Relatedness\n"); string output_file = params.output_prefix + ".relatedness2"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Individual Relatedness Output file: " + output_file, 2); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); out << "INDV1\tINDV2\tN_AaAa\tN_AAaa\tN1_Aa\tN2_Aa\tRELATEDNESS_PHI" << endl; vector variant_line; entry *e = get_entry_object(); vector allele_counts; unsigned int N_alleles; pair geno_id; pair geno_id2; vector > phi(meta_data.N_indv, vector(meta_data.N_indv, 0.0)); vector > N_AaAa(meta_data.N_indv, vector(meta_data.N_indv, 0.0)); vector > N_AAaa(meta_data.N_indv, vector(meta_data.N_indv, 0.0)); vector N_Aa(meta_data.N_indv, 0.0); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) { LOG.one_off_warning("\tRelatedness: Only using biallelic sites."); continue; // Only use biallelic loci } e->parse_genotype_entries(true); if (e->is_diploid() == false) { 
LOG.one_off_warning("\tRelatedness: Only using fully diploid sites."); continue; } for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); if ((geno_id.first != geno_id.second) && (geno_id.first >= 0) && (geno_id.second >= 0)) { N_Aa[ui]++; } for (unsigned int uj=0; ujget_indv_GENOTYPE_ids(uj, geno_id2); if ((geno_id.first != geno_id.second) && (geno_id.first >= 0) && (geno_id.second >= 0)) { if ((geno_id2.first != geno_id2.second) && (geno_id2.first >= 0) && (geno_id2.second >= 0)) { N_AaAa[ui][uj]++; } } if ((geno_id.first == geno_id.second) && (geno_id.first >= 0) && (geno_id.second >= 0)) { if ((geno_id2.first == geno_id2.second) && (geno_id2.first >= 0) && (geno_id2.second >= 0)) { if (geno_id.first != geno_id2.first) { N_AAaa[ui][uj]++; } } } } } } for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); vector allele_counts; unsigned int N_alleles, N_non_missing_chr; double freq; pair geno_id; vector > Ajk(meta_data.N_indv, vector(meta_data.N_indv, 0.0)); vector > N_sites(meta_data.N_indv, vector(meta_data.N_indv, 0.0)); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) { LOG.one_off_warning("\tRelatedness: Only using biallelic sites."); continue; // Only use biallelic loci } e->parse_genotype_entries(true); if (e->is_diploid() == false) { LOG.one_off_warning("\tRelatedness: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; vector x(meta_data.N_indv, -1.0); for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x[ui] = geno_id.first + geno_id.second; } double div = 1.0/(2.0*freq*(1.0-freq)); for (unsigned int ui=0; 
uiinclude_genotype[ui] == false) || (x[ui] < 0)) continue; Ajk[ui][ui] += (x[ui]*x[ui] - (1 + 2.0*freq)*x[ui] + 2.0*freq*freq) * div; N_sites[ui][ui]++; for (unsigned int uj=(ui+1); ujinclude_genotype[uj] == false) || (x[uj] < 0)) continue; Ajk[ui][uj] += (x[ui] - 2.0*freq) * (x[uj] - 2.0*freq) * div; N_sites[ui][uj]++; } } } for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(); pair geno_id; double x, freq; vector allele_counts; unsigned int N_alleles, N_non_missing_chr; // Store list of included individuals vector included_indvs(N_indvs); unsigned int ui_prime = 0; for (unsigned int ui=0; ui > M(N_indvs); // Populate M unsigned int s_prime = 0; unsigned int N_sites = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) LOG.error("PCA only works for biallelic sites."); e->parse_genotype_entries(true); if (e->is_diploid() == false) LOG.error("PCA only works for fully diploid sites. Non-diploid site at " + e->get_CHROM() + ":" + output_log::int2str(e->get_POS())); e->get_allele_counts(allele_counts, N_non_missing_chr); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; double mu = freq*2.0; double div = 1.0 / sqrt(freq * (1.0-freq)); ui_prime = 0; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x = geno_id.first + geno_id.second; if (x > -1) { if (use_normalisation == true) M[ui_prime].push_back((x - mu) * div); else M[ui_prime].push_back((x - mu)); } ui_prime++; } s_prime++; N_sites++; } if (N_indvs >= N_sites) LOG.error("PCA computation requires that there are more sites than individuals."); // Now construct X = (1/n)MM'. 
double **X = new double *[N_indvs]; for (unsigned int ui=0; ui included_indvs(N_indvs); unsigned int ui_prime = 0; for (unsigned int ui=0; ui > M(N_indvs); vector< vector > ids(N_indvs); map idx_to_chrom; map chrom_to_idx; string chr; int pos; int chrom_idx = 0; vector CHROMidx_list; vector pos_list; vector variant_line; entry *e = get_entry_object(); pair geno_id; double x, freq; vector allele_counts; unsigned int N_alleles, N_non_missing_chr; ui_prime = 0; unsigned int s_prime = 0; unsigned int N_sites = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) LOG.error("PCA only works for biallelic sites."); e->parse_genotype_entries(true); if (e->is_diploid() == false) LOG.error("PCA only works for fully diploid sites. Non-diploid site at " + e->get_CHROM() + ":" + output_log::int2str(e->get_POS())); e->get_allele_counts(allele_counts, N_non_missing_chr); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; double mu = freq*2.0; double div = 1.0 / sqrt(freq * (1.0-freq)); chr = e->get_CHROM(); pos = e->get_POS(); if (chrom_to_idx.find(chr) == chrom_to_idx.end()) { chrom_to_idx[chr] = chrom_idx; idx_to_chrom[chrom_idx] = chr; chrom_idx++; } CHROMidx_list.push_back(chrom_to_idx[chr]); pos_list.push_back(pos); ui_prime = 0; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x = geno_id.first + geno_id.second; if (x > -1) { if (use_normalisation == true) M[ui_prime].push_back((x - mu) * div); else M[ui_prime].push_back((x - mu)); } ids[ui_prime].push_back(x); ui_prime++; } s_prime++; N_sites++; } if (N_indvs >= N_sites) LOG.error("PCA computation requires that there are more sites than individuals."); // Now construct X = (1/n)MM'. 
double **X = new double *[N_indvs]; for (unsigned int ui=0; ui gamma(SNP_loadings_N_PCs, 0.0); vector a_sum(SNP_loadings_N_PCs, 0.0); chr = idx_to_chrom[CHROMidx_list[ui]]; pos = pos_list[ui]; out << chr << "\t" << pos; for (unsigned int uj_prime=0; uj_prime -1) { for (unsigned int uk=0; uk<(unsigned int)SNP_loadings_N_PCs; uk++) { gamma[uk] += (x * Evecs[uj_prime][uk]); a_sum[uk] += (Evecs[uj_prime][uk]*Evecs[uj_prime][uk]); } } } for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++) out << "\t" << gamma[uj] / a_sum[uj]; out << endl; } delete e; delete [] Er; delete [] Ei; delete [] Evecs; delete [] X; #endif } void variant_file::output_indel_hist(const parameters ¶ms) { vector variant_line; entry *e = get_entry_object(); string allele; unsigned int ref_len, N_alleles; int indel_len, smallest_len, largest_len, snp_count; vector s_vector; string output_file = params.output_prefix + ".indel.hist"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Indel Histogram Output file: " + output_file, 2); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); LOG.printLOG("Outputting Indel Histogram\n"); out << "LENGTH\tCOUNT\tPRCT" << endl; largest_len = 0; smallest_len = 0; snp_count = 0; while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); allele = e->get_REF(); ref_len = allele.size(); N_alleles = e->get_N_alleles(); if (e->is_SNP() ) snp_count++; for (unsigned int ui=1; uiget_allele(ui, allele); if (allele.size() != ref_len) { if (allele.find_first_not_of("acgtACGT") == string::npos) { // Check all bases are ATCGatcg indel_len = allele.size() - ref_len; s_vector.push_back (indel_len); if (indel_len > largest_len) largest_len = indel_len; else if (indel_len < smallest_len) smallest_len = indel_len; } } } } 
double total = s_vector.size() + snp_count; double pct; for (int i=smallest_len; i<=largest_len; i++) { int icount = (int) count (s_vector.begin(), s_vector.end(), i); if (icount > 0) { pct = 100.0*icount/total; out << i << "\t" << icount << "\t" << pct << endl; } else if ((i == 0) and (snp_count>0)) { pct = 100.0*snp_count/total; out << i << "\t" << snp_count << "\t" << pct << endl; } } } void variant_file::output_mendel_inconsistencies(const parameters ¶ms) { LOG.printLOG("Outputting Mendel Errors.\n"); ifstream PED(params.mendel_ped_file.c_str()); if (!PED.is_open()) LOG.error("Could not open PED file: " + params.mendel_ped_file); string line; stringstream ss; string family, child, mother, father; vector child_idx, mother_idx, father_idx; vector family_ids; PED.ignore(numeric_limits::max(), '\n'); while (!PED.eof()) { getline(PED, line); if ((line[0] == '#') || (line.size() == 0)) continue; ss.clear(); ss.str(line); ss >> family >> child >> father >> mother; if ((child == "0") || (father == "0") || (mother == "0")) continue; int idx1 = -1, idx2 = -1, idx3 = -1; vector::iterator it = find(meta_data.indv.begin(), meta_data.indv.end(), child); if (it != meta_data.indv.end()) idx1 = distance(meta_data.indv.begin(), it); it = find(meta_data.indv.begin(), meta_data.indv.end(), mother); if (it != meta_data.indv.end()) idx2 = distance(meta_data.indv.begin(), it); it = find(meta_data.indv.begin(), meta_data.indv.end(), father); if (it != meta_data.indv.end()) idx3 = distance(meta_data.indv.begin(), it); if ((idx1 != -1) && (idx2 != -1) && (idx3 != -1)) { // Trio is in the VCF if (include_indv[idx1] == false) continue; if (include_indv[idx2] == false) continue; if (include_indv[idx3] == false) continue; child_idx.push_back(idx1); mother_idx.push_back(idx2); father_idx.push_back(idx3); family_ids.push_back(child + "_" + father + "_" + mother); } } PED.close(); LOG.printLOG("Found " + LOG.int2str(child_idx.size()) + " trios in the VCF file.\n"); if (child_idx.size() == 0) 
LOG.error("No PED individuals found in VCF.\n", 5); string output_file = params.output_prefix + ".mendel"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open Mendel Error output file: " + output_file, 4); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); pair child_alleles, child_alleles2; pair mother_alleles; pair father_alleles; string CHROM; int POS; string REF, ALT; out << "CHR\tPOS\tREF\tALT\tFAMILY\tCHILD\tFATHER\tMOTHER" << endl; vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); e->parse_genotype_entries(true); CHROM = e->get_CHROM(); POS = e->get_POS(); for (unsigned int trio=0; trioinclude_genotype[idx1] == false) || (e->include_genotype[idx2] == false) || (e->include_genotype[idx3] == false)) continue; e->get_indv_GENOTYPE_ids(idx1, child_alleles); e->get_indv_GENOTYPE_ids(idx2, mother_alleles); e->get_indv_GENOTYPE_ids(idx3, father_alleles); if ((child_alleles.first == -1) || (child_alleles.second == -1) || (mother_alleles.first == -1) || (mother_alleles.second == -1) || (father_alleles.first == -1) || (father_alleles.second == -1)) continue; // cout << CHROM << "\t" << POS << "\t" << REF << "\t" << ALT << "\t" << family_ids[trio] << "\t" << child_alleles.first << "/" << child_alleles.second; // cout << "\t" << father_alleles.first << "/" << father_alleles.second << "\t" << mother_alleles.first << "/" << mother_alleles.second << endl; set > possible_child_genotypes; possible_child_genotypes.insert(make_pair(mother_alleles.first, father_alleles.first)); possible_child_genotypes.insert(make_pair(mother_alleles.first, father_alleles.second)); possible_child_genotypes.insert(make_pair(mother_alleles.second, father_alleles.first)); 
possible_child_genotypes.insert(make_pair(mother_alleles.second, father_alleles.second)); child_alleles2 = make_pair(child_alleles.second, child_alleles.first); if ((possible_child_genotypes.find(child_alleles) == possible_child_genotypes.end()) && (possible_child_genotypes.find(child_alleles2) == possible_child_genotypes.end())) { // Mendel error! CHROM = e->get_CHROM(); POS = e->get_POS(); REF = e->get_REF(); ALT = e->get_ALT(); out << CHROM << "\t" << POS << "\t" << REF << "\t" << ALT << "\t" << family_ids[trio] << "\t" << child_alleles.first << "/" << child_alleles.second; out << "\t" << father_alleles.first << "/" << father_alleles.second << "\t" << mother_alleles.first << "/" << mother_alleles.second << endl; } } } delete e; } void variant_file::write_stats(const parameters ¶ms) { vector variant_line; entry *e = get_entry_object(); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true); } delete e; } vcftools-0.1.15/src/cpp/vcf_entry.cpp000066400000000000000000000442241307140004000175040ustar00rootroot00000000000000/* * vcf_entry.cpp * * Created on: Aug 19, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_entry.h" string vcf_entry::convert_line; vcf_entry::vcf_entry(header &meta_data, vector &include_individual) { N_indv = meta_data.N_indv; include_indv = include_individual; include_genotype = vector(N_indv, true); basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; passed_filters = true; parsed_FORMAT_binary = false; N_INFO_removed = 0; N_FORMAT_removed = 0; parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; FORMAT_positions.resize(N_indv); 
FORMAT_types.resize(N_indv); FORMAT_sizes.resize(N_indv); FORMAT_skip.resize(N_indv); FORMAT_keys.resize(N_indv); convert_line.clear(); data_stream.str(""); entry_header = meta_data; } vcf_entry::~vcf_entry() {} // Reset the VCF entry object with a new data line void vcf_entry::reset(const vector &data_line) { basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; parsed_FORMAT_binary = false; passed_filters = true; data_stream.clear(); line = data_line; convert_line.assign(line.begin(), line.end()); data_stream.str(convert_line); fill(parsed_GT.begin(), parsed_GT.end(), 0); fill(parsed_GQ.begin(), parsed_GQ.end(), 0); fill(parsed_DP.begin(), parsed_DP.end(), 0); fill(parsed_FT.begin(), parsed_FT.end(), 0); fill(include_genotype.begin(), include_genotype.end(), 1); N_INFO_removed = 0; N_FORMAT_removed = 0; FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear(); } // Tokenize the basic information in a VCF data line (at the tab level) void vcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO) { if(!basic_parsed) { getline(data_stream, CHROM, '\t'); getline(data_stream, ID, '\t'); POS = atoi(ID.c_str()); getline(data_stream, ID, '\t'); getline(data_stream, REF, '\t'); getline(data_stream, ALT_str, '\t'); getline(data_stream, QUAL_str, '\t'); getline(data_stream, FILTER_str, '\t'); getline(data_stream, INFO_str, '\t'); QUAL = header::str2double(QUAL_str); // Convert to uppercase for consistency // Note that VCF v4.1 allows mixtures of lower/upper case in REF and ALT. // However, the spec specifically states that tools using VCF are not required // to preserve the case. 
std::transform(REF.begin(), REF.end(), REF.begin(), ::toupper); std::transform(ALT_str.begin(), ALT_str.end(),ALT_str.begin(), ::toupper); } basic_parsed = true; if (parse_ALT && !parsed_ALT) set_ALT(ALT_str); if (parse_FILTER && !parsed_FILTER) set_FILTER(FILTER_str); if (parse_INFO && !parsed_INFO) set_INFO(INFO_str); } // Tokenize the genotype information (at the 'tab' level) in the VCF entry void vcf_entry::parse_full_entry(bool parse_FORMAT) { if (fully_parsed) return; if (basic_parsed == false) parse_basic_entry(); getline(data_stream, FORMAT_str, '\t'); if (parse_FORMAT) set_FORMAT(FORMAT_str); string tmpstr; tmpstr.reserve(64); GENOTYPE_str.resize(N_indv, tmpstr); for (unsigned int ui=0; ui tmp_vector; vector tmp_split; vector< vector > format_matrix(N_indv); unsigned int type, number, size, position=0; tmp_split.resize(FORMAT.size()); for (unsigned int ui=0; ui &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(); out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT(); out << '\t' << header::double2str(QUAL); out << '\t' << get_FILTER(); if (keep_all_INFO == false) out << '\t' << get_INFO(INFO_to_keep); else out << '\t' << INFO_str; pair genotype; string GFILTER_tmp; if (FORMAT.size() > 0) { char PHASE; out << '\t' << get_FORMAT(); for (unsigned int ui=0; ui &INFO_to_keep, bool keep_all_INFO) { if (fully_parsed == false) parse_full_entry(); if (parsed_FORMAT_binary == false) parse_FORMAT(); vector out_vector, tmp_vector; out_vector.resize(8*sizeof(int32_t)); int vector_pos = 2*sizeof(uint32_t); string tmp_string; int index; vector filter_vector; vector > tmp_info; tmp_string = get_CHROM(); if (tmp_string == "." 
or tmp_string == " " or tmp_string == "") LOG.error("CHROM value must be defined for all entries.",0); if (entry_header.CONTIG_reverse_map.find(tmp_string) == entry_header.CONTIG_reverse_map.end() ) LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.",0); int32_t chrom = (int32_t)entry_header.CONTIG_reverse_map[tmp_string]; memcpy(&out_vector[vector_pos], &chrom, sizeof(chrom)); vector_pos += sizeof(chrom); get_POS_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_rlen(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_QUAL_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_ID_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_ALLELES_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_FILTER_vector(filter_vector); if (filter_vector.empty()) make_typed_int_vector(tmp_vector, filter_vector); else { vector index_vector; for(unsigned int ui=0; ui max_depth)) include_genotype[ui] = false; } } } // Filter specific genotypes by quality void vcf_entry::filter_genotypes_by_quality(double min_genotype_quality) { if (fully_parsed == false) parse_full_entry(); if (GQ_idx != -1) { // Have quality info double quality; for (unsigned int ui=0; ui &filter_flags_to_remove, bool remove_all) { if (fully_parsed == false) parse_full_entry(); vector GFILTERs; if (FT_idx != -1) { // Have GFilter info for (unsigned int ui=0; ui &include_individual); ~vcf_entry(); static string convert_line; void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false); void parse_full_entry(bool parse_FORMAT=true); void 
parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_FORMAT(); void reset(const vector &data_line); void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out); void set_ALT(const string &in); void set_FILTER(const string &FILTER_str); void set_FORMAT(const string &in); void set_INFO(const string &INFO_str); void add_FORMAT_entry(const string &in, unsigned int pos); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const string &in); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in); void set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2); void set_indv_GENOTYPE_ids(unsigned int indv, const pair &in); void set_indv_PHASE(unsigned int indv, char in); void set_indv_GQUALITY(unsigned int indv, double in); void set_indv_GFILTER(unsigned int indv, const string &in); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false); void filter_genotypes_by_depth(int min_depth, int max_depth); void filter_genotypes_by_quality(double min_genotype_quality); void filter_genotypes_by_filter_status(const set &filter_flags_to_remove, bool remove_all = false); private: string ALT_str, FILTER_str, INFO_str, FORMAT_str, QUAL_str; vector GENOTYPE_str; }; #endif /* VCF_ENTRY_H_ */ vcftools-0.1.15/src/cpp/vcf_entry_setters.cpp000066400000000000000000000126351307140004000212560ustar00rootroot00000000000000/* * vcf_entry_setters.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_entry.h" #include "entry.h" void vcf_entry::set_ALT(const string &in) { istringstream ss(in); string tmpstr; 
ALT.resize(0); while(!ss.eof()) { getline(ss, tmpstr, ','); add_ALT_allele(tmpstr); } parsed_ALT = true; } void vcf_entry::set_FORMAT(const string &in) { FORMAT.resize(0); FORMAT_to_idx.clear(); if (in.size() > 0) { istringstream ss(in); string tmpstr; unsigned int pos=0; while(!ss.eof()) { getline(ss, tmpstr, ':'); add_FORMAT_entry(tmpstr, pos); pos++; } } GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; if (FORMAT_to_idx.find("GT") != FORMAT_to_idx.end()) GT_idx = FORMAT_to_idx["GT"]; if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end()) GQ_idx = FORMAT_to_idx["GQ"]; if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end()) DP_idx = FORMAT_to_idx["DP"]; if (FORMAT_to_idx.find("FT") != FORMAT_to_idx.end()) FT_idx = FORMAT_to_idx["FT"]; parsed_FORMAT = true; } void vcf_entry::add_FORMAT_entry(const string &in, unsigned int pos) { FORMAT.push_back(in); FORMAT_to_idx[in] = pos; } // The following function reads in a genotype from a '0/1'-like string. // Should handle haploid types to, but NOT polyploidy. void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const string &in) { ploidy.resize(N_indv); if ((in.size() == 3) && ((in.c_str()[1] == '/') || (in.c_str()[1] == '|'))) { // Fast, diploid case... ploidy[indv] = 2; set_indv_PHASE(indv, in.c_str()[1]); set_indv_GENOTYPE_alleles(indv, in.c_str()[0], in.c_str()[2]); } else { // More complex case... 
size_t pos = in.find_first_of("/|"); if (pos != string::npos) { // autosome ploidy[indv] = 2; set_indv_PHASE(indv, in[pos]); set_indv_GENOTYPE_alleles(indv, make_pair(in.substr(0,pos), in.substr(pos+1))); } else { // Male chrX, or chrY ploidy[indv] = 1; set_indv_PHASE(indv, '|'); set_indv_GENOTYPE_alleles(indv, make_pair(in.substr(0,pos), ".")); } // Check for polypoidy size_t pos2 = in.find_last_of("/|"); if (pos != pos2) LOG.error("Polyploidy found, and not supported by vcftools: " + CHROM + ":" + header::int2str(POS)); } parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { ploidy.resize(N_indv); ploidy[indv] = 2; set_indv_GENOTYPE_ids(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { ploidy.resize(N_indv); ploidy[indv] = 2; set_indv_GENOTYPE_alleles(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (in.first != ".") a.first = header::str2int(in.first); if (in.second != ".") a.second = header::str2int(in.second); GENOTYPE[indv] = a; parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (a1 != '.') a.first = a1 - '0'; if (a2 != '.') a.second = a2 - '0'; GENOTYPE[indv] = a; parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); GENOTYPE[indv] = in; } void vcf_entry::set_indv_PHASE(unsigned int indv, char in) { if (PHASE.size() == 0) PHASE.resize(N_indv, '/'); PHASE[indv] = in; parsed_GT[indv] = true; } void 
vcf_entry::set_indv_GQUALITY(unsigned int indv, double in) { parsed_GQ[indv] = true; if (in == -1) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); if (in > 99) in = 99; GQUALITY[indv] = in; } void vcf_entry::set_indv_GFILTER(unsigned int indv, const string &in) { parsed_FT[indv] = true; if (GFILTER.size() == 0) GFILTER.resize(N_indv); GFILTER[indv].resize(0); if ((in.size() == 0) || (in == ".")) return; static istringstream ss; static string ith_FILTER; ss.clear(); ss.str(in); while (!ss.eof()) { getline(ss, ith_FILTER, ';'); if ((ith_FILTER.size()==0) || (ith_FILTER == ".")) continue; // Don't bother storing "unfiltered" state. GFILTER[indv].push_back(ith_FILTER); } } void vcf_entry::set_FILTER(const string &FILTER_str) { FILTER.resize(0); if (FILTER_str != ".") { istringstream ss(FILTER_str); string ith_FILTER; while (!ss.eof()) { getline(ss, ith_FILTER, ';'); FILTER.push_back(ith_FILTER); } } sort(FILTER.begin(), FILTER.end()); parsed_FILTER = true; } void vcf_entry::set_INFO(const string &INFO_str) { INFO.resize(0); if ((INFO_str.size() > 0) && (INFO_str != ".")) { istringstream ss(INFO_str); string tmpstr; while(!ss.eof()) { getline(ss, tmpstr, ';'); istringstream ss2(tmpstr); getline(ss2, tmpstr, '='); pair INFO_entry(tmpstr, "."); if (!ss2.eof()) { // If there is a value entry, read it now getline(ss2, tmpstr); INFO_entry.second = tmpstr; } else // Otherwise, set it equal to 1 INFO_entry.second = "1"; INFO.push_back(INFO_entry); } } parsed_INFO = true; } vcftools-0.1.15/src/cpp/vcf_file.cpp000066400000000000000000000162301307140004000172560ustar00rootroot00000000000000/* * vcf_file.cpp * * Created on: Dec 11, 2012 * Author: amarcketta */ #include "vcf_file.h" vcf_file::vcf_file(const parameters &p, bool diff) { if (!diff) { filename = p.vcf_filename; compressed = p.vcf_compressed; stream = p.stream_in; } else { filename = p.diff_file; compressed = p.diff_file_compressed; stream = false; } 
gzMAX_LINE_LEN = 0; N_entries = 0; N_kept_entries = 0; meta_data = header(); if (stream && compressed) open_gz(); else if (stream) { char first = cin.peek(); if (first == 0x1f) LOG.error("File starts with gzip magic string. Shouldn't you be using --gzvcf?\n"); file_in = &std::cin; } else open(); read_header(); include_indv = vector(meta_data.N_indv,true); } vcf_file::~vcf_file() { close(); } void vcf_file::read_header() { string line; unsigned int line_index = 0; line_index += meta_data.add_FILTER_descriptor("ID=PASS,Description=PASS", line_index); while (!eof()) { read_line(line); if (line[0] == '#') if (line[1] == '#') meta_data.parse_meta(line, line_index); else { meta_data.parse_header(line); return; } else return; } } void vcf_file::print(const parameters ¶ms) { LOG.printLOG("Outputting VCF file...\n"); string output_file = params.output_prefix + ".recode.vcf"; streambuf * buf; ofstream temp_out; if (!params.stream_out) { temp_out.open(output_file.c_str(), ios::out); if (!temp_out.is_open()) LOG.error("Could not open VCF Output file: " + output_file, 3); buf = temp_out.rdbuf(); } else buf = cout.rdbuf(); ostream out(buf); for (unsigned int ui=0; ui 0) out << "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new vcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true,true,true,true); e->print(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; } void vcf_file::print_bcf(const parameters ¶ms) { LOG.printLOG("Outputting BCF file...\n"); BGZF * out; if(!params.stream_out) { string output_file = params.output_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); string header_str; uint32_t len_text = 0; vector header; char magic[5] = {'B','C','F','\2', '\2'}; 
bgzf_write(out, magic, 5); if (meta_data.has_idx) { LOG.warning("VCF file contains IDX values in header. These are being removed for conversion to BCF."); meta_data.reprint(); meta_data.reparse(); } for (unsigned int ui=0; ui contig_vector; get_contigs(params.contigs_file, contig_vector); for(unsigned int ui=0; ui 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new vcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true,true,true,true); e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; bgzf_close(out); } void vcf_file::open() { struct stat buf; int i = stat(filename.c_str(), &buf); if (i != 0) { perror("stat error"); LOG.error("Can't determine file type of " + filename, 0); } if (!S_ISREG(buf.st_mode)) LOG.error("Does not appear to be a regular file: " + filename, 0); if (filename.substr(filename.size()-4) == ".bcf") LOG.error("Filename ends in '.bcf'. Shouldn't you be using --bcf?\n"); if (!compressed) { if (filename.substr(filename.size()-3) == ".gz") LOG.error("Filename ends in '.gz'. 
Shouldn't you be using --gzvcf or --gzdiff?\n"); file_tmp.open(filename.c_str(), ios::in); if (!file_tmp.is_open()) LOG.error("Could not open VCF file: " + filename, 0); file_in = &file_tmp; } else open_gz(); } void vcf_file::open_gz() { gzMAX_LINE_LEN = 1024*1024; gz_readbuffer = new char[gzMAX_LINE_LEN]; if (stream) gzfile_in = gzdopen(fileno(stdin), "r"); else gzfile_in = gzopen(filename.c_str(), "rb"); if (gzfile_in == NULL) LOG.error("Could not open GZVCF file: " + filename, 0); #ifdef ZLIB_VERNUM string tmp(ZLIB_VERSION); LOG.printLOG("Using zlib version: " + tmp + "\n"); #if (ZLIB_VERNUM >= 0x1240) gzbuffer(gzfile_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster #else LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files.\n"); #endif #endif } void vcf_file::close() { if (compressed) { gzclose(gzfile_in); delete [] gz_readbuffer; } } bool vcf_file::eof() { bool out; if (!compressed) out = file_in->eof(); else out = gzeof(gzfile_in); // Returns 1 when EOF has previously been detected reading the given input stream, otherwise zero. return out; } void vcf_file::get_entry(vector &out) { out.resize(0); read_line(out); } entry* vcf_file::get_entry_object() { return new vcf_entry(meta_data, include_indv); } void vcf_file::read_line(string &out) { char * tmp; out = ""; if (!compressed) { getline(*file_in, out); out.erase( out.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line } else { bool again = true; while (again == true) { tmp = gzgets(gzfile_in, gz_readbuffer, gzMAX_LINE_LEN); if (tmp == NULL) return; out.append(gz_readbuffer); if ((strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) || (gz_readbuffer[gzMAX_LINE_LEN-2] == '\n')) again = false; } out.erase( out.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
} } void vcf_file::read_line(vector &out) { static string tmp; tmp=""; out.resize(0); read_line(tmp); vector tmp_char(tmp.begin(),tmp.end()); out = tmp_char; } vcftools-0.1.15/src/cpp/vcf_file.h000066400000000000000000000014111307140004000167160ustar00rootroot00000000000000/* * vcf_file.h * * Created on: Dec 11, 2012 * Author: amarcketta */ #ifndef VCF_FILE_H_ #define VCF_FILE_H_ #include "output_log.h" #include "vcf_entry.h" #include "parameters.h" #include "variant_file.h" extern output_log LOG; using namespace std; class vcf_file : public variant_file { public: vcf_file(const parameters ¶ms, bool diff=false); void get_entry(vector &out); entry* get_entry_object(); void print(const parameters ¶ms); void print_bcf(const parameters ¶ms); protected: ~vcf_file(); private: char *gz_readbuffer; void open(); void open_gz(); void close(); bool eof(); inline void read_line(string &out); inline void read_line(vector &out); void read_header(); bool stream; }; #endif /* VCF_FILE_H_ */ vcftools-0.1.15/src/cpp/vcftools.1000066400000000000000000000746171307140004000167330ustar00rootroot00000000000000.\" Manpage for vcftools. .TH vcftools man page 1 "05 January 2016" "0.1.14" "vcftools man page" .SH NAME vcftools v0.1.14 \- Utilities for the variant call format (VCF) and binary variant call format (BCF) .SH SYNOPSIS .B vcftools [ .B --vcf FILE | .B --gzvcf FILE | .B --bcf FILE] [ .B --out OUTPUT PREFIX ] [ FILTERING OPTIONS ] [ OUTPUT OPTIONS ] .SH DESCRIPTION vcftools is a suite of functions for use on genetic variation data in the form of VCF and BCF files. The tools provided will be used mainly to summarize data, run calculations on data, filter out data, and convert data into other useful file formats. 
.SH EXAMPLES Output allele frequency for all sites in the input vcf file from chromosome 1 .RS 2 .B vcftools --gzvcf input_file.vcf.gz --freq --chr 1 --out chr1_analysis .RE .PP Output a new vcf file from the input vcf file that removes any indel sites .RS 2 .B vcftools --vcf input_file.vcf --remove-indels --recode --recode-INFO-all --out SNPs_only .RE .PP Output file comparing the sites in two vcf files .RS 2 .B vcftools --gzvcf input_file1.vcf.gz --gzdiff input_file2.vcf.gz --diff-site --out in1_v_in2 .RE .PP Output a new vcf file to standard out without any sites that have a filter tag, then compress it with gzip .RS 2 .B vcftools --gzvcf input_file.vcf.gz --remove-filtered-all --recode --stdout | gzip -c > output_PASS_only.vcf.gz .RE .PP Output a Hardy-Weinberg p-value for every site in the bcf file that does not have any missing genotypes .RS 2 .B vcftools --bcf input_file.bcf --hardy --max-missing 1.0 --out output_noMissing .RE .PP Output nucleotide diversity at a list of positions .RS 2 zcat input_file.vcf.gz | .B vcftools --vcf - --site-pi --positions SNP_list.txt --out nucleotide_diversity .SH BASIC OPTIONS These options are used to specify the input and output files. .SS INPUT FILE OPTIONS .RS 2 .B --vcf .I .RS 2 This option defines the VCF file to be processed. VCFtools expects files in VCF format v4.0, v4.1 or v4.2. The latter two are supported with some small limitations. If the user provides a dash character '-' as a file name, the program expects a VCF file to be piped in through standard in. .RE .PP .B --gzvcf .I .RS 2 This option can be used in place of the --vcf option to read compressed (gzipped) VCF files directly. .RE .PP .B --bcf .I .RS 2 This option can be used in place of the --vcf option to read BCF2 files directly. You do not need to specify if this file is compressed with BGZF encoding. If the user provides a dash character '-' as a file name, the program expects a BCF2 file to be piped in through standard in. 
.RE .SS OUTPUT FILE OPTIONS .RS 2 .B --out .I .RS 2 This option defines the output filename prefix for all files generated by vcftools. For example, if is set to output_filename, then all output files will be of the form output_filename.*** . If this option is omitted, all output files will have the prefix "out." in the current working directory. .RE .PP .B --stdout .br .B -c .RS 2 These options direct the vcftools output to standard out so it can be piped into another program or written directly to a filename of choice. However, a select few output functions cannot be written to standard out. .RE .PP .B --temp .I .RS 2 This option can be used to redirect any temporary files that vcftools creates into a specified directory. .RE .SH SITE FILTERING OPTIONS These options are used to include or exclude certain sites from any analysis being performed by the program. .SS POSITION FILTERING .RS 2 .B --chr .I .br .B --not-chr .I .RS 2 Includes or excludes sites with indentifiers matching . These options may be used multiple times to include or exclude more than one chromosome. .RE .PP .B --from-bp .I .br .B --to-bp .I .RS 2 These options specify a lower bound and upper bound for a range of sites to be processed. Sites with positions less than or greater than these values will be excluded. These options can only be used in conjunction with a single usage of --chr. Using one of these does not require use of the other. .RE .PP .B --positions .I .br .B --exclude-positions .I .RS 2 Include or exclude a set of sites on the basis of a list of positions in a file. Each line of the input file should contain a (tab-separated) chromosome and position. The file can have comment lines that start with a "#", they will be ignored. .RE .PP .B --positions-overlap .I .br .B --exclude-positions-overlap .I .RS 2 Include or exclude a set of sites on the basis of the reference allele overlapping with a list of positions in a file. 
Each line of the input file should contain a (tab-separated) chromosome and position. The file can have comment lines that start with a "#", they will be ignored. .RE .PP .B --bed .I .br .B --exclude-bed .I .RS 2 Include or exclude a set of sites on the basis of a BED file. Only the first three columns (chrom, chromStart and chromEnd) are required. The BED file is expected to have a header line. A site will be kept or excluded if any part of any allele (REF or ALT) at a site is within the range of one of the BED entries. .RE .PP .B --thin .I .RS 2 Thin sites so that no two sites are within the specified distance from one another. .RE .PP .B --mask .I .br .B --invert-mask .I .br .B --mask-min .I .RS 2 These options are used to specify a FASTA-like mask file to filter with. The mask file contains a sequence of integer digits (between 0 and 9) for each position on a chromosome that specify if a site at that position should be filtered or not. .br An example mask file would look like: .RS 2 .I >1 .br .I 0000011111222... .br .I >2 .br .I 2222211111000... .RE In this example, sites in the VCF file located within the first 5 bases of the start of chromosome 1 would be kept, whereas sites at position 6 onwards would be filtered out. And sites after the 11th position on chromosome 2 would be filtered out as well. .br The "--invert-mask" option takes the same format mask file as the "--mask" option, however it inverts the mask file before filtering with it. .br And the "--mask-min" option specifies a threshold mask value between 0 and 9 to filter positions by. The default threshold is 0, meaning only sites with that value or lower will be kept. .RE .SS SITE ID FILTERING .RS 2 .B --snp .I .RS 2 Include SNP(s) with matching ID (e.g. a dbSNP rsID). This command can be used multiple times in order to include more than one SNP. .RE .PP .B --snps .I .br .B --exclude .I .RS 2 Include or exclude a list of SNPs given in a file. The file should contain a list of SNP IDs (e.g. 
dbSNP rsIDs), with one ID per line. No header line is expected. .RE .SS VARIANT TYPE FILTERING .RS 2 .B --keep-only-indels .br .B --remove-indels .RS 2 Include or exclude sites that contain an indel. For these options "indel" means any variant that alters the length of the REF allele. .RE .SS FILTER FLAG FILTERING .RS 2 .B --remove-filtered-all .RS 2 Removes all sites with a FILTER flag other than PASS. .RE .PP .B --keep-filtered .I .br .B --remove-filtered .I .RS 2 Includes or excludes all sites marked with a specific FILTER flag. These options may be used more than once to specify multiple FILTER flags. .RE .SS INFO FIELD FILTERING .RS 2 .B --keep-INFO .I .br .B --remove-INFO .I .RS 2 Includes or excludes all sites with a specific INFO flag. These options only filter on the presence of the flag and not its value. These options can be used multiple times to specify multiple INFO flags. .RE .SS ALLELE FILTERING .RS 2 .B --maf .I .br .B --max-maf .I .RS 2 Include only sites with a Minor Allele Frequency greater than or equal to the "--maf" value and less than or equal to the "--max-maf" value. One of these options may be used without the other. Allele frequency is defined as the number of times an allele appears over all individuals at that site, divided by the total number of non-missing alleles at that site. .RE .PP .B --non-ref-af .I .br .B --max-non-ref-af .I .br .B --non-ref-ac .I .br .B --max-non-ref-ac .I .PP .B --non-ref-af-any .I .br .B --max-non-ref-af-any .I .br .B --non-ref-ac-any .I .br .B --max-non-ref-ac-any .I .RS 2 Include only sites with all Non-Reference (ALT) Allele Frequencies (af) or Counts (ac) within the range specified, and including the specified value. The default options require .B all alleles to meet the specified criteria, whereas the options appended with "any" require only one allele to meet the criteria. 
The Allele frequency is defined as the number of times an allele appears over all individuals at that site, divided by the total number of non-missing alleles at that site. .RE .PP .B --mac .I .br .B --max-mac .I .RS 2 Include only sites with Minor Allele Count greater than or equal to the "--mac" value and less than or equal to the "--max-mac" value. One of these options may be used without the other. Allele count is simply the number of times that allele appears over all individuals at that site. .RE .PP .B --min-alleles .I .br .B --max-alleles .I .RS 2 Include only sites with a number of alleles greater than or equal to the "--min-alleles" value and less than or equal to the "--max-alleles" value. One of these options may be used without the other. .br For example, to include only bi-allelic sites, one could use: .br .RS 2 .B vcftools --vcf file1.vcf --min-alleles 2 --max-alleles 2 .RE .SS GENOTYPE VALUE FILTERING .RS 2 .B --min-meanDP .I .br .B --max-meanDP .I .RS 2 Includes only sites with mean depth values (over all included individuals) greater than or equal to the "--min-meanDP" value and less than or equal to the "--max-meanDP" value. One of these options may be used without the other. These options require that the "DP" FORMAT tag is included for each site. .RE .PP .B --hwe .I .RS 2 Assesses sites for Hardy-Weinberg Equilibrium using an exact test, as defined by Wigginton, Cutler and Abecasis (2005). Sites with a p-value below the threshold defined by this option are taken to be out of HWE, and therefore excluded. .RE .PP .B --max-missing .I .RS 2 Exclude sites on the basis of the proportion of missing data (defined to be between 0 and 1, where 0 allows sites that are completely missing and 1 indicates no missing data allowed). .RE .PP .B --max-missing-count .I .RS 2 Exclude sites with more than this number of missing genotypes over all individuals. .RE .PP .B --phased .RS 2 Excludes all sites that contain unphased genotypes. 
.RE .SS MISCELLANEOUS FILTERING .RS 2 .B --minQ .I .RS 2 Includes only sites with Quality value above this threshold. .RE .SH INDIVIDUAL FILTERING OPTIONS These options are used to include or exclude certain individuals from any analysis being performed by the program. .br .RS 2 .B --indv .I .br .B --remove-indv .I .RS 2 Specify an individual to be kept or removed from the analysis. This option can be used multiple times to specify multiple individuals. If both options are specified, then the "--indv" option is executed before the "--remove-indv option". .RE .PP .B --keep .I .br .B --remove .I .RS 2 Provide files containing a list of individuals to either include or exclude in subsequent analysis. Each individual ID (as defined in the VCF headerline) should be included on a separate line. If both options are used, then the "--keep" option is executed before the "--remove" option. When multiple files are provided, the union of individuals from all keep files subtracted by the union of individuals from all remove files are kept. No header line is expected. .RE .PP .B --max-indv .I .RS 2 Randomly thins individuals so that only the specified number are retained. .RE .SH GENOTYPE FILTERING OPTIONS These options are used to exclude genotypes from any analysis being performed by the program. If excluded, these values will be treated as missing. .br .RS 2 .B --remove-filtered-geno-all .RS 2 Excludes all genotypes with a FILTER flag not equal to "." (a missing value) or PASS. .RE .PP .B --remove-filtered-geno .I .RS 2 Excludes genotypes with a specific FILTER flag. .RE .PP .B --minGQ .I .RS 2 Exclude all genotypes with a quality below the threshold specified. This option requires that the "GQ" FORMAT tag is specified for all sites. .RE .PP .B --minDP .I .br .B --maxDP .I .RS 2 Includes only genotypes greater than or equal to the "--minDP" value and less than or equal to the "--maxDP" value. This option requires that the "DP" FORMAT tag is specified for all sites. 
.RE .SH OUTPUT OPTIONS These options specify which analyses or conversions to perform on the data that passed through all specified filters. .SS OUTPUT ALLELE STATISTICS .RS 2 .B --freq .br .B --freq2 .RS 2 Outputs the allele frequency for each site in a file with the suffix ".frq". The second option is used to suppress output of any information about the alleles. .RE .PP .B --counts .br .B --counts2 .RS 2 Outputs the raw allele counts for each site in a file with the suffix ".frq.count". The second option is used to suppress output of any information about the alleles. .RE .PP .B --derived .RS 2 For use with the previous four frequency and count options only. Re-orders the output file columns so that the ancestral allele appears first. This option relies on the ancestral allele being specified in the VCF file using the AA tag in the INFO field. .RE .SS OUTPUT DEPTH STATISTICS .RS 2 .B --depth .RS 2 Generates a file containing the mean depth per individual. This file has the suffix ".idepth". .RE .PP .B --site-depth .RS 2 Generates a file containing the depth per site summed across all individuals. This output file has the suffix ".ldepth". .RE .PP .B --site-mean-depth .RS 2 Generates a file containing the mean depth per site averaged across all individuals. This output file has the suffix ".ldepth.mean". .RE .PP .B --geno-depth .RS 2 Generates a (possibly very large) file containing the depth for each genotype in the VCF file. Missing entries are given the value -1. The file has the suffix ".gdepth". .RE .SS OUTPUT LD STATISTICS .RS 2 .B --hap-r2 .RS 2 Outputs a file reporting the r2, D, and D' statistics using phased haplotypes. These are the traditional measures of LD often reported in the population genetics literature. The output file has the suffix ".hap.ld". This option assumes that the VCF input file has phased haplotypes. 
.RE .PP .B --geno-r2 .RS 2 Calculates the squared correlation coefficient between genotypes encoded as 0, 1 and 2 to represent the number of non-reference alleles in each individual. This is the same as the LD measure reported by PLINK. The D and D' statistics are only available for phased genotypes. The output file has the suffix ".geno.ld". .RE .PP .B --geno-chisq .RS 2 If your data contains sites with more than two alleles, then this option can be used to test for genotype independence via the chi-squared statistic. The output file has the suffix ".geno.chisq". .RE .PP .B --hap-r2-positions .I .br .B --geno-r2-positions .I .RS 2 Outputs a file reporting the r2 statistics of the sites contained in the provided file verses all other sites. The output files have the suffix ".list.hap.ld" or ".list.geno.ld", depending on which option is used. .RE .PP .B --ld-window .I .RS 2 This optional parameter defines the maximum number of SNPs between the SNPs being tested for LD in the "--hap-r2", "--geno-r2", and "--geno-chisq" functions. .RE .PP .B --ld-window-bp .I .RS 2 This optional parameter defines the maximum number of physical bases between the SNPs being tested for LD in the "--hap-r2", "--geno-r2", and "--geno-chisq" functions. .RE .PP .B --ld-window-min .I .RS 2 This optional parameter defines the minimum number of SNPs between the SNPs being tested for LD in the "--hap-r2", "--geno-r2", and "--geno-chisq" functions. .RE .PP .B --ld-window-bp-min .I .RS 2 This optional parameter defines the minimum number of physical bases between the SNPs being tested for LD in the "--hap-r2", "--geno-r2", and "--geno-chisq" functions. .RE .PP .B --min-r2 .I .RS 2 This optional parameter sets a minimum value for r2, below which the LD statistic is not reported by the "--hap-r2", "--geno-r2", and "--geno-chisq" functions. .RE .PP .B --interchrom-hap-r2 .br .B --interchrom-geno-r2 .RS 2 Outputs a file reporting the r2 statistics for sites on different chromosomes. 
The output files have the suffix ".interchrom.hap.ld" or ".interchrom.geno.ld", depending on the option used. .RE .SS OUTPUT TRANSITION/TRANSVERSION STATISTICS .RS 2 .B --TsTv .I .RS 2 Calculates the Transition / Transversion ratio in bins of size defined by this option. Only uses bi-allelic SNPs. The resulting output file has the suffix ".TsTv". .RE .PP .B --TsTv-summary .RS 2 Calculates a simple summary of all Transitions and Transversions. The output file has the suffix ".TsTv.summary". .RE .PP .B --TsTv-by-count .RS 2 Calculates the Transition / Transversion ratio as a function of alternative allele count. Only uses bi-allelic SNPs. The resulting output file has the suffix ".TsTv.count". .RE .PP .B --TsTv-by-qual .RS 2 Calculates the Transition / Transversion ratio as a function of SNP quality threshold. Only uses bi-allelic SNPs. The resulting output file has the suffix ".TsTv.qual". .RE .PP .B --FILTER-summary .RS 2 Generates a summary of the number of SNPs and Ts/Tv ratio for each FILTER category. The output file has the suffix ".FILTER.summary". .RE .SS OUTPUT NUCLEOTIDE DIVERGENCE STATISTICS .RS 2 .B --site-pi .RS 2 Measures nucleotide divergency on a per-site basis. The output file has the suffix ".sites.pi". .RE .PP .B --window-pi .I .br .B --window-pi-step .I .RS 2 Measures the nucleotide diversity in windows, with the number provided as the window size. The output file has the suffix ".windowed.pi". The latter is an optional argument used to specify the step size in between windows. .RE .SS OUTPUT FST STATISTICS .RS 2 .B --weir-fst-pop .I .RS 2 This option is used to calculate an Fst estimate from Weir and Cockerham's 1984 paper. This is the preferred calculation of Fst. The provided file must contain a list of individuals (one individual per line) from the VCF file that correspond to one population. This option can be used multiple times to calculate Fst for more than two populations. These files will also be included as "--keep" options. 
By default, calculations are done on a per-site basis. The output file has the suffix ".weir.fst". .RE .PP .B --fst-window-size .I .br .B --fst-window-step .I .RS 2 These options can be used with "--weir-fst-pop" to do the Fst calculations on a windowed basis instead of a per-site basis. These arguments specify the desired window size and the desired step size between windows. .RE .SS OUTPUT OTHER STATISTICS .RS 2 .B --het .RS 2 Calculates a measure of heterozygosity on a per-individual basis. Specfically, the inbreeding coefficient, F, is estimated for each individual using a method of moments. The resulting file has the suffix ".het". .RE .PP .B --hardy .RS 2 Reports a p-value for each site from a Hardy-Weinberg Equilibrium test (as defined by Wigginton, Cutler and Abecasis (2005)). The resulting file (with suffix ".hwe") also contains the Observed numbers of Homozygotes and Heterozygotes and the corresponding Expected numbers under HWE. .RE .PP .B --TajimaD .I .RS 2 Outputs Tajima's D statistic in bins with size of the specified number. The output file has the suffix ".Tajima.D". .RE .PP .B --indv-freq-burden .RS 2 This option calculates the number of variants within each individual of a specific frequency. The resulting file has the suffix ".ifreqburden". .RE .PP .B --LROH .RS 2 This option will identify and output Long Runs of Homozygosity. The output file has the suffix ".LROH". This function is experimental, and will use a lot of memory if applied to large datasets. .RE .PP .B --relatedness .RS 2 This option is used to calculate and output a relatedness statistic based on the method of Yang et al, Nature Genetics 2010 (doi:10.1038/ng.608). Specifically, calculate the unadjusted Ajk statistic. Expectation of Ajk is zero for individuals within a populations, and one for an individual with themselves. The output file has the suffix ".relatedness". 
.RE .PP .B --relatedness2 .RS 2 This option is used to calculate and output a relatedness statistic based on the method of Manichaikul et al., BIOINFORMATICS 2010 (doi:10.1093/bioinformatics/btq559). The output file has the suffix ".relatedness2". .RE .PP .B --site-quality .RS 2 Generates a file containing the per-site SNP quality, as found in the QUAL column of the VCF file. This file has the suffix ".lqual". .RE .PP .B --missing-indv .RS 2 Generates a file reporting the missingness on a per-individual basis. The file has the suffix ".imiss". .RE .PP .B --missing-site .RS 2 Generates a file reporting the missingness on a per-site basis. The file has the suffix ".lmiss". .RE .PP .B --SNPdensity .I .RS 2 Calculates the number and density of SNPs in bins of size defined by this option. The resulting output file has the suffix ".snpden". .RE .PP .B --kept-sites .RS 2 Creates a file listing all sites that have been kept after filtering. The file has the suffix ".kept.sites". .RE .PP .B --removed-sites .RS 2 Creates a file listing all sites that have been removed after filtering. The file has the suffix ".removed.sites". .RE .PP .B --singletons .RS 2 This option will generate a file detailing the location of singletons, and the individual they occur in. The file reports both true singletons, and private doubletons (i.e. SNPs where the minor allele only occurs in a single individual and that individual is homozygotic for that allele). The output file has the suffix ".singletons". .RE .PP .B --hist-indel-len .RS 2 This option will generate a histogram file of the length of all indels (including SNPs). It shows both the count and the percentage of all indels for indel lengths that occur at least once in the input file. SNPs are considered indels with length zero. The output file has the suffix ".indel.hist". .RE .PP .B --hapcount .I .RS 2 This option will output the number of unique haplotypes within user specified bins, as defined by the BED file. 
The output file has the suffix ".hapcount". .RE .PP .B --mendel .I .RS 2 This option is use to report mendel errors identified in trios. The command requires a PLINK-style PED file, with the first four columns specifying a family ID, the child ID, the father ID, and the mother ID. The output of this command has the suffix ".mendel". .RE .PP .B --extract-FORMAT-info .I .RS 2 Extract information from the genotype fields in the VCF file relating to a specfied FORMAT identifier. The resulting output file has the suffix "..FORMAT". For example, the following command would extract the all of the GT (i.e. Genotype) entries: .br .RS 2 .B vcftools --vcf file1.vcf --extract-FORMAT-info GT .RE .RE .PP .B --get-INFO .I .RS 2 This option is used to extract information from the INFO field in the VCF file. The argument specifies the INFO tag to be extracted, and the option can be used multiple times in order to extract multiple INFO entries. The resulting file, with suffix ".INFO", contains the required INFO information in a tab-separated table. For example, to extract the NS and DB flags, one would use the command: .br .RS 2 .B vcftools --vcf file1.vcf --get-INFO NS --get-INFO DB .RE .SS OUTPUT VCF FORMAT .RS 2 .B --recode .br .B --recode-bcf .RS 2 These options are used to generate a new file in either VCF or BCF from the input VCF or BCF file after applying the filtering options specified by the user. The output file has the suffix ".recode.vcf" or ".recode.bcf". By default, the INFO fields are removed from the output file, as the INFO values may be invalidated by the recoding (e.g. the total depth may need to be recalculated if individuals are removed). This behavior may be overriden by the following options. By default, BCF files are written out as BGZF compressed files. .RE .PP .B --recode-INFO .I .br .B --recode-INFO-all .RS 2 These options can be used with the above recode options to define an INFO key name to keep in the output file. 
This option can be used multiple times to keep more of the INFO fields. The second option is used to keep all INFO values in the original file. .RE .PP .B --contigs .I .RS 2 This option can be used in conjuction with the --recode-bcf when the input file does not have any contig declarations. This option expects a file name with one contig header per line. These lines are included in the output file. .RE .SS OUTPUT OTHER FORMATS .RS 2 .B --012 .RS 2 This option outputs the genotypes as a large matrix. Three files are produced. The first, with suffix ".012", contains the genotypes of each individual on a separate line. Genotypes are represented as 0, 1 and 2, where the number represent that number of non-reference alleles. Missing genotypes are represented by -1. The second file, with suffix ".012.indv" details the individuals included in the main file. The third file, with suffix ".012.pos" details the site locations included in the main file. .RE .PP .B --IMPUTE .RS 2 This option outputs phased haplotypes in IMPUTE reference-panel format. As IMPUTE requires phased data, using this option also implies --phased. Unphased individuals and genotypes are therefore excluded. Only bi-allelic sites are included in the output. Using this option generates three files. The IMPUTE haplotype file has the suffix ".impute.hap", and the IMPUTE legend file has the suffix ".impute.hap.legend". The third file, with suffix ".impute.hap.indv", details the individuals included in the haplotype file, although this file is not needed by IMPUTE. .RE .PP .B --ldhat .br .B --ldhelmet .br .B --ldhat-geno .RS 2 These options output data in LDhat/LDhelmet format. This option requires the "--chr" filter option to also be used. The two first options output phased data only, and therefore also implies "--phased" be used, leading to unphased individuals and genotypes being excluded. For LDhelmet, only snps will be considered, and therefore it implies "--remove-indels". 
The second option treats all of the data as unphased, and therefore outputs LDhat files in genotype/unphased format. Two output files are generated with the suffixes ".ldhat.sites" and ".ldhat.locs", which correspond to the LDhat "sites" and "locs" input files respectively; for LDhelmet, the two files generated have the suffixes ".ldhelmet.snps" and ".ldhelmet.pos", which corresponds to the "SNPs" and "positions" files. .RE .PP .B --BEAGLE-GL .br .B --BEAGLE-PL .RS 2 These options output genotype likelihood information for input into the BEAGLE program. The VCF file is required to contain FORMAT fields with "GL" or "PL" tags, which can generally be output by SNP callers such as the GATK. Use of this option requires a chromosome to be specified via the "--chr" option. The resulting output file has the suffix ".BEAGLE.GL" or ".BEAGLE.PL" and contains genotype likelihoods for biallelic sites. This file is suitable for input into BEAGLE via the "like=" argument. .RE .PP .B --plink .br .B --plink-tped .br .B --chrom-map .RS 2 These options output the genotype data in PLINK PED format. With the first option, two files are generated, with suffixes ".ped" and ".map". Note that only bi-allelic loci will be output. Further details of these files can be found in the PLINK documentation. .br Note: The first option can be very slow on large datasets. Using the --chr option to divide up the dataset is advised, or alternatively use the --plink-tped option which outputs the files in the PLINK transposed format with suffixes ".tped" and ".tfam". .br For usage with variant sites in species other than humans, the --chrom-map option may be used to specify a file name that has a tab-delimited mapping of chromosome name to a desired integer value with one line per chromosome. This file must contain a mapping for every chromosome value found in the file. .RE .SH COMPARISON OPTIONS These options are used to compare the original variant file to another variant file and output the results. 
All of the diff functions require both files to contain the same chromosomes and that the files be sorted in the same order. If one of the files contains chromosomes that the other file does not, use the --not-chr filter to remove them from the analysis. .SS DIFF VCF FILE .RS 2 .B --diff .I .br .B --gzdiff .I .br .B --diff-bcf .I .RS 2 These options compare the original input file to this specified VCF, gzipped VCF, or BCF file. These options must be specified with one additional option described below in order to specify what type of comparison is to be performed. See the examples section for typical usage. .RE .SS DIFF OPTIONS .RS 2 .B --diff-site .RS 2 Outputs the sites that are common / unique to each file. The output file has the suffix ".diff.sites_in_files". .RE .PP .B --diff-indv .RS 2 Outputs the individuals that are common / unique to each file. The output file has the suffix ".diff.indv_in_files". .RE .PP .B --diff-site-discordance .RS 2 This option calculates discordance on a site by site basis. The resulting output file has the suffix ".diff.sites". .RE .PP .B --diff-indv-discordance .RS 2 This option calculates discordance on a per-individual basis. The resulting output file has the suffix ".diff.indv". .RE .PP .B --diff-indv-map .I .RS 2 This option allows the user to specify a mapping of individual IDs in the second file to those in the first file. The program expects the file to contain a tab-delimited line containing an individual's name in file one followed by that same individual's name in file two with one mapping per line. .RE .PP .B --diff-discordance-matrix .RS 2 This option calculates a discordance matrix. This option only works with bi-allelic loci with matching alleles that are present in both files. The resulting output file has the suffix ".diff.discordance.matrix". .RE .PP .B --diff-switch-error .RS 2 This option calculates phasing errors (specifically "switch errors"). 
This option creates an output file describing switch errors found between sites, with suffix ".diff.switch". .RE .SH AUTHORS Adam Auton (adam.auton@einstein.yu.edu) .br Anthony Marcketta (anthony.marcketta@einstein.yu.edu) vcftools-0.1.15/src/cpp/vcftools.cpp000066400000000000000000000151461307140004000173450ustar00rootroot00000000000000/* * vcftools.cpp */ #include "vcftools.h" output_log LOG; int main(int argc, char *argv[]) { time_t start,end; time(&start); // The following turns off sync between C and C++ streams. // Apparently it's faster to turn sync off, and as I don't use C streams, it's okay to turn off. ios_base::sync_with_stdio(false); parameters params(argc, argv); params.print_help(); params.read_parameters(); LOG.open(params.stream_out, params.stream_err, params.output_prefix); LOG.printLOG("\nVCFtools - " + VCFTOOLS_VERSION + "\n"); LOG.printLOG("(C) Adam Auton and Anthony Marcketta 2009\n\n"); params.print_params(); variant_file *vf; if (!params.bcf_format) vf = new vcf_file(params); else vf = new bcf_file(params); vf->apply_filters(params); LOG.printLOG("After filtering, kept " + output_log::int2str(vf->N_kept_individuals()) + " out of " + output_log::int2str(vf->meta_data.N_indv) + " Individuals\n"); if (params.diff_file != "") { variant_file *variant_diff; if (params.diff_file_bcf) variant_diff = new bcf_file(params, true); else variant_diff = new vcf_file(params, true); variant_diff->apply_filters(params); if (params.diff_indv == true) vf->output_indv_in_files(params, *variant_diff); else if (params.diff_site_discordance == true) vf->output_discordance_by_site(params, *variant_diff); else if (params.diff_discordance_matrix == true) vf->output_discordance_matrix(params, *variant_diff); else if (params.diff_indv_discordance == true) vf->output_discordance_by_indv(params, *variant_diff); else if (params.diff_switch_error == true) vf->output_switch_error(params, *variant_diff); else if (params.diff_site == true) vf->output_sites_in_files(params, 
*variant_diff); else LOG.warning("Diff file provided, but no additional option.\n"); delete variant_diff; } if (params.num_outputs == 0) vf->write_stats(params); if (!params.INFO_to_extract.empty()) vf->output_INFO_for_each_site(params); if (params.FORMAT_id_to_extract != "") vf->output_FORMAT_information(params); if (params.output_indv_burden == true) vf->output_indv_burden(params); if (params.output_indv_depth == true) vf->output_individuals_by_mean_depth(params); if (params.output_indv_freq_burden == true) vf->output_indv_freq_burden(params); if (params.output_indv_freq_burden2 == true) vf->output_indv_freq_burden(params, 1); if (params.output_geno_depth == true) vf->output_genotype_depth(params); if (params.output_site_depth == true) vf->output_site_depth(params, false); if (params.output_site_mean_depth == true) vf->output_site_depth(params, true); if (params.output_freq == true) vf->output_frequency(params, false); if (params.output_counts == true) vf->output_frequency(params, true); if (params.plink_output == true) vf->output_as_plink(params); if (params.plink_tped_output == true) vf->output_as_plink_tped(params); if (params.output_HWE == true) vf->output_hwe(params); if (params.output_SNP_density_bin_size > 0) vf->output_SNP_density(params); if (params.output_indv_missingness == true) vf->output_indv_missingness(params); if (params.output_site_missingness == true) vf->output_site_missingness(params); if (params.output_geno_chisq == true) vf->output_genotype_chisq(params, -1.0); if (params.output_geno_rsq == true) vf->output_genotype_r2(params); if (params.output_interchromosomal_hap_rsq == true) vf->output_interchromosomal_haplotype_r2(params); if (params.output_interchromosomal_geno_rsq == true) vf->output_interchromosomal_genotype_r2(params); if (params.output_hap_rsq == true) vf->output_haplotype_r2(params); if (params.hap_rsq_position_list != "") vf->output_haplotype_r2_of_SNP_list_vs_all_others(params); if (params.geno_rsq_position_list != "") 
vf->output_genotype_r2_of_SNP_list_vs_all_others(params); if (params.output_het == true) vf->output_het(params); if (params.hapcount_BED != "") vf->output_haplotype_count(params); if (params.output_site_quality == true) vf->output_site_quality(params); if (params.output_012_matrix == true) vf->output_as_012_matrix(params); if (params.output_as_IMPUTE == true) vf->output_as_IMPUTE(params); if (params.output_BEAGLE_genotype_likelihoods_GL == true) vf->output_BEAGLE_genotype_likelihoods(params, 0); if (params.output_BEAGLE_genotype_likelihoods_PL == true) vf->output_BEAGLE_genotype_likelihoods(params, 1); if (params.output_as_ldhat_unphased == true) vf->output_as_LDhat_unphased(params); if (params.output_as_ldhat_phased == true) vf->output_as_LDhat_phased(params); if (params.output_as_ldhelmet == true) vf->output_as_LDhelmet(params); if (params.output_singletons == true) vf->output_singletons(params); if (params.output_site_pi == true) vf->output_per_site_nucleotide_diversity(params); if (params.pi_window_size > 0) vf->output_windowed_nucleotide_diversity(params); if (params.output_Tajima_D_bin_size > 0) vf->output_Tajima_D(params); if (params.output_TsTv_bin_size > 0) vf->output_TsTv(params); if (params.output_TsTv_by_count) vf->output_TsTv_by_count(params); if (params.output_TsTv_by_qual) vf->output_TsTv_by_quality(params); if (params.output_TsTv_summary) vf->output_TsTv_summary(params); if (params.recode == true) vf->print(params); if (params.recode_bcf == true) vf->print_bcf(params); if (params.output_filter_summary == true) vf->output_FILTER_summary(params); if (params.output_kept_sites == true) vf->output_kept_sites(params); if (params.output_removed_sites == true) vf->output_removed_sites(params); if (params.output_LROH == true) vf->output_LROH(params); if (params.output_relatedness_Yang == true) vf->output_indv_relatedness_Yang(params); if (params.output_relatedness_Manichaikul == true) vf->output_indv_relatedness_Manichaikul(params); if (params.output_PCA == 
true) vf->output_PCA(params); if (params.output_N_PCA_SNP_loadings > 0) vf->output_PCA_SNP_loadings(params); if (params.mendel_ped_file != "") vf->output_mendel_inconsistencies(params); if (params.fst_window_size <= 0 && params.weir_fst_populations.size() > 0) vf->output_weir_and_cockerham_fst(params); else if (params.weir_fst_populations.size() > 0) vf->output_windowed_weir_and_cockerham_fst(params); if (params.output_indel_hist == true) vf->output_indel_hist(params); LOG.printLOG("After filtering, kept " + header::int2str(vf->N_kept_sites()) + " out of a possible " + header::int2str(vf->N_total_sites()) + " Sites\n"); if (vf->N_total_sites() <= 0) LOG.warning("File does not contain any sites"); else if (vf->N_kept_sites() <= 0) LOG.warning("No data left for analysis!"); time(&end); double running_time = difftime(end,start); LOG.printLOG("Run Time = " + output_log::dbl2str_fixed(running_time, 2) + " seconds\n"); LOG.close(); delete vf; return 0; } vcftools-0.1.15/src/cpp/vcftools.h000066400000000000000000000003431307140004000170030ustar00rootroot00000000000000/* * vcftools.h */ #ifndef VCFTOOLS_H_ #define VCFTOOLS_H_ #include "output_log.h" #include "parameters.h" #include "bcf_file.h" #include "vcf_file.h" #include "variant_file.h" #include "header.h" #endif /* VCFTOOLS_H_ */ vcftools-0.1.15/src/perl/000077500000000000000000000000001307140004000151535ustar00rootroot00000000000000vcftools-0.1.15/src/perl/ChangeLog000066400000000000000000000107631307140004000167340ustar00rootroot000000000000002012-05-02 15:53 petr.danecek@sanger * vcf-consensus * vcf-indel-stats * vcf-compare: handle spaces in file names 2012-02-23 09:45 petr.danecek@sanger * vcf-merge: redundant ALT alleles are no longer removed by default but only with -t. * vcf-annotate: - set the FILTER column, remove and annotate in one go (e.g. 
ID) - support of genotype columns in user filters - new --fill-type option 2012-01-23 10:41 petr.danecek@sanger * Notable changes since the last release: - fill-fs: new script for annotating VCFs with flanking sequence - fill-ref-md5: new script for annotating VCFs with 'reference' and 'contig' tags recommended by VCFv4.1 - vcf-annotate: now also removes annotations and can apply user-defined filters - vcf-compare: changed output format, more stats reported and plots the results - vcf-fix-newlines: new script for fixing newline representation - vcf-phased-join: new script for joining pre-phased VCFs - vcf-query: significant speed up for some type of queries - vcf-sort: chromosomal ordering (1,2,10,MT,X rather than 1,10,2,MT,X) with new versions of unix sort - Vcf.pm: new set of API methods for faster access - some of the tools now work also with remote files 2011-04-04 14:00 petr.danecek@sanger * VCFtools now support VCFv4.1 * fill-ref-md5: New tool backfilling sequence MD5s into VCF header * Renamed merge-vcf, compare-vcf etc. to consistent naming vcf-merge, vcf-compare * vcf-merge: Now merging also GL and other Number=[AG] tags * vcf-compare: Comparing indel haplotypes 2011-02-21 12:31 petr.danecek@sanger * vcf-stats: new -s option to speed up parsing when stats computed for selected samples only * merge-vcf: allow to merge arbitrary chunks; -c option now deprecated, use -r instead * compare-vcf: change in output format and more detailed comparison 2011-02-17 17:36 petr.danecek@sanger * vcf-stats: allow querying stats of individual samples 2011-02-16 12:07 petr.danecek@sanger * vcf-stats: major revision * vcf-annotate: more filtering options 2011-02-04 14:43 petr * merge-vcf: if possible, calculate AC,AN even for sites without genotypes 2011-02-03 15:04 petr * merge-vcf: fixed a bug introduced by the previous fix. 2011-02-02 21:02 petr * merge-vcf: fixed a bug in merging indel ALTs. Only VCFs without samples were affected. 
2011-01-28 15:38 petr * vcf-subset: new option for printing rows with calls private to the subset group 2011-01-24 13:38 petr * Vcf.pm: uppercase floating point number expressions (such as 1.0382033E-6) now pass validation 2011-01-20 08:28 petr * vcf-concat: print header also for empty VCFs with the -s option 2011-01-04 08:59 petr * vcf-isec, vcf-sort, Vcf.pm: replaced "zcat" by "gunzip -c" 2010-12-22 14:18 petr * vcf-annotate: New --SnpCluster option * Vcf.pm: new sub add_filter() 2010-12-15 13:44 petr * vcf-isec: By default output records from all files with unique positions (duplicate records from the same file still should be printed). With the -o switch, only positions from the left-most file will be printed. 2010-12-09 14:48 petr * query-vcf: Output 'True' for Flag tags when present and . when absent * vcf-annotate: Fix: the command line eats quotes when they are not escaped 2010-12-08 12:06 petr * Vcf.pm: throw an error when tabix fails. * query-vcf: enable streaming of files when region is not specified. 2010-12-02 11:53 petr * Vcf.pm: allow ALT alleles which are not present in samples * vcf-isec: Multiple files can be created simultaneously with all possible isec combinations. Suitable for Venn Diagram analysis. * merge-vcf: Do not remove ALT alleles if no samples are present * merge-vcf: Do FILTER merging more intelligently. * merge-vcf: Join the QUAL column: use average value weighted by the number of samples. 2010-11-28 08:34 petr * vcf-concat: Partial sort * vcf-validator: Added -u option * VcfStats.pm: dump_counts 2010-11-27 13:04 petr * vcf-subset: Filter variants by type 2010-11-26 09:08 petr * vcf-annotate: Added possibility to read header descriptions from a file 2010-11-24 13:25 petr * Fix in Vcf.pm:fill_ref_alt_mapping. VCF files processed with merge-vcf were affected when containing IDs in the ALT column. 2010-11-23 13:12 petr * Major revamp of Vcf.pm to allow better inheritance. Problems likely. 
vcftools-0.1.15/src/perl/FaSlice.pm000066400000000000000000000151271307140004000170250ustar00rootroot00000000000000# Author: petr.danecek@sanger # =head1 NAME FaSlice.pm. Module for cached access to fasta sequences, employs samtools faidx. =head1 SYNOPSIS use FaSlice; my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345); $fa->get_slice(1,12345,54321); =cut package FaSlice; use strict; use warnings; use Carp; =head2 new About : Creates new FaSlice object. Usage : my $fa = FaSlice->new(file=>'ref.fa'); Args : file .. the fasta file oob .. out-of-bounds requests: one of 'throw' (throws), 'N' (fills the missing bases with Ns), or '' (returns empty string, default) size .. size of the cached chunk read by samtools faidx (1_000_000) =cut sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( !$$self{file} ) { $self->throw("Missing the parameter file\n"); } $$self{chr} = undef; $$self{from} = undef; $$self{to} = undef; if ( !$$self{size} ) { $$self{size}=1_000_000; } $$self{ncache_missed} = 0; $$self{nqueries} = 0; if ( !exists($$self{oob}) ) { $$self{oob}=''; } if ( $$self{oob} ne '' && $$self{oob} ne 'throw' && $$self{oob} ne 'N' ) { $self->throw("The value of oob not recognised: [$$self{oob}]"); } $self->chromosome_naming($$self{file}); return $self; } sub throw { my ($self,@msg) = @_; confess(@msg); } sub cmd { my ($self,$cmd) = @_; my @out = `$cmd`; if ( $? ) { my @msg = (); push @msg, qq[The command "$cmd" returned non-zero status $?]; if ( $! ) { push @msg, ": $!\n"; } else { push @msg, ".\n"; } if ( scalar @out ) { push @msg, @out; } $self->throw(@msg); } return (@out); } # Read the first file of the fasta file and make a guess: Are all chromosomes # names as 'chr1','chr2',etc or just '1','2',...? # Future TODO: more robust chromosome name mapping? 
sub chromosome_naming { my ($self,$fa_file) = @_; open(my $fh,'<',"$fa_file.fai") or $self->throw("$fa_file.fai: $!"); my $line=<$fh>; if ( !($line=~/^(chr)?\S+\t/) ) { chomp($line); $self->throw("FIXME: the sequence names not in '>(chr)?\\S+' format [$line] ... $fa_file.fai\n"); } close($fh); $$self{chr_naming} = defined $1 ? $1 : ''; } sub cache_chr_lengths { my ($self) = @_; if ( exists($$self{chr_lengths}) ) { return; } open(my $fh,'<',"$$self{file}.fai") or $self->throw("$$self{file}.fai: $!"); while (my $line=<$fh>) { my @items = split(/\t/,$line); my $chr = $$self{chr_naming}.$items[0]; $$self{chr_lengths}{$chr} = $items[1]; } close($fh) or $self->throw("close $$self{file}.fai"); } sub read_chunk { my ($self,$chr,$pos) = @_; $$self{chr} = $chr; $chr =~ s/^chr//; $chr = $$self{chr_naming}.$chr; if ( exists($$self{chr_lengths}) && (!exists($$self{chr_lengths}{$chr}) or $$self{chr_lengths}{$chr} < $pos ) ) { $$self{to} = $$self{from} - 1; $$self{chunk} = ''; return; } my $to = $pos + $$self{size}; my $cmd = "samtools faidx \Q$$self{file}\E \Q$chr:$pos-$to\E"; my @out = $self->cmd($cmd) or $self->throw("$cmd: $!"); my $line = shift(@out); if ( !($line=~/^>\Q$chr\E:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); } $$self{from} = $1; my $chunk = ''; while ($line=shift(@out)) { chomp($line); $chunk .= $line; } $$self{to} = $$self{from} + length($chunk) - 1; $$self{chunk} = $chunk; $self->cache_chr_lengths(); return; } =head2 get_base About : Retrieves base at the given chromosome and position Usage : my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345); Args : chromosome 1-based coordinate =cut sub get_base { my ($self,$chr,$pos) = @_; if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} ) { $self->read_chunk($chr,$pos); } $$self{nqueries}++; my $idx = $pos - $$self{from}; if ( $$self{from}>$$self{to} ) { if ( $$self{oob} eq '' ) { return ''; } elsif ( $$self{oob} eq 'N' ) { return 'N'; } $self->throw("No such site 
$chr:$pos in $$self{file}\n"); } return substr($$self{chunk},$idx,1); } =head2 get_slice About : Retrieves region Usage : my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345,54321); Args : chromosome 1-based coordinate =cut sub get_slice { my ($self,$chr,$from,$to) = @_; if ( $to-$from >= $$self{size} ) { $$self{size} = $to-$from+1; } if ( $from>$to ) { $self->throw("Expected $from>$to\n"); } if ( !$$self{chr} || $chr ne $$self{chr} || $from<$$self{from} || $to>$$self{to} ) { $self->read_chunk($chr,$from); } $$self{nqueries}++; if ( $$self{from}>$$self{to} || $$self{from}>$from || $$self{to}<$to ) { if ( $$self{oob} eq 'throw' ) { $self->throw("The region out of bounds $chr:$from-$to in $$self{file}\n"); } elsif ( $$self{oob} eq '' ) { return ''; } if ( $$self{from}>$$self{to} ) { return 'N' x ($to-$from+1); } if ( $$self{from}>$to ) { $self->throw("FIXME: this shouldn't happen $chr:$from-$to .. $$self{from},$$self{to} .. $$self{file}"); } my $lfill = ''; my $rfill = ''; if ( $$self{from}>$from ) { $lfill = 'N' x ($$self{from}-$from); $from=$$self{from}; } if ( $$self{to}<$to ) { $rfill = 'N' x ($to-$$self{to}); $to=$$self{to}; } return $lfill . substr($$self{chunk},$from-$$self{from},$to-$from+1) . $rfill; } return substr($$self{chunk},$from-$$self{from},$to-$from+1); } # http://www.illumina.com/documents/products/technotes/technote_topbot.pdf sub illumina_alleles_TOP_to_ref { my ($self,$a1,$a2,$chr,$pos,$ref) = @_; my %map = (A=>'T', C=>'G', G=>'C', T=>'A'); my %top = ( A=>{A=>-2,C=> 1,G=> 1,T=>-1}, C=>{A=> 1,C=>-2,G=>-1,T=> 0}, G=>{A=> 1,C=>-1,G=>-2,T=> 0}, T=>{A=>-1,C=> 0,G=> 0,T=>-2} ); my $stat = $top{$a1}{$a2}; if ( $stat==-2 ) { $self->throw("Expected two different bases, got $a1 and $a2.\n"); } if ( $stat==-1 ) { # Now we should do the sequence walking to see if the reference is TOP or BOT, # but we do not this in ill-to-vcf: C/G would become G/C and A/T would become T/A. 
return ($a1,$a2); } if ( $stat==0 ) { $self->throw("Expected Illumina TOP, got $a1 and $a2.\n"); } if ( $ref eq $a1 or $ref eq $a2 ) { return ($a1,$a2); } return ($map{$a1},$map{$a2}); } 1; vcftools-0.1.15/src/perl/Makefile.am000066400000000000000000000007241307140004000172120ustar00rootroot00000000000000dist_bin_SCRIPTS = \ fill-aa \ fill-an-ac \ fill-fs \ fill-ref-md5 \ vcf-annotate \ vcf-compare \ vcf-concat \ vcf-consensus \ vcf-contrast \ vcf-convert \ vcf-fix-newlines \ vcf-fix-ploidy \ vcf-indel-stats \ vcf-isec \ vcf-merge \ vcf-phased-join \ vcf-query \ vcf-shuffle-cols \ vcf-sort \ vcf-stats \ vcf-subset \ vcf-to-tab \ vcf-tstv \ vcf-validator pmdir = $(exec_prefix)/$(PMDIR) dist_pm_DATA = \ FaSlice.pm \ Vcf.pm \ VcfStats.pm vcftools-0.1.15/src/perl/Vcf.pm000066400000000000000000003221041307140004000162310ustar00rootroot00000000000000package Vcf; our $VERSION = 'v0.1.14-12-gcdb80b8'; # http://vcftools.sourceforge.net/specs.html # http://samtools.github.io/hts-specs/ # # Authors: petr.danecek@sanger # for VCF v3.2, v3.3, v4.0, v4.1, v4.2 # =head1 NAME Vcf.pm. Module for validation, parsing and creating VCF files. Supported versions: 3.2, 3.3, 4.0, 4.1, 4.2 =head1 SYNOPSIS From the command line: perl -MVcf -e validate example.vcf perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf From a script: use Vcf; my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000'); $vcf->parse_header(); # Do some simple parsing. Most thorough but slowest way how to get the data. while (my $x=$vcf->next_data_hash()) { for my $gt (keys %{$$x{gtypes}}) { my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt); print "\t$gt: $al1$sep$al2\n"; } print "\n"; } # This will split the fields and print a list of CHR:POS while (my $x=$vcf->next_data_array()) { print "$$x[0]:$$x[1]\n"; } # This will return the lines as they were read, including the newline at the end while (my $x=$vcf->next_line()) { print $x; } # Only the columns NA00001, NA00002 and NA00003 will be printed. 
my @columns = qw(NA00001 NA00002 NA00003); print $vcf->format_header(\@columns); while (my $x=$vcf->next_data_array()) { # this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0 print $vcf->format_line($x,\@columns); } $vcf->close(); =cut use strict; use warnings; use Carp; use Exporter; use Data::Dumper; use POSIX ":sys_wait_h"; use vars qw/@ISA @EXPORT/; @ISA = qw/Exporter/; @EXPORT = qw/validate validate_v32/; =head2 validate About : Validates the VCF file. Usage : perl -MVcf -e validate example.vcf.gz # (from the command line) validate('example.vcf.gz'); # (from a script) validate(\*STDIN); Args : File name or file handle. When no argument given, the first command line argument is interpreted as the file name. =cut sub validate { my ($fh) = @_; if ( !$fh && @ARGV ) { $fh = $ARGV[0]; } my $vcf; if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); } else { $vcf = Vcf->new(fh=>\*STDIN); } $vcf->run_validation(); } =head2 validate_v32 About : Same as validate, but assumes v3.2 VCF version. Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line) Args : File name or file handle. When no argument given, the first command line argument is interpreted as the file name. =cut sub validate_v32 { my ($fh) = @_; if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; } my %params = ( version=>'3.2' ); my $vcf; if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); } else { $vcf = Vcf->new(%params, fh=>\*STDIN); } $vcf->run_validation(); } =head2 new About : Creates new VCF reader/writer. Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2'); Args : fh .. Open file handle. If neither file nor fh is given, open in write mode. file .. The file name. If neither file nor fh is given, open in write mode. region .. Optional region to parse (requires tabix indexed VCF file) silent .. Unless set to 0, warning messages may be printed. strict .. 
Unless set to 0, the reader will die when the file violates the specification.
          version .. If not given, '4.0' is assumed. The header information overrides this setting.

=cut

sub new
{
    my ($class,@args) = @_;
    my $self = {@args};
    bless $self, ref($class) || $class;

    $$self{silent}    = 0 unless exists($$self{silent});
    $$self{strict}    = 0 unless exists($$self{strict});
    $$self{buffer}    = [];       # buffer stores the lines in the reverse order
    $$self{columns}   = undef;    # column names
    $$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory});
    $$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols});
    $$self{recalc_ac_an} = 1;
    $$self{has_header} = 0;
    $$self{default_version} = '4.2';
    # Known parser subclasses; _set_version() picks one of these.
    $$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1 Vcf4_2) ];
    # Optional line-length cap can come from the environment.
    if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} }
    $$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 0 : 1;

    my %open_args = ();
    if ( exists($$self{region}) )
    {
        $open_args{region}=$$self{region};
        # Region queries via tabix print the header by default.
        if ( !exists($$self{print_header}) ) { $$self{print_header}=1; }
    }
    if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; }
    return $self->_open(%open_args);
}

# Die with a stack backtrace (strict failure path).
sub throw
{
    my ($self,@msg) = @_;
    confess @msg,"\n";
}

# Warn, unless silenced; escalate to throw() in strict mode.
sub warn
{
    my ($self,@msg) = @_;
    if ( $$self{silent} ) { return; }
    if ( $$self{strict} ) { $self->throw(@msg); }
    warn @msg;
}

# Open the input (plain file, gzip, or tabix-backed region query) and
# dispatch to the version-specific subclass based on the first line.
sub _open
{
    my ($self,%args) = @_;

    if ( !exists($$self{fh}) && !exists($$self{file}) )
    {
        # Write mode, the version must be supplied by the user
        return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version});
    }

    # Open the file unless filehandle is provided
    if ( !exists($$self{fh}) )
    {
        if ( !defined $$self{file} ) { $self->throw("Undefined value passed to Vcf->new(file=>undef)."); }
        my $cmd = "<$$self{file}";

        my $tabix_args = '';
        if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; }
        $tabix_args .= qq['$$self{file}'];
        if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= qq[ '$args{region}']; }

        # NOTE(review): the commands below interpolate the file name and region
        # into a shell pipeline (2-arg open); file names containing quotes or
        # shell metacharacters could break or be executed — confirm inputs are
        # trusted, or consider list-form open '-|'.
        if ( -e $$self{file} && $$self{file}=~/\.gz/i )
        {
            if ( exists($args{region}) && defined($args{region}) )
            {
                $cmd = "tabix $tabix_args |";
            }
            else { $cmd = "gunzip -c '$$self{file}' |"; }
        }
        elsif ( $$self{file}=~m{^(?:http|ftp)://} )
        {
            if ( !exists($args{region}) ) { $tabix_args .= ' .'; }
            $cmd = "tabix $tabix_args |";
        }
        open($$self{fh},$cmd) or $self->throw("$cmd: $!");
    }

    # Set the correct VCF version, but only when called for the first time
    my $vcf = $self;
    if ( !$$self{_version_set} )
    {
        my $first_line = $self->next_line();
        $vcf = $self->_set_version($first_line);
        # Push the peeked line back so the caller still sees it.
        $self->_unread_line($first_line);
    }
    return $vcf;
}

=head2 open

About   : (Re)Open file. No need to call this explicitly unless reading from a
          different region is requested.
Usage   : $vcf->open();                  # Read from the start
          $vcf->open(region=>'1:12345-92345');
Args    : region .. Supported only for tabix indexed files

=cut

sub open
{
    my ($self,%args) = @_;
    $self->close();
    $self->_open(%args);
}

=head2 close

About   : Close the filehandle
Usage   : $vcf->close();
Args    : none
Returns : close exit status

=cut

sub close
{
    my ($self) = @_;
    if ( !$$self{fh} ) { return; }
    my $ret = close($$self{fh});
    delete($$self{fh});
    # Discard any lines buffered by _unread_line().
    $$self{buffer} = [];
    return $ret;
}

=head2 next_line

About   : Reads next VCF line.
Usage   : my $vcf = Vcf->new();
          my $x = $vcf->next_line();
Args    : none

=cut

sub next_line
{
    my ($self) = @_;

    # Serve pushed-back lines first (see _unread_line).
    return shift(@{$$self{buffer}}) if @{$$self{buffer}};

    # Fast path: no line-length cap configured.
    return readline($$self{fh}) unless exists($$self{max_line_len});

    # Capped mode: skip over-long data lines, always keep header (#) lines.
    while (1)
    {
        my $ln = readline($$self{fh});
        return $ln unless defined $ln;

        my $len = length($ln);
        return $ln if $len<=$$self{max_line_len} or $ln=~/^#/;

        if ( !($ln=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $ln"); }
        $self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n");
    }
}

# Push a line back so the next call to next_line() returns it again.
sub _unread_line
{
    my ($self,$ln) = @_;
    unshift @{$$self{buffer}}, $ln;
    return;
}

=head2 next_data_array

About   : Reads next VCF line and splits it into an array. The last element is chomped.
Usage   : my $vcf = Vcf->new();
          $vcf->parse_header();
          my $x = $vcf->next_data_array();
Args    : Optional line to parse

=cut

sub next_data_array
{
    my ($self,$ln) = @_;
    $ln = $self->next_line() unless $ln;
    return undef unless $ln;

    # Already split by the caller — pass through unchanged.
    return $ln if ref($ln) eq 'ARRAY';

    my @fields = split(/\t/,$ln);
    if ( @fields<8 )
    {
        $ln=~s/\n/\\n/g;
        $self->throw("Could not parse the line, wrong number of columns: [$ln]");
    }
    chomp($fields[-1]);
    return \@fields;
}

=head2 set_samples

About   : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit.
Usage   : my $vcf = Vcf->new();
          $vcf->set_samples(include=>['NA0001']); # Parse only this sample. When the array is empty, all samples will be excluded.
          $vcf->set_samples(exclude=>['NA0003']); # Parse all but this sample. When the array is empty, all samples will be included.
my $x = $vcf->next_data_hash();
Args    : include=>[] and/or exclude=>[] lists of sample names

=cut

sub set_samples
{
    my ($self,%args) = @_;

    if ( exists($args{include}) )
    {
        # Start with every sample switched off, then enable the listed ones.
        for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; }
        for my $sample (@{$args{include}})
        {
            if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
            my $idx = $$self{has_column}{$sample} - 1;
            $$self{samples_to_parse}[$idx] = 1;
        }
    }

    if ( exists($args{exclude}) )
    {
        # Start with every sample switched on, then disable the listed ones.
        for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; }
        for my $sample (@{$args{exclude}})
        {
            if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); }
            my $idx = $$self{has_column}{$sample} - 1;
            $$self{samples_to_parse}[$idx] = 0;
        }
    }
}

# Determine the file's VCF version (from the ##fileformat line or an explicit
# version string) and rebless into the matching version-specific subclass.
# Runs only once; subsequent calls are no-ops.
sub _set_version
{
    my ($self,$version_line) = @_;

    if ( $$self{_version_set} ) { return $self; }
    $$self{_version_set} = 1;

    $$self{version} = $$self{default_version};
    if ( $version_line )
    {
        if ( $version_line=~/^(\d+(?:\.\d+)?)$/ )
        {
            # A bare version number was passed instead of a header line.
            $$self{version} = $1;
            undef $version_line;
        }
        elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) )
        {
            chomp($version_line);
            $self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n");
            undef $version_line;
        }
        else
        {
            $$self{version} = $1;
        }
    }

    my $reader;
    if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); }
    elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); }
    elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); }
    elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); }
    elsif ( $$self{version} eq '4.2' ) { $reader=Vcf4_2->new(%$self); }
    # NOTE(review): '4.3' is dispatched here but Vcf4_3 is absent from the
    # $$self{versions} list initialised in new() — confirm which is right.
    elsif ( $$self{version} eq '4.3' ) { $reader=Vcf4_3->new(%$self); }
    else
    {
        $self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]);
        $$self{version} = '4.2';
        $reader = Vcf4_2->new(%$self);
    }
    $self = $reader;

    # When changing version, change also the fileformat header line
    if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' )
    {
        shift(@{$$self{header_lines}});
    }

    return $self;
}


#---------------------------------------

package VcfReader;
use base qw(Vcf);

use strict;
use warnings;
use Carp;
use Data::Dumper;

sub new
{
    my ($class,@args) = @_;
    my $self = {@args};
    bless $self, ref($class) || $class;
    return $self;
}

=head2 next_data_hash

About   : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
Usage   : my $vcf = Vcf->new();
          $vcf->parse_header();
          my $x = $vcf->next_data_hash();

          # Or having a VCF data line $line
          my $x = $vcf->next_data_hash($line);
Args    : Optional line to parse.

=cut

sub next_data_hash
{
    my ($self,$line) = @_;
    if ( !$line ) { $line = $self->next_line(); }
    if ( !$line ) { return undef; }
    my @items;
    if ( ref($line) eq 'ARRAY' ) { @items = @$line; }
    else { @items = split(/\t/,$line); }
    chomp($items[-1]);

    my $cols = $$self{columns};
    if ( !$cols )
    {
        # No header seen yet — synthesize column names from the column count.
        $self->_fake_column_names(scalar @items - 9);
        $cols = $$self{columns};
    }

    # Check the number of columns
    if ( scalar @items != scalar @$cols )
    {
        if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); }

        my $c = substr($line,0,1);
        if ( $c eq '#' )
        {
            if ( !$$self{header_parsed} ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); }
            else { $self->throw("Multiple header blocks (^#) not allowed.\n"); }
        }

        # Tolerate trailing TABs (empty columns at the end of the line).
        if ( $items[-1] eq '' )
        {
            my $nremoved = 0;
            while ( $items[-1] eq '' ) { pop(@items); $nremoved++; }
            if ( $nremoved && !$$self{trailing_tabs_warned} )
            {
                $self->warn("Broken VCF: empty columns (trailing TABs) starting at $items[0]:$items[1].\n");
                $$self{trailing_tabs_warned} = 1;
            }
        }
        if ( scalar @items != scalar @$cols )
        {
            # Last resort: maybe the line was space-separated instead of tabs.
            my @test = split(/\s+/,$line);
            if ( scalar @test == scalar @$cols ) { $self->warn("(Were spaces used instead of tabs?)\n\n"); }
            else
            {
                # NOTE(review): "fields%s" with the value "in $$self{file}" renders
                # as "fieldsin ..." — a space appears to be missing; confirm.
                $self->throw(sprintf "Wrong number of fields%s; expected %d, got %d. The offending line was:\n[%s]\n\n",
                    exists($$self{file}) ? "in $$self{file}" : '', scalar @$cols, scalar @items, join("\t",@items));
            }
            @items = @test;
        }
    }
    my %out;

    # Mandatory fields
    $out{CHROM}  = $items[0];
    $out{POS}    = $items[1];
    $out{ID}     = $items[2];
    $out{REF}    = $items[3];
    $out{ALT}    = [ split(/,/,$items[4]) ];
    $out{QUAL}   = $items[5];
    $out{FILTER} = [ split(/;/,$items[6]) ];

    # INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2
    if ( defined $items[7] )
    {
        my %hash;
        for my $info (split(/;/,$items[7]))
        {
            my ($key,$val) = split(/=/,$info);
            if ( !defined $key )
            {
                $self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n");
                next;
            }
            if ( defined $val )
            {
                $hash{$key} = $val;
            }
            elsif ( exists($$self{header}{INFO}{$key}) )
            {
                # Flag tag declared in the header — use its declared default.
                $hash{$key} = $$self{header}{INFO}{$key}{default};
            }
            else
            {
                $hash{$key} = undef;
            }
        }
        $out{INFO} = \%hash;
    }

    # The FORMAT field may not be present. GT:GQ:DP:HQ
    my $format;
    if ( $$cols[8] || $items[8] )
    {
        $format = $out{FORMAT} = [ split(/:/,$items[8]) ];
        if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); }
    }

    # Genotype fields
    my %gtypes;
    my $check_nformat = $$self{drop_trailings} ? 0 : 1;
    for (my $icol=9; $icol<@items; $icol++)
    {
        if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; }
        # Honour set_samples() — skip samples switched off for parsing.
        if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; }

        my @fields = split(/:/, $items[$icol]);
        if ( $check_nformat && @fields != @$format )
        {
            $self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] ("
                .scalar @fields." vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n");
        }
        my %hash;
        for (my $ifield=0; $ifield<@fields; $ifield++)
        {
            $hash{$$format[$ifield]} = $fields[$ifield];
        }
        $gtypes{$$cols[$icol]} = \%hash;
    }
    $out{gtypes} = \%gtypes;

    return \%out;
}

=head2 parse_header

About   : Reads (and stores) the VCF header.
Usage   : my $vcf = Vcf->new(); $vcf->parse_header();
Args    : silent .. do not warn about duplicate header lines

=cut

sub parse_header
{
    my ($self,%args) = @_;

    # First come the header lines prefixed by ##
    while ($self->_next_header_line(%args)) { ; }

    # Now comes the column names line prefixed by #
    $self->_read_column_names();

    $$self{header_parsed} = 1;
}

=head2 _next_header_line

About   : Stores the header lines and meta information, such as fields types, etc.
Args    : silent .. do not warn about duplicate column names

=cut

sub _next_header_line
{
    my ($self,%args) = @_;
    my $line = $self->next_line();
    if ( !defined $line ) { return undef; }
    if ( substr($line,0,2) ne '##' )
    {
        # Not a meta line — put it back for the column-name reader.
        $self->_unread_line($line);
        return undef;
    }

    my $rec = $self->parse_header_line($line);
    if ( $rec ) { $self->add_header_line($rec,%args); }

    return $rec;
}

=head2 get_header_line

Usage   : $vcf->get_header_line(key=>'INFO', ID=>'AC')
          $vcf->get_header_line(key=>'FILTER', ID=>'q10')
          $vcf->get_header_line(key=>'reference')
          $vcf->get_header_line(key=>'contig',ID=>'20')
Args    : Header line filter as in the example above
Returns : List ref of header line hashes matching the filter

=cut

sub get_header_line
{
    my ($self,%filter) = @_;

    my $key = $filter{key};
    delete($filter{key});

    my $id = $filter{ID};

    my @out;
    while (my ($hline_key,$hline_hash) = each %{$$self{header}})
    {
        if ( $key ne $hline_key ) { next; }

        if ( defined $id )
        {
            if ( !exists($$hline_hash{$id}) ) { next; }
            $hline_hash = $$hline_hash{$id};
        }

        my $match = 1;
        # NOTE(review): leaving this inner each() via last leaves %filter's
        # iterator mid-way; the next outer iteration resumes from there and can
        # skip filter keys — confirm, consider keys() instead.
        while (my ($fkey,$fval) = each %filter)
        {
            if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval )
            {
                $match=0;
                last;
            }
        }
        if ( $match ) { push @out,$hline_hash }
    }
    return \@out;
}

=head2 add_header_line

Usage   : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
          $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
Args    : Header line hash as in the example above
          Hash with additional parameters [optional]
              silent .. do not warn about existing header keys
              append .. append timestamp to the name of the new one
Returns :

=cut

sub add_header_line
{
    my ($self,$rec,%args) = @_;

    if ( !%args ) { $args{silent}=0; }

    my $key = $$rec{key};
    if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); }

    if ( exists($$rec{Type}) )
    {
        # Attach a default value and a type handler based on the declared Type.
        if ( !exists($$rec{default}) )
        {
            my $type = $$rec{Type};
            if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; }
            else { $$rec{default}=$$self{defaults}{default}; }
        }
        if ( !exists($$rec{handler}) )
        {
            my $type = $$rec{Type};
            if ( !exists($$self{handlers}{$type}) )
            {
                # Unknown types degrade to String rather than failing outright.
                $self->warn("Unknown type [$type]\n");
                $type = 'String';
                $$rec{Type} = $type;
            }
            if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; }
            else { $self->throw("Unknown type [$type].\n"); }
        }
    }

    if ( exists($$rec{ID}) )
    {
        # Structured lines (INFO/FORMAT/FILTER/contig) are keyed by ID;
        # a duplicate ID replaces the previous record.
        my $id = $$rec{ID};
        if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); }
        $$self{header}{$key}{$id} = $rec;
        push @{$$self{header_lines}}, $rec;
        return;
    }

    if ( $args{append} )
    {
        # Disambiguate repeated keys with a date stamp and a running index.
        my @tm = gmtime(time);
        $key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4]+1,$tm[3];
        my $i = 1;
        while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; }
        $key = $key.'.'.$i;
        $$rec{key} = $key;
    }

    if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); }

    push @{$$self{header}{$key}}, $rec;
    # The fileformat line must stay first in the output header.
    if ( $$rec{key} eq 'fileformat' ) { unshift @{$$self{header_lines}}, $rec; }
    else { push @{$$self{header_lines}}, $rec; }
}

# Return the stored record equal to $rec (all of $rec's fields match), the
# constant 1 for fileformat, or 0 when no such line exists.
sub _header_line_exists
{
    my ($self,$key,$rec) = @_;
    if ( !exists($$self{header}{$key}) ) { return 0; }
    if ( $key eq 'fileformat' ) { return 1; }
    for my $hrec (@{$$self{header}{$key}})
    {
        my $differ = 0;
        for my $item (keys %$rec)
        {
            if ( !exists($$hrec{$item}) ) { $differ=1; last; }
            if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; }
        }
        if ( !$differ ) { return $hrec; }
    }
    return 0;
}

=head2 remove_header_line

Usage   : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
Args    :
Returns :

=cut

sub remove_header_line
{
    my ($self,%args) = @_;
    my $key = $args{key};
    my %to_be_removed;
    for (my $i=0; $i<@{$$self{header_lines}}; $i++)
    {
        my $line = $$self{header_lines}[$i];
        if ( $$line{key} ne $key ) { next; }
        if ( exists($args{ID}) )
        {
            # Remove a single ID-keyed record.
            if ( $args{ID} ne $$line{ID} ) { next; }
            delete($$self{header}{$key}{$args{ID}});
            splice(@{$$self{header_lines}},$i--,1);
        }
        elsif ( scalar keys %args==1 && exists($$self{header}{$key}) )
        {
            # Only the key given — remove every line with that key.
            splice(@{$$self{header_lines}},$i--,1);
            $to_be_removed{$key} = 1;
        }
        else
        {
            # Remove the one record whose fields all match %args.
            my $to_be_removed = $self->_header_line_exists($key,\%args);
            if ( !$to_be_removed ) { next; }
            for (my $j=0; $j<@{$$self{header}{$key}}; $j++)
            {
                if ( $$self{header}{$key}[$j] eq $to_be_removed )
                {
                    splice(@{$$self{header}{$key}},$j,1);
                    last;
                }
            }
            splice(@{$$self{header_lines}},$i--,1);
        }
    }
    for my $key (keys %to_be_removed) { delete($$self{header}{$key}); }
}

=head2 parse_header_line

Usage   : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
          $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
Args    :
Returns :

=cut

sub parse_header_line
{
    my ($self,$line) = @_;

    chomp($line);
    $line =~ s/^##//;

    if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; }
    my $key   = $1;
    my $value = $';

    # A trailing quoted string is the Description.
    my $desc;
    if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; }

    if ( !$desc ) { return { key=>$key, value=>$value }; }

    if ( $key eq 'INFO' or $key eq 'FORMAT' )
    {
        my ($id,$number,$type,@rest) = split(/,\s*/,$value);
        if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
        return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc };
    }
    if ( $key eq 'FILTER' )
    {
        my ($id,@rest) = split(/,\s*/,$value);
        if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); }
        return { key=>$key, ID=>$id, Description=>$desc };
    }
    $self->throw("Could not parse the header line: $line\n");
}

=head2 _read_column_names

About   : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
          The indexes go from 1.
Usage   : $vcf->_read_column_names();
Args    : none

=cut

sub _read_column_names
{
    my ($self) = @_;
    my $line = $self->next_line();
    if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); }
    $$self{column_line} = $line;

    my @cols = split(/\t/, substr($line,1));
    chomp($cols[-1]);

    my $nremoved = 0;
    for (my $i=0; $i<@cols; $i++)
    {
        if ( !($cols[$i]=~/^\s*$/) ) { next; }
        $self->warn(sprintf "Empty fields in the header line, the column %d is empty, removing.\n",$i+1+$nremoved);
        $nremoved++;
        # NOTE(review): after splice the loop index is not decremented, so two
        # consecutive empty columns would leave the second one in — confirm.
        splice(@cols,$i,1);
    }

    my $ncols = scalar @cols;
    if ( $ncols == 1 )
    {
        # If there is only one name, it can be space-separated instead of tab separated
        @cols  = split(/\s+/, $cols[0]);
        $ncols = scalar @cols;
        chomp($line);
        if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; }
        $self->warn("The column names not tab-separated? [$line]\n");
    }

    my $fields  = $$self{mandatory};
    my $nfields = scalar @$fields;

    # Check the names of the mandatory columns
    if ( $ncols < $nfields )
    {
        chomp($line);
        $self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n");
        return;
    }

    for (my $i=0; $i<$ncols; $i++)
    {
        if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ )
        {
            $self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n");
            $cols[$i] =~ s/^\s+//;
            $cols[$i] =~ s/\s+$//;
        }
        if ( $i<$nfields && $cols[$i] ne $$fields[$i] )
        {
            # Mis-named mandatory columns are forcibly corrected.
            $self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n");
            $cols[$i] = $$fields[$i];
        }
        # 1-based index by column name.
        $$self{has_column}{$cols[$i]} = $i+1;
    }
    $$self{columns} = \@cols;
    return;
}

=head2 _fake_column_names

About   : When no header is present, fake column names as the default mandatory ones + numbers
Args    : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present

=cut

sub _fake_column_names
{
    my ($self,$ncols) = @_;

    $$self{columns} = [ @{$$self{mandatory}} ];
    if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; }
    # Sample columns are simply named 1..$ncols.
    for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; }
}

=head2 format_header

About   : Returns the header.
Usage   : print $vcf->format_header();
Args    : The columns to include on output [optional]

=cut

sub format_header
{
    my ($self,$columns) = @_;

    my $hdr = '';
    $hdr .= $self->format_header_line($_) for @{$$self{header_lines}};

    # When writing a new VCF via the API the caller may not have added the
    # fileformat line explicitly — prepend it in that case.
    if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' )
    {
        $hdr = "##fileformat=VCFv$$self{version}\n" . $hdr;
    }

    # Without parsed column names there is no #CHROM line to print.
    return $hdr unless $$self{columns};

    my @cols;
    if ( $columns )
    {
        # The nine fixed columns plus only the requested, known sample columns.
        @cols = @{$$self{columns}}[0..8];
        push @cols, grep { exists($$self{has_column}{$_}) } @$columns;
    }
    else
    {
        @cols = @{$$self{columns}};
    }
    return $hdr . "#" . join("\t",@cols) . "\n";
}

=head2 format_line

About   : Returns the formatted VCF data line.
Usage   : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
          $x = $vcf->next_data_array(); print $vcf->format_line($x);
Args 1  : The columns or hash in the format returned by next_data_hash or next_data_array.
     2  : The columns to include [optional]

=cut

sub format_line
{
    my ($self,$record,$columns) = @_;
    my $type = ref($record);
    # Hash records go through the full (re)formatter, array records are
    # simply re-joined.
    return $self->_format_line_hash($record,$columns) if $type eq 'HASH';
    return join("\t",@$record)."\n" if $type eq 'ARRAY';
    $self->throw("FIXME: todo .. " .$type. "\n");
}

=head2 recalc_ac_an

About   : Control if the AC and AN values should be updated.
Usage   : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
Args 1  : 0 .. never recalculate
          1 .. recalculate if present
          2 ..
recalculate if present and add if missing

=cut

sub recalc_ac_an
{
    my ($self,$value) = @_;
    # Silently ignore anything other than the three documented values.
    if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
    return;
}

=head2 get_tag_index

Usage   : my $idx = $vcf->get_tag_index('GT:PL:DP:SP:GQ','PL',':');
Arg 1   : Field
     2  : The tag to find
     3  : Tag separator
Returns : Index of the tag or -1 when not found

=cut

sub get_tag_index
{
    my ($self,$field,$tag,$sep) = @_;
    if ( !defined $field ) { return -1; }
    my $idx = 0;
    my $prev_isep = 0;
    my $isep = 0;
    while (1)
    {
        # NOTE(review): the $sep argument is ignored; ':' is hard-coded — confirm.
        $isep = index($field,':',$prev_isep);
        if ( $isep==-1 )
        {
            # Last (or only) field — compare the remainder.
            if ( substr($field,$prev_isep) eq $tag ) { return $idx; }
            else { return -1; }
        }
        if ( substr($field,$prev_isep,$isep-$prev_isep) eq $tag ) { return $idx; }
        $prev_isep = $isep+1;
        $idx++;
    }
}

=head2 remove_field

Usage   : my $field = $vcf->remove_field('GT:PL:DP:SP:GQ',1,':');    # returns 'GT:DP:SP:GQ'
Arg 1   : Field
     2  : The index of the field to remove
     3  : Field separator
Returns : Modified string

=cut

sub remove_field
{
    my ($self,$string,$idx,$sep) = @_;
    my $isep = -1;
    my $prev_isep = 0;
    my $itag = 0;
    # Walk to the $idx-th separator.
    while ($itag!=$idx)
    {
        $isep = index($string,$sep,$prev_isep);
        # The index may be out of range, VCFv4.1 allows omitting empty fields
        if ( $isep==-1 ) { return $string; }
        $prev_isep = $isep+1;
        $itag++;
    }
    my $out;
    if ( $isep>=0 ) { $out = substr($string,0,$isep); }
    my $ito=index($string,$sep,$isep+1);
    if ( $ito!=-1 )
    {
        # NOTE(review): re-joins with hard-coded ':' rather than $sep — confirm.
        if ( $isep>=0 ) { $out .= ':' }
        $out .= substr($string,$ito+1);
    }
    if ( !defined $out ) { return '.'; }
    return $out;
}

=head2 replace_field

Usage   : my $col = $vcf->replace_field('GT:PL:DP:SP:GQ','XX',1,':');    # returns 'GT:XX:DP:SP:GQ'
Arg 1   : Field
     2  : Replacement
     3  : 0-based index of the field to replace
     4  : Field separator
Returns : Modified string

=cut

sub replace_field
{
    my ($self,$string,$repl,$idx,$sep) = @_;
    my $isep = -1;
    my $prev_isep = 0;
    my $itag = 0;
    while ($itag!=$idx)
    {
        $isep = index($string,$sep,$prev_isep);
        if ( $isep==-1 )
        {
            # the out of range index may be OK, VCFv4.1 allows omitting empty fields
            if ( $$self{version}<4.1 ) { $self->throw("The index out of range ($string,$repl,$idx,$sep), missing fields not supported in VCFv$$self{version}."); }
            # Pad with empty fields up to the requested index, then append.
            while ( $itag<$idx ) { $string .= ':'; $itag++; }
            $string .= $repl;
            return $string;
        }
        $prev_isep = $isep+1;
        $itag++;
    }
    my $out;
    if ( $isep>=0 ) { $out = substr($string,0,$isep+1); }
    my $ito = index($string,$sep,$isep+1);
    if ( $ito==-1 )
    {
        $out .= $repl;
    }
    else
    {
        $out .= $repl;
        $out .= ':';
        $out .= substr($string,$ito+1);
    }
    if ( !defined $out ) { return '.'; }
    return $out;
}

=head2 get_info_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','AF');    # returns 0.5
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','DB');    # returns 1
          $af = $vcf->get_info_field('DP=14;AF=0.5;DB','XY');    # returns undef
Arg 1   : The VCF line broken into an array
     2  : The tag to retrieve
Returns : undef when tag is not present, the tag value if present, or 1 if flag is present

=cut

sub get_info_field
{
    my ($self,$info,$tag) = @_;
    my $info_len = length($info);
    my $tag_len = length($tag);
    my $idx = 0;
    while (1)
    {
        $idx = index($info,$tag,$idx);
        if ( $idx==-1 ) { return undef; }
        # A real tag starts at position 0 or right after a ';'.
        if ( $idx!=0 && substr($info,$idx-1,1) ne ';' ) { $idx += $tag_len; next; }
        # Tag at the very end of INFO — a flag.
        if ( $tag_len+$idx >= $info_len ) { return 1; }
        my $follows = substr($info,$idx+$tag_len,1);
        if ( $follows eq ';' ) { return 1; }
        $idx += $tag_len;
        # Partial match (e.g. tag "AF" inside "AFX") — keep searching.
        if ( $follows ne '=' ) { next; }
        $idx++;
        my $to = index($info,';',$idx);
        return $to==-1 ? substr($info,$idx) : substr($info,$idx,$to-$idx);
    }
}

=head2 get_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          my $idx = $vcf->get_tag_index($$line[8],'PL',':');
          my $pl  = $vcf->get_field($$line[9],$idx) unless $idx==-1;
Arg 1   : The VCF line broken into an array
     2  : The index of the field to retrieve
     3  : The delimiter [Default is ':']
Returns : The tag value

=cut

sub get_field
{
    my ($self,$col,$idx,$delim) = @_;
    if ( !defined $delim ) { $delim=':'; }

    my $isep = 0;
    my $prev_isep = 0;
    my $itag = 0;
    while (1)
    {
        $isep = index($col,$delim,$prev_isep);
        if ( $itag==$idx ) { last; }
        # This is valid, missing fields can be ommited from genotype columns
        if ( $isep==-1 ) { return '.'; }
        $prev_isep = $isep+1;
        $itag++;
    }
    return $isep<0 ? substr($col,$prev_isep) : substr($col,$prev_isep,$isep-$prev_isep);
}

=head2 get_sample_field

Usage   : my $line  = $vcf->next_line;
          my @items = split(/\t/,$line);
          my $idx = $vcf->get_tag_index($$line[8],'PL',':');
          my $pls = $vcf->get_sample_field(\@items,$idx) unless $idx==-1;
Arg 1   : The VCF line broken into an array
     2  : The index of the field to retrieve
Returns : Array of values

=cut

sub get_sample_field
{
    my ($self,$cols,$idx) = @_;
    my @out;
    my $n = @$cols;
    for (my $icol=9; $icol<$n; $icol++)
    {
        my $col = $$cols[$icol];
        my $isep = 0;
        my $prev_isep = 0;
        my $itag = 0;
        while (1)
        {
            $isep = index($col,':',$prev_isep);
            if ( $itag==$idx ) { last; }
            # This is valid, missing fields can be ommited from genotype columns
            # NOTE(review): returns the scalar '.' although the sub otherwise
            # returns an array ref — confirm callers handle both shapes.
            if ( $isep==-1 ) { return '.'; }
            $prev_isep = $isep+1;
            $itag++;
        }
        my $val = $isep<0 ? substr($col,$prev_isep) : substr($col,$prev_isep,$isep-$prev_isep);
        push @out,$val;
    }
    return \@out;
}

=head2 split_mandatory

About   : Faster alternative to regexs, extract the mandatory columns
Usage   : my $line=$vcf->next_line;
          my @cols = $vcf->split_mandatory($line);
Arg     :
Returns : Pointer to the array of values

=cut

sub split_mandatory
{
    my ($self,$line) = @_;
    my @out;
    my $prev = 0;
    for (my $i=0; $i<7; $i++)
    {
        my $isep = index($line,"\t",$prev);
        if ( $isep==-1 ) { $self->throw("Could not parse the mandatory columns: $line"); }
        push @out, substr($line,$prev,$isep-$prev);
        $prev = $isep+1;
    }
    my $isep = index($line,"\t",$prev);
    if ( $isep!=-1 )
    {
        # NOTE(review): "$isep-$prev-1" drops the last character of the 8th
        # column — looks like an off-by-one versus the loop above; confirm.
        push @out, substr($line,$prev,$isep-$prev-1);
    }
    else
    {
        push @out, substr($line,$prev);
    }
    return \@out;
}

=head2 split_gt

About   : Faster alternative to regexs
Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0/1'); # returns (0,0,1)
Arg     : Diploid genotype to split into alleles
Returns : Array of values

=cut

sub split_gt
{
    my ($self,$gt) = @_;
    my @als;
    my $iprev = 0;
    while (1)
    {
        # Split on whichever of '/' or '|' comes first.
        my $isep = index($gt,'/',$iprev);
        my $jsep = index($gt,'|',$iprev);
        if ( $isep<0 or ($jsep>=0 && $jsep<$isep) ) { $isep = $jsep; }
        push @als, $isep<0 ? substr($gt,$iprev) : substr($gt,$iprev,$isep-$iprev);
        if ( $isep<0 ) { return (@als); }
        $iprev = $isep+1;
    }
    return (@als);
}

=head2 split_by

About   : Generalization of split_gt
Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0|1',qw(| /)); # returns (0,0,1)
Arg     : Diploid genotype to split into alleles
Returns : Array of values

=cut

sub split_by
{
    my ($self,$str,@seps) = @_;
    my @out;
    my $iprev = 0;
    while (1)
    {
        # Find the nearest occurrence of any separator.
        my $min;
        for my $sep (@seps)
        {
            my $idx = index($str,$sep,$iprev);
            if ( $idx==-1 ) { next; }
            if ( !defined $min or $idx<$min ) { $min=$idx }
        }
        push @out, defined $min ? substr($str,$iprev,$min-$iprev) : substr($str,$iprev);
        if ( !defined $min ) { return @out; }
        $iprev = $min+1;
    }
    return (@out);
}

=head2 decode_genotype

About   : Faster alternative to regexs
Usage   : my $gt = $vcf->decode_genotype('G',['A','C'],'0/0'); # returns 'G/G'
Arg 1   : Ref allele
     2  : Alt alleles
     3  : The genotype to decode
Returns : Decoded GT string

=cut

sub decode_genotype
{
    my ($self,$ref,$alt,$gt) = @_;
    my $isep = 0;
    my $out;
    while (1)
    {
        my $i = index($gt,'/',$isep);
        my $j = index($gt,'|',$isep);
        if ( $i==-1 && $j==-1 )
        {
            # Last allele index — translate and finish.
            my $idx = substr($gt,$isep);
            if ( $idx eq '.' ) { $out .= $idx; }
            else
            {
                if ( $idx>@$alt ) { $self->throw("The genotype index $idx in $gt is out of bounds: ", join(',',@$alt)); }
                $out .= $idx==0 ? $ref : $$alt[$idx-1];
            }
            return $out;
        }
        # Use whichever separator comes first; keep it verbatim in the output.
        if ( $i!=-1 && $j!=-1 && $i>$j ) { $i=$j; }
        elsif ( $i==-1 ) { $i=$j }

        my $idx = substr($gt,$isep,$i-$isep);
        if ( $idx eq '.' ) { $out .= $idx; }
        else
        {
            if ( $idx>@$alt ) { $self->throw("The genotype index $idx in $gt out of bounds: ", join(',',@$alt)); }
            $out .= $idx==0 ? $ref : $$alt[$idx-1];
        }
        $out .= substr($gt,$i,1);
        $isep = $i+1;
    }
}

# Render a next_data_hash()-style record back into a tab-separated VCF line,
# optionally restricted to the given sample columns, recomputing AN/AC per
# the recalc_ac_an setting.
sub _format_line_hash
{
    my ($self,$record,$columns) = @_;

    if ( !$$self{columns} )
    {
        my $ngtypes = scalar keys %{$$record{gtypes}};
        if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
        $self->_fake_column_names($ngtypes);
    }
    my $cols = $$self{columns};

    # CHROM  POS  ID  REF
    my $out;
    $out .= $$record{CHROM} . "\t";
    $out .= $$record{POS} . "\t";
    $out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
    $out .= $$record{REF} . "\t";

    # ALT
    $out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');

    # QUAL
    $out .= "\t". $$record{QUAL};

    # FILTER
    $out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');

    # Collect the gtypes of interest
    my $gtypes;
    if ( $columns )
    {
        # Select only those gtypes keys with a corresponding key in columns.
        for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
    }
    else
    {
        $gtypes = $$record{gtypes};
    }

    # INFO
    # .. calculate NS, AN and AC, but only if recalc_ac_an is set
    my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
    my @info;
    while (my ($key,$value) = each %{$$record{INFO}})
    {
        if ( $$self{recalc_ac_an}>0 )
        {
            # Existing AN/AC are dropped here and regenerated below.
            if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
            if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
        }
        if ( defined $value )
        {
            push @info, "$key=$value";
        }
        elsif ( $key ne '.' )
        {
            push @info, $key;
        }
    }
    if ( $needs_an_ac )
    {
        my $nalt = scalar @{$$record{ALT}};
        if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
        my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
        push @info, "AN=$an","AC=$ac";
    }
    if ( !@info ) { push @info, '.'; }
    $out .= "\t". join(';', sort @info);

    # FORMAT, the column is not required, it may not be present when there are no genotypes
    if ( exists($$cols[8]) && defined $$record{FORMAT} ) { $out .= "\t". join(':',@{$$record{FORMAT}}); }

    # Genotypes: output all columns or only a selection?
    my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
    my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
    for my $col (@col_names)
    {
        my $gt = $$gtypes{$col};
        # Walk FORMAT right-to-left so trailing missing fields can be dropped
        # when drop_trailings is set; GT (index 0) can never be dropped.
        my $can_drop = $$self{drop_trailings};
        my @gtype;
        for (my $i=$nformat-1; $i>=0; $i--)
        {
            my $field = $$record{FORMAT}[$i];
            if ( $i==0 ) { $can_drop=0; }

            if ( exists($$gt{$field}) )
            {
                $can_drop = 0;
                if ( ref($$gt{$field}) eq 'HASH' )
                {
                    # Special treatment for Number=[AG] tags
                    unshift @gtype, $self->format_AGtag($record,$col,$$gt{$field},$field);
                }
                else
                {
                    unshift @gtype,$$gt{$field};
                }
            }
            elsif ( $can_drop ) { next; }
            elsif ( exists($$self{header}{FORMAT}{$field}{default}) )
            {
                unshift @gtype,$$self{header}{FORMAT}{$field}{default};
                $can_drop=0;
            }
            else
            {
                $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]);
            }
        }
        $out .= "\t" . join(':',@gtype);
    }
    $out .= "\n";
    return $out;
}

# Count AN (called alleles) and per-ALT AC across the given genotypes.
# Returns ($an, 'ac1,ac2,...', \@ac). Only the first two alleles of each GT
# are considered (diploid assumption via a single split).
sub calc_an_ac
{
    my ($self,$gtypes,$nalleles) = @_;
    my $sep_re = $$self{regex_gtsep};
    my ($an,%ac_counts);
    if ( defined $nalleles )
    {
        # Pre-seed so absent ALT alleles report AC=0.
        for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
    }
    $an = 0;
    for my $gt (keys %$gtypes)
    {
        my $value = $$gtypes{$gt}{GT};
        if ( !defined $value ) { next; } # GT may not be present
        my ($al1,$al2) = split($sep_re,$value);
        if ( defined($al1) && $al1 ne '.' )
        {
            $an++;
            if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
        }
        if ( defined($al2) && $al2 ne '.' )
        {
            $an++;
            if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
        }
    }
    my @ac;
    for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
    if ( !@ac ) { @ac = ('0'); }
    return ($an,join(',',@ac),\@ac);
}

# Shared checks: ALT alleles must be unique and must not equal REF.
# Returns an error string or undef when OK.
sub _validate_alt_field
{
    my ($self,$values,$ref) = @_;

    for (my $i=0; $i<@$values; $i++)
    {
        for (my $j=0; $j<$i; $j++)
        {
            if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
        }
        if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
    }
    return undef;
}

=head2 validate_alt_field

Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
Args    : The ALT arrayref
Returns : Error message in case of an error.

=cut

sub validate_alt_field
{
    my ($self,$values,$ref) = @_;

    if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }

    my $ret = $self->_validate_alt_field($values,$ref);
    if ( $ret ) { return $ret; }

    # Base-class syntax: single base, I<seq> insertion, D<len> deletion.
    # Version subclasses override this with their own allele grammar.
    my @err;
    for my $item (@$values)
    {
        if ( $item=~/^[ACTGN]$/ ) { next; }
        elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
        elsif ( $item=~/^D\d+$/ ) { next; }

        push @err, $item;
    }
    if ( !@err ) { return undef; }
    return 'Could not parse the allele(s) [' .join(',',@err). ']';
}

=head2 event_type

Usage   : my $x = $vcf->next_data_hash();
          my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
          for my $allele (@$alleles)
          {
              my ($type,$len,$ht) = $vcf->event_type($x,$allele);
          }
          or
          my ($type,$len,$ht) = $vcf->event_type($ref,$al);
Args    : VCF data line parsed by next_data_hash or the reference allele
        : Allele
Returns : 's' for SNP and number of SNPs in the record
          'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
          'r' identical to the reference, length 0
          'o' for other (complex events) and the number of affected bases
          'b' breakend
          'u' unknown

=cut

sub event_type
{
    my ($self,$rec,$allele) = @_;

    my $ref = $rec;
    if ( ref($rec) eq 'HASH' )
    {
        # Results are memoised on the record itself.
        if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
        $ref = $$rec{REF};
    }

    my ($type,$len,$ht);
    if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
    elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
    elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
    elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
    elsif ( length($allele)==length($ref) && $allele=~/^[ACGTN]+$/ && $ref=~/^[ACGTN]+$/ ) { $len = length($allele); $type='s'; $ht=$allele; }
    else
    {
        my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
        my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
        $self->throw("Eh?: $chr:$pos .. $ref $allele\n");
    }

    if ( ref($rec) eq 'HASH' )
    {
        $$rec{_cached_events}{$allele} = [$type,$len,$ht];
    }
    return ($type,$len,$ht);
}

=head2 has_AGtags

About   : Checks the header for the presence of tags with variable number of fields (Number=A or Number=G, such as GL)
Usage   : $vcf->parse_header(); my $agtags = $vcf->has_AGtags();
Args    : None
Returns : Hash {fmtA=>[tags],fmtG=>[tags],infoA=>[tags],infoG=>[tags]} or undef if none is present

=cut

sub has_AGtags
{
    my ($self) = @_;
    my $out;
    if ( exists($$self{header}{FORMAT}) )
    {
        for my $tag (keys %{$$self{header}{FORMAT}})
        {
            if ( $$self{header}{FORMAT}{$tag}{Number} eq 'A' ) { push @{$$out{fmtA}},$tag; }
            if ( $$self{header}{FORMAT}{$tag}{Number} eq 'G' ) { push @{$$out{fmtG}},$tag; }
        }
    }
    if ( exists($$self{header}{INFO}) )
    {
        for my $tag (keys %{$$self{header}{INFO}})
        {
            if ( $$self{header}{INFO}{$tag}{Number} eq 'A' ) { push @{$$out{infoA}},$tag; }
            if ( $$self{header}{INFO}{$tag}{Number} eq 'G' ) { push @{$$out{infoG}},$tag; }
        }
    }
    if ( defined $out )
    {
        # Ensure all four slots exist even when some categories are empty.
        for my $key (qw(fmtA fmtG infoA infoG))
        {
            if ( !exists($$out{$key}) ) { $$out{$key}=[] }
        }
    }
    return $out;
}

=head2 parse_AGtags

About   : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
Usage   : my $x = $vcf->next_data_hash();
          my $values = $vcf->parse_AGtags($x);
Args    : VCF data line parsed by next_data_hash
        : Mapping between ALT representations based on different REFs [optional]
        : New REF [optional]
Returns : Hash {Allele=>Value}

=cut

sub parse_AGtags
{
    my ($self,$rec,$ref_alt_map,$new_ref) = @_;

    if ( !exists($$rec{gtypes}) ) { return; }

    my (@atags,@gtags);
    for my $fmt (@{$$rec{FORMAT}})
    {
        # These have been listed explicitly for proper merging of v4.0 VCFs
        if ( $$self{fix_v40_AGtags} )
        {
            if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
            if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
        }
        if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
        if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A'
) { push @atags,$fmt; next; } if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; } } my $missing = $$self{defaults}{default}; if ( @atags ) { # Parse Number=A tags my $alts; if ( defined $ref_alt_map ) { $alts = []; for my $alt (@{$$rec{ALT}}) { if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); } push @$alts, $$ref_alt_map{$new_ref}{$alt}; } } else { $alts = $$rec{ALT}; } for my $tag (@atags) { for my $sample (values %{$$rec{gtypes}}) { if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; } my @values = split(/,/,$$sample{$tag}); $$sample{$tag} = {}; for (my $i=0; $i<@values; $i++) { $$sample{$tag}{$$alts[$i]} = $values[$i]; } } } } if ( @gtags ) { # Parse Number=G tags my @alleles; if ( defined $ref_alt_map ) { push @alleles, $new_ref; for my $alt (@{$$rec{ALT}}) { if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); } push @alleles, $$ref_alt_map{$new_ref}{$alt}; } } else { @alleles = ($$rec{REF},@{$$rec{ALT}}); if ( @alleles==2 && $alleles[1] eq '.' ) { pop(@alleles); } } my @gtypes; for (my $i=0; $i<@alleles; $i++) { for (my $j=0; $j<=$i; $j++) { push @{$gtypes[1]}, $alleles[$i].'/'.$alleles[$j]; } push @{$gtypes[0]}, $alleles[$i]; } for my $tag (@gtags) { for my $name (keys %{$$rec{gtypes}}) { my $sample = $$rec{gtypes}{$name}; if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; } my @values = split(/,/,$$sample{$tag}); my $ploidy = $self->guess_ploidy(scalar @alleles, scalar @values) - 1; if ( $ploidy<0 ) { my $nals = scalar @alleles; my $nvals = scalar @values; my $ndip = $nals*($nals+1)/2; $self->throw( "Wrong number of values in $name/$tag at $$rec{CHROM}:$$rec{POS} .. nAlleles=$nals, nValues=$nvals.\n". 
"Expected $ndip values for diploid genotypes or $nals for haploid genotypes.\n"); } if ( $ploidy>1 ) { $self->throw("Sorry, not ready for ploidy bigger than 2\n"); } if ( $ploidy!=1 ) { $$rec{_cached_ploidy}{$name} = $ploidy; } $$sample{$tag} = {}; for (my $i=0; $i<@values; $i++) { $$sample{$tag}{$gtypes[$ploidy][$i]} = $values[$i]; } } } } } =head2 format_AGtag About : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) Usage : Args : : : Returns : =cut sub format_AGtag { my ($self,$record,$sample,$tag_data,$tag) = @_; # The FORMAT field is checked only once and the results are cached. if ( !exists($$record{_atags}) ) { $$record{_atags} = {}; # Check if there are any A,G tags for my $fmt (@{$$record{FORMAT}}) { # These have been listed explicitly for proper merging of v4.0 VCFs if ( $$self{fix_v40_AGtags} ) { if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; } if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; } } if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; } if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; } if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; } } } my @out; if ( exists($$record{_atags}{$tag}) ) { for my $alt (@{$$record{ALT}}) { push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default}; } } if ( exists($$record{_gtags}{$tag}) ) { my $gtypes = $$record{_gtypes}; my $gtypes2 = $$record{_gtypes2}; if ( !defined $gtypes ) { $gtypes = []; $gtypes2 = []; my @alleles = ( $$record{REF}, @{$$record{ALT}} ); for (my $i=0; $i<@alleles; $i++) { for (my $j=0; $j<=$i; $j++) { push @{$$gtypes[1]}, $alleles[$i].'/'.$alleles[$j]; push @{$$gtypes2[1]}, $alleles[$j].'/'.$alleles[$i]; } push @{$$gtypes[0]}, $alleles[$i]; } $$record{_gtypes} = $gtypes; $$record{_gtypes2} = $gtypes2; } my $ploidy = exists($$record{_cached_ploidy}{$sample}) ? 
$$record{_cached_ploidy}{$sample} : 1; for (my $i=0; $i<@{$$gtypes[$ploidy]}; $i++) { my $gt = $$gtypes[$ploidy][$i]; if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$ploidy][$i]; } push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default}; } } return join(',',@out); } =head2 parse_alleles About : Deprecated, use parse_haplotype instead. Usage : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001'); Args : VCF data line parsed by next_data_hash : The genotype column name Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string. =cut sub parse_alleles { my ($self,$rec,$column) = @_; if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); } my $gtype = $$rec{gtypes}{$column}{GT}; if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); } my $al1 = $1; my $sep = $2; my $al2 = $3; if ( !$al1 ) { $al1 = $$rec{REF}; } elsif ( $al1 ne '.' ) { if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); } $al1 = $$rec{ALT}[$al1-1]; } if ( !defined $al2 or $al2 eq '' ) { $sep = ''; $al2 = ''; } else { if ( !$al2 ) { $al2 = $$rec{REF}; } elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; } } return ($al1,$sep,$al2); } =head2 parse_haplotype About : Similar to parse_alleles, supports also multiploid VCFs. Usage : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); Args : VCF data line parsed by next_data_hash : The genotype column name Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values can be cashed and must be therefore considered read only! 
=cut sub parse_haplotype { my ($self,$rec,$column) = @_; if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); } if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); } my $gtype = $$rec{gtypes}{$column}{GT}; if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); } my @alleles = (); my @seps = (); my $is_phased = 0; my $is_empty = 1; my $buf = $gtype; while ($buf ne '') { if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); } $buf = $'; if ( $1 eq '.' ) { push @alleles,'.'; } else { $is_empty = 0; if ( $1 eq '0' ) { push @alleles,$$rec{REF}; } elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; } else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); } } if ( $2 ) { if ( $2 eq '|' ) { $is_phased=1; } push @seps,$2; } } $$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty]; return (@{$$rec{_cached_haplotypes}{$gtype}}); } =head2 format_haplotype Usage : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps); =cut sub format_haplotype { my ($self,$alleles,$seps) = @_; if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); } my $out = $$alleles[0]; for (my $i=1; $i<@$alleles; $i++) { $out .= $$seps[$i-1]; $out .= $$alleles[$i]; } return $out; } =head2 format_genotype_strings Usage : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 }; $vcf->format_genotype_strings($x); print $vcf->format_line($x); Args 1 : VCF data line in the format as if parsed by next_data_hash with alleles written as letters. 2 : Optionally, a subset of columns can be supplied. See also format_line. 
Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles numbers starting from 1. If the key $$vcf{trim_redundant_ALTs} is set, ALT alleles not appearing in any of the sample column will be removed. =cut sub format_genotype_strings { my ($self,$rec,$columns) = @_; if ( !exists($$rec{gtypes}) ) { return; } my $ref = $$rec{REF}; my $nalts = 0; my %alts = (); if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; } for my $key (@$columns) { my $gtype = $$rec{gtypes}{$key}{GT}; my $buf = $gtype; my $out = ''; while ($buf ne '') { $buf=~m{^([^/|]+)([/|]?)}; $buf = $'; my $al = $1; my $sep = $2; if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; } else { if ( $al=~/^\d+$/ ) { if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); } $al = $$rec{ALT}[$al-1]; } if ( exists($alts{$al}) ) { $al = $alts{$al} } elsif ( $al ne '.' ) { $alts{$al} = ++$nalts; $al = $nalts; } } $out .= $al; if ( $sep ) { $out .= $sep; } } $$rec{gtypes}{$key}{GT} = $out; } if ( !$$self{trim_redundant_ALTs} && exists($$rec{ALT}) && @{$$rec{ALT}} ) { for my $alt (@{$$rec{ALT}}) { if ( !exists($alts{$alt}) ) { $alts{$alt} = ++$nalts; } } } $$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ]; } sub fill_ref_alt_mapping { my ($self,$map) = @_; my $new_ref; for my $ref (keys %$map) { $new_ref = $ref; if ( $ref ne $new_ref ) { $self->warn("The reference prefixes do not agree: $ref vs $new_ref\n"); return undef; } for my $alt (keys %{$$map{$ref}}) { $$map{$ref}{$alt} = $alt; } } $$map{$new_ref}{$new_ref} = $new_ref; return $new_ref; } =head2 format_header_line Usage : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}) Args : Returns : =cut sub format_header_line { my ($self,$rec) = @_; my $line = "##$$rec{key}"; $line .= "=$$rec{value}" unless !exists($$rec{value}); $line .= "=$$rec{ID}" unless !exists($$rec{ID}); $line .= ",$$rec{Number}" unless 
!exists($$rec{Number}); $line .= ",$$rec{Type}" unless !exists($$rec{Type}); $line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description}); $line .= "\n"; return $line; } =head2 remove_columns Usage : my $rec=$vcf->next_data_hash(); $vcf->remove_columns($rec,remove=>['NA001','NA0002']); Args : VCF hash pointer : list of columns to remove or a lookup hash with column names to keep (remove=>[] or keep=>{}) Returns : =cut sub remove_columns { my ($self,$rec,%args) = @_; if ( ref($rec) ne 'HASH' ) { $self->throw("TODO: rec for array"); } if ( exists($args{keep}) ) { for my $col (keys %{$$rec{gtypes}}) { if ( !exists($args{keep}{$col}) ) { delete($$rec{gtypes}{$col}); } } } if ( exists($args{remove}) ) { for my $col (@{$args{remove}}) { if ( exists($$rec{gtypes}{$col}) ) { delete($$rec{gtypes}{$col}); } } } } =head2 add_columns Usage : $vcf->add_columns('NA001','NA0002'); Args : Returns : =cut sub add_columns { my ($self,@columns) = @_; if ( !$$self{columns} ) { # The columns should be initialized de novo. Figure out if the @columns contain also the mandatory # columns and if FORMAT should be present (it can be absent when there is no genotype column present). 
my $has_other = 0; for my $col (@columns) { if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; } } $$self{columns} = [ @{$$self{mandatory}} ]; if ( $has_other ) { push @{$$self{columns}},'FORMAT'; } for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; } } my $ncols = @{$$self{columns}}; for my $col (@columns) { if ( $$self{has_column}{$col} ) { next; } $ncols++; push @{$$self{columns}}, $col; } } =head2 add_format_field Usage : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x); Args : The record obtained by next_data_hash : The field name Returns : =cut sub add_format_field { my ($self,$rec,$field) = @_; if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; } for my $key (@{$$rec{FORMAT}}) { if ( $key eq $field ) { return; } # already there } push @{$$rec{FORMAT}}, $field; } =head2 remove_format_field Usage : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x); Args : The record obtained by next_data_hash : The field name Returns : =cut sub remove_format_field { my ($self,$rec,$field) = @_; if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; } my $i = 0; for my $key (@{$$rec{FORMAT}}) { if ( $key eq $field ) { splice @{$$rec{FORMAT}},$i,1; } $i++; } } =head2 add_info_field Usage : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n"; Args : The record obtained by next_data_array : The INFO field name and value pairs. If value is undef and the key is present in $$x[7], it will be removed. To add fields without a value, use empty string ''. Returns : The formatted INFO. =cut sub add_info_field { my ($self,$info,%fields) = @_; my @out = (); # First handle the existing values, keep everything unless in %fields for my $field (split(/;/,$info)) { my ($key,$value) = split(/=/,$field); if ( $key eq '.' 
) { next; } if ( !exists($fields{$key}) ) { push @out,$field; next; } } # Now add the new values and remove the unwanted ones while (my ($key,$value)=each %fields) { if ( !defined($value) ) { next; } # this one should be removed if ( $value eq '' ) { push @out,$key; } # this one is of the form HM2 in contrast to DP=3 else { push @out,"$key=$value"; } # this is the standard key=value pair } if ( !@out ) { push @out,'.'; } return join(';',@out); } =head2 add_filter Usage : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n"; Args : The record obtained by next_data_array or next_data_hash : The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed. Returns : The formatted filter field. =cut sub add_filter { my ($self,$filter,%filters) = @_; my @out = (); my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter); # First handle the existing filters, keep everything unless in %filters for my $key (@filters) { if ( $key eq '.' or $key eq 'PASS' ) { next; } if ( !exists($filters{$key}) ) { push @out,$key; next; } } # Now add the new filters and remove the unwanted ones while (my ($key,$value)=each %filters) { if ( !$value ) { next; } # this one should be removed push @out,$key; # this one should be added } if ( !@out ) { push @out,'PASS'; } return ref($filter) eq 'ARRAY' ? return \@out : join(';',@out); } =head2 validate_filter_field Usage : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER}); Args : The FILTER arrayref Returns : Error message in case of an error. =cut sub validate_filter_field { my ($self,$values) = @_; if ( @$values == 1 && $$values[0] eq '.' 
) { return undef; } my @errs; my @missing; for my $item (@$values) { if ( $item eq $$self{filter_passed} ) { next; } if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; } if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; } if ( exists($$self{header}{FILTER}{$item}) ) { next; } push @missing, $item; $self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'}); } if ( !@errs && !@missing ) { return undef; } if ( $$self{version}<3.3 ) { return undef; } return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.'; } sub _add_unknown_field { my ($self,$field,$key,$nargs) = @_; $self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'}); } =head2 validate_header About : Version specific header validation code. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header(); Args : =cut sub validate_header { my ($self) = @_; } =head2 validate_line About : Version specific line validation code. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x); Args : =cut sub validate_line { my ($self,$x) = @_; # Is the ID composed of alphanumeric chars if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); } } =head2 validate_info_field Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT}); Args : The INFO hashref Returns : Error message in case of an error. =cut sub validate_info_field { my ($self,$values,$alts) = @_; if ( !defined $values ) { return 'Empty INFO field.'; } # First handle the empty INFO field (.) if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; } # Expected numbers my $ng = -1; my $na = -1; my $nr = -1; if ( $$self{version}>4.0 ) { if ( $$alts[0] eq '.' 
) { $ng=1; $na=1; } else { $na = @$alts; $ng = (1+$na+1)*($na+1)/2; $nr = $na+1; } } my @errs; while (my ($key,$value) = each %$values) { if ( !exists($$self{header}{INFO}{$key}) ) { push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3; my $nargs = defined $value ? -1 : 0; $self->_add_unknown_field('INFO',$key,$nargs); next; } my $type = $$self{header}{INFO}{$key}; my @vals = defined $value ? split(/,/, $value) : (); if ( $$type{Number} eq 'G' ) { if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'A' ) { if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'R' ) { if ( $nr != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $nr, found ".scalar @vals.")"; } } elsif ( $$type{Number}==0 ) { if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; } next; } elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} ) { if ( !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})"; } } if ( !$$type{handler} ) { next; } for my $val (@vals) { my $err = &{$$type{handler}}($self,$val,$$type{default}); if ( $err ) { push @errs, $err; } } } if ( !@errs ) { return undef; } return join(',',@errs); } =head2 validate_gtype_field Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT}); Args : The genotype data hashref The ALT arrayref Returns : Error message in case of an error. 
=cut sub guess_ploidy { my ($self, $nals, $nvals) = @_; if ( $nvals==$nals ) { return 1; } if ( $nvals==binom(1+$nals,2) ) { return 2; } return -1; } sub binom { my ($n, $k) = @_; my $b = 1; if ( $k > $n-$k ) { $k = $n-$k; } if ( $k < 1 ) { return 1; } for (my $i=1; $i<=$k; $i++) { $b *= ($n-$k+$i)/$i; } return $b; } sub validate_gtype_field { my ($self,$data,$alts,$format) = @_; my @errs; my $ploidy = 2; if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; } else { my (@als) = $self->split_by($$data{GT},@{$$self{gt_sep}}); for my $al (@als) { if ( $al eq '.' or $al eq '0' ) { next; } if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integers"; } if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; } } $ploidy = @als; } # Expected numbers my $ng = -1; my $na = -1; my $nr = -1; if ( $$self{version}>4.0 ) { if ( $$alts[0] eq '.' 
) { $ng=1; $na=1; $nr=1; } else { $na = @$alts; $ng = binom($ploidy+$na,$ploidy); $nr = $na+1; } } while (my ($key,$value) = each %$data) { if ( !exists($$self{header}{FORMAT}{$key}) ) { push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3; $self->_add_unknown_field('FORMAT',$key,-1); next; } my $type = $$self{header}{FORMAT}{$key}; my @vals = split(/,/, $value); if ( $$type{Number} eq 'G' ) { if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'A' ) { if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'R' ) { if ( $nr != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $nr, found ".scalar @vals.")"; } } elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} ) { if ( !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})"; } } if ( !$$type{handler} ) { next; } for my $val (@vals) { my $err = &{$$type{handler}}($self,$val,$$type{default}); if ( $err ) { push @errs, $err; } } } if ( !@errs ) { return undef; } return join(',',@errs); } sub validate_ref_field { my ($self,$ref) = @_; if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; } return undef; } sub validate_int { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( $value =~ /^-?\d+$/ ) { return undef; } return "Could not validate the int [$value]"; } sub validate_float { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( $value =~ /^-?\d+(?:\.\d*)$/ ) { return undef; } if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; } if ( $value =~ 
/^-?\d+$/ ) { return undef; } if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; } return "Could not validate the float [$value]"; } sub validate_char { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( length($value)==1) { return undef; } return "Could not validate the char value [$value]"; } =head2 run_validation About : Validates the VCF file. Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz'); Args : File name or file handle. =cut sub run_validation { my ($self) = @_; $self->parse_header(); $self->validate_header(); if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); } elsif ( !exists($$self{header}{fileformat}) ) { $self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]); } elsif ( $$self{header_lines}[0]{key} ne 'fileformat' ) { $self->warn(qq[The "fileformat" not the first line in the header\n]); } if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); } my $default_qual = $$self{defaults}{QUAL}; my $warn_sorted=1; my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1; my ($prev_chrm,$prev_pos); while (my $line=$self->next_data_array()) { for (my $i=0; $i<@$line; $i++) { if (!defined($$line[$i]) or $$line[$i] eq '' ) { my $colname = $i<@{$$self{columns}} ? $$self{columns}[$i] : $i+1; $self->warn("The column $colname is empty at $$line[0]:$$line[1].\n"); } } my $x = $self->next_data_hash($line); $self->validate_line($x); # Is the position numeric? if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); } if ( $warn_duplicates ) { if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} ) { $self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n"); $warn_duplicates = 0; } } # Is the file sorted? 
if ( $warn_sorted ) { if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} ) { $self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n"); $warn_sorted = 0; } $prev_chrm = $$x{CHROM}; $prev_pos = $$x{POS}; } # The reference base: one of A,C,G,T,N, non-empty. my $err = $self->validate_ref_field($$x{REF}); if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); } # The ALT field (alternate non-reference base) $err = $self->validate_alt_field($$x{ALT},$$x{REF}); if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); } # The QUAL field my $ret = $self->validate_float($$x{QUAL},$default_qual); if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); } elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); } # The FILTER field $err = $self->validate_filter_field($$x{FILTER}); if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); } # The INFO field $err = $self->validate_info_field($$x{INFO},$$x{ALT}); if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); } while (my ($gt,$data) = each %{$$x{gtypes}}) { $err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT}); if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); } } if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) ) { my $nalt = scalar @{$$x{ALT}}; if ( $nalt==1 && $$x{ALT}[0] eq '.' ) { $nalt=0; } my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} ) { $self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n"); } if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} ) { $self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n"); } } } } =head2 get_chromosomes About : Get list of chromosomes from the VCF file. 
Must be bgzipped and tabix indexed. Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes(); Args : none =cut sub get_chromosomes { my ($self) = @_; if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); } my (@out) = `tabix -l '$$self{file}'`; if ( $? ) { my @has_tabix = `which tabix`; if ( !@has_tabix ) { $self->throw(qq[The command "tabix" not found, please add it to your PATH\n]); } $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]); } for (my $i=0; $i<@out; $i++) { chomp($out[$i]); } return \@out; } =head2 get_samples About : Get list of samples. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples(); Args : none =cut sub get_samples { my ($self) = @_; my $n = @{$$self{columns}} - 1; return (@{$$self{columns}}[9..$n]); } =head2 get_column About : Convenient way to get data for a sample Usage : my $rec = $vcf->next_data_array(); my $sample_col = $vcf->get_column($rec, 'NA0001'); Args 1 : Array pointer returned by next_data_array 2 : Column/Sample name =cut sub get_column { my ($self,$line,$column) = @_; if ( !exists($$self{has_column}{$column}) ) { $self->throw("No such column: [$column]\n"); } my $idx = $$self{has_column}{$column}; return $$line[$idx-1]; } =head2 get_column_name About : Mapping between zero-based VCF column and its name Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $name = $vcf->get_column_name(1); # returns POS Args : Index of the column (0-based) =cut sub get_column_name { my ($self,$idx) = @_; if ( $idx >= @{$$self{columns}} ) { $self->throw("The index out of bounds\n"); } return $$self{columns}[$idx]; } =head2 get_column_index About : Mapping between VCF column name and its zero-based index Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $name = $vcf->get_column_index('POS'); # returns 1 Args : Name of the column =cut sub get_column_index { my ($self,$column) = @_; if ( !exists($$self{has_column}{$column}) ) { 
$self->throw("No such column: [$column]\n"); } return $$self{has_column}{$column}-1; } #------------------------------------------------ # Version 3.2 specific functions package Vcf3_2; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '3.2', drop_trailings => 1, filter_passed => 0, defaults => { QUAL => '-1', default => '.', Flag => undef, GT => '.', }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$/i, regex_ins => qr/^I[ACGTN]+$/, regex_del => qr/^D\d+$/, regex_gtsep => qr{[\\|/]}, regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$}, regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } #------------------------------------------------ # Version 3.3 specific functions package Vcf3_3; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '3.3', drop_trailings => 0, filter_passed => 0, defaults => { QUAL => '-1', Integer => '-1', Float => '-1', Character => '.', String => '.', Flag => undef, GT => './.', default => '.', }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$/i, regex_ins => qr/^I[ACGTN]+$/, regex_del => qr/^D\d+$/, regex_gtsep => qr{[\\|/]}, regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$}, regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 
0/1 0|1 A/A A|A D4/IACGT gt_sep => [qw(\ | /)], }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } #------------------------------------------------ # Version 4.0 specific functions =head1 VCFv4.0 VCFv4.0 specific functions =cut package Vcf4_0; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '4.0', drop_trailings => 1, filter_passed => 'PASS', defaults => { QUAL => '.', Flag => undef, GT => '.', default => '.', }, reserved => { FILTER => { 0=>1 }, }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i, regex_ins => qr/^[ACGTN]+$/, regex_del => qr/^[ACGTN]+$/, regex_gtsep => qr{[|/]}, # | / regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1 regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0| gt_sep => [qw(| /)], }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } sub Vcf4_0::format_header_line { my ($self,$rec) = @_; my %tmp_rec = ( %$rec ); if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' } my $value; if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' ) { my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output my @items; for my $key (qw(ID Number Type Description), sort keys %tmp_rec) { if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; } my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : ''; push @items, "$key=$quote$tmp_rec{$key}$quote"; $has{$key}=1; } $value = '<' .join(',',@items). 
'>'; } elsif ( $tmp_rec{key} =~ /vcfProcessLog(_.+)*/) { my %has = ( key=>1, handler=>1, default=>1 ); my @items; my @knownKeys = qw(InputVCF InputVCFSource InputVCFVer InputVCFParam); for my $key (qw(InputVCF InputVCFSource InputVCFVer InputVCFParam), sort keys %tmp_rec) { if ( !exists($tmp_rec{$key}) || $has{$key} || !grep(/^$key$/,@knownKeys)) { next; } my $value; if($key eq "InputVCFParam"){ $value=undef; foreach my $ky(keys %{$tmp_rec{$key}}){ if(defined($value)){ $value .= ","; } $value .= $ky."=".$tmp_rec{$key}->{$ky}; } #$value="<".$tmp_rec{$key}.">"; }else{ $value = $tmp_rec{$key}; } push @items, "$key=<$value>"; $has{$key}=1; } $value = '<' .join(',',@items). '>'; } else { $value = $tmp_rec{value}; } my $line = "##$tmp_rec{key}=".$value."\n"; return $line; } =head2 parse_header_line Usage : $vcf->parse_header_line(q[##FORMAT=]) $vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36]) Args : Returns : =cut sub Vcf4_0::parse_header_line { my ($self,$line) = @_; chomp($line); $line =~ s/^##//; if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); } my $key = $1; my $value = $'; if ( !($value=~/^<(.+)>\s*$/) ) { # Simple sanity check for subtle typos if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' ) { $self->throw("Hmm, is this a typo? 
[$key] [$value]"); } return { key=>$key, value=>$value }; } my $rec = { key=>$key }; my $tmp = $1; my ($attr_key,$attr_value,$quoted); while ($tmp ne '') { if ( !defined $attr_key ) { if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; } elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; } else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); } } if( $tmp!~/>,/ && $tmp=~/^(<)(.+,{1}.+)(>)$/) {$tmp=$'; %$attr_value = split(/[,=]/,$2);} if( $tmp=~ m/^<"([^,">]+)">/) {$tmp=$'; $attr_value = $1;} if ( $tmp=~/^[^,\\"]+/) { $attr_value .= $&; $tmp = $'; if($attr_value =~ m/^<.+>$/){ $attr_value =~ s/^$//; } } if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; } if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; } if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ ) { if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ ) { $self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n"); $attr_key =~ s/^\s+//; $attr_key =~ s/\s+$//; $attr_value =~ s/^\s+//; $attr_value =~ s/\s+$//; } if($tmp =~ m/^(,([^=<].)+>),/){ $quoted = 1; $attr_value.= $1; } $$rec{$attr_key} = $attr_value; $tmp = $'; if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; } $attr_key = $attr_value = $quoted = undef; next; } if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; } $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); } if( $key =~ m/vcfProcessLog/) {return $rec;} if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' ) { if ( $key ne 'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); } if ( !exists($$rec{Description}) ) { $self->warn("Missing the Description tag in $line\n"); } } if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' 
instead.\n\t$line\n"); } if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; } return $rec; } sub Vcf4_0::validate_ref_field { my ($self,$ref) = @_; if ( !($ref=~/^[ACGTN]+$/) ) { my $offending = $ref; $offending =~ s/[ACGTN]+//g; return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n"; } return undef; } sub Vcf4_0::validate_alt_field { my ($self,$values,$ref) = @_; if ( @$values == 1 && $$values[0] eq '.' ) { return undef; } my $ret = $self->_validate_alt_field($values,$ref); if ( $ret ) { return $ret; } my $ref_len = length($ref); my $ref1 = substr($ref,0,1); my @err; my $msg = ''; for my $item (@$values) { if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; } if ( $item=~/^<[^<>\s]+>$/ ) { next; } if ( $ref_len==length($item) ) { next; } if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; } } if ( !@err ) { return undef; } return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg; } =head2 fill_ref_alt_mapping About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping from the original haplotypes to the haplotypes based on the new REF. Consider the following example: REF ALT G GA GT G GT GA GT GAA GTC G G my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{''=>1}}; my $new_ref=$vcf->fill_ref_alt_mapping($map); The call returns GTC and $map is now G GA -> GTC GATC GT G -> GTC GC GT GA -> GTC GAC GT GAA -> GTC GAAC GTC G -> GTC G G -> GTC Args : Returns : New REF string and fills the hash with appropriate ALT or undef on error. 
=cut

sub Vcf4_0::fill_ref_alt_mapping
{
    my ($self,$map) = @_;

    # Pass 1: the unified REF is the longest of the original REFs; every REF
    # also maps to itself.
    my $max_len = 0;
    my $new_ref;
    for my $ref (keys %$map)
    {
        my $len = length($ref);
        if ( $max_len<$len ) { $max_len = $len; $new_ref = $ref; }
        $$map{$ref}{$ref} = 1;
    }
    # Pass 2: every original REF must be a prefix of the unified REF; extend
    # each ALT by the bases its REF was missing.
    for my $ref (keys %$map)
    {
        my $rlen = length($ref);
        if ( substr($new_ref,0,$rlen) ne $ref )
        {
            $self->warn("The reference prefixes do not agree: $ref vs $new_ref\n");
            return undef;
        }
        for my $alt (keys %{$$map{$ref}})
        {
            # The second part of the regex is for VCF>4.0, but does no harm for v<=4.0
            # (symbolic <..> alleles and breakends are left untouched).
            if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; }
            my $new = $alt;
            if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); }
            $$map{$ref}{$alt} = $new;
        }
    }
    return $new_ref;
}

=head2 normalize_alleles

    About : Makes REF and ALT alleles more compact if possible (e.g. TA,TAA -> T,TA)
    Usage : my $line = $vcf->next_data_array();
            ($ref,@alts) = $vcf->normalize_alleles($$line[3],$$line[4]);

=cut

sub Vcf4_0::normalize_alleles
{
    my ($self,$ref,$alt) = @_;

    my $rlen = length($ref);
    # Nothing can be trimmed when REF or the whole ALT string is a single base.
    if ( $rlen==1 or length($alt)==1 ) { return ($ref,split(/,/,$alt)); }

    my @als = split(/,/,$alt);
    # Find the longest suffix shared by REF and all ALTs; $i ends one past it.
    my $i = 1;
    my $done = 0;
    while ( $i<$rlen )
    {
        my $r = substr($ref,$rlen-$i,1);
        for my $al (@als)
        {
            my $len = length($al);
            # Stop before trimming an allele down to nothing.
            if ( $i>=$len ) { $done = 1; }
            my $c = substr($al,$len-$i,1);
            if ( $c ne $r ) { $done = 1; last; }
        }
        if ( $done ) { last; }
        $i++;
    }
    if ( $i>1 )
    {
        # Trim $i-1 shared trailing bases from REF and every ALT.
        $i--;
        $ref = substr($ref,0,$rlen-$i);
        for (my $j=0; $j<@als; $j++) { $als[$j] = substr($als[$j],0,length($als[$j])-$i); }
    }
    return ($ref,@als);
}

sub Vcf4_0::normalize_alleles_pos
{
    my ($self,$ref,$alt) = @_;

    # First right-trim via normalize_alleles, then strip the common prefix and
    # report how many leading bases were removed (i.e. the POS offset).
    my @als;
    ($ref,@als) = $self->normalize_alleles($ref,$alt);

    my $rlen = length($ref);
    if ( $rlen==1 ) { return (0,$ref,@als); }

    # Find the longest prefix shared by REF and all ALTs, always leaving at
    # least one base in each allele.
    my $i = 0;
    my $done = 0;
    while ( $i+1<$rlen )
    {
        my $r = substr($ref,$i,1);
        for my $al (@als)
        {
            my $len = length($al);
            if ( $i+1>=$len ) { $done = 1; last; }
            my $c = substr($al,$i,1);
            if ( $c ne $r ) { $done = 1; last; }
        }
        if ( $done ) { last; }
        $i++;
    }
    if ( $i<0 ) { $i = 0; }
    if ( $i>0 )
    {
        # 4-arg substr deletes the shared prefix in place.
        substr($ref,0,$i,'');
        for (my $j=0; $j<@als; $j++) { substr($als[$j],0,$i,''); }
    }
    return ($i,$ref,@als);
}

sub Vcf4_0::event_type
{
    my ($self,$rec,$allele) = @_;

    my $ref = $rec;
    if ( ref($rec) eq 'HASH' )
    {
        # Called with a parsed data line: reuse the per-line cache and the
        # line's own REF.
        if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
        $ref = $$rec{REF};
    }

    # Symbolic alleles such as <DEL> have unknown sequence: type 'u'.
    if ( $allele=~/^<[^>]+>$/ )
    {
        if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
        return ('u',0,$allele);
    }
    # A missing allele counts as reference.
    if ( $allele eq '.' )
    {
        if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
        return ('r',0,$ref);
    }

    my $reflen = length($ref);
    my $len = length($allele);
    my $ht;
    my $type;
    if ( $len==$reflen )
    {
        # This can be a reference, a SNP, or multiple SNPs
        my $mism = 0;
        for (my $i=0; $i<$len; $i++)
        {
            if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
        }
        if ( $mism==0 ) { $type='r'; $len=0; }
        else { $type='s'; $len=$mism; }
    }
    else
    {
        ($len,$ht)=$self->is_indel($ref,$allele);
        if ( $len )
        {
            # Indel
            $type = 'i';
            $allele = $ht;
        }
        else
        {
            # Different lengths but not a clean indel: 'other'.
            $type = 'o';
            $len = $len>$reflen ? $len-1 : $reflen-1;
        }
    }

    if ( ref($rec) eq 'HASH' )
    {
        $$rec{_cached_events}{$allele} = [$type,$len,$allele];
    }
    return ($type,$len,$allele);
}

# The sequences start at the same position, which simplifies things greatly.
# Returns length of the indel (+ insertion, - deletion), the deleted/inserted sequence # and the position of the first base after the shared sequence sub is_indel { my ($self,$seq1,$seq2) = @_; my $len1 = length($seq1); my $len2 = length($seq2); if ( $len1 eq $len2 ) { return (0,'',0); } my ($del,$len,$LEN); if ( $len1<$len2 ) { $len = $len1; $LEN = $len2; $del = 1; } else { $len = $len2; $LEN = $len1; $del = -1; my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp; } my $ileft; for ($ileft=0; $ileft<$len; $ileft++) { if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; } } if ( $ileft==$len ) { return ($del*($LEN-$len), substr($seq2,$ileft), $ileft); } my $iright; for ($iright=0; $iright<$len; $iright++) { if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; } } if ( $iright+$ileft<=$len ) { return (0,'',0); } return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len),$ileft); } #------------------------------------------------ # Version 4.1 specific functions =head1 VCFv4.1 VCFv4.1 specific functions =cut package Vcf4_1; use base qw(Vcf4_0); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '4.1', drop_trailings => 1, filter_passed => 'PASS', defaults => { QUAL => '.', Flag => undef, GT => '.', default => '.', }, reserved => { FILTER => { 0=>1 }, cols => {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1}, }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i, regex_ins => qr/^[ACGTN]+$/i, regex_del => qr/^[ACGTN]+$/i, regex_gtsep => qr{[|/]}, # | / regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1 regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 
0/1 0|1 A/A A|A 0| gt_sep => [qw(| /)], }; $$self{ignore_missing_GT} = 1; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } sub Vcf4_1::validate_header { my ($self) = @_; my $lines = $self->get_header_line(key=>'reference'); if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); } } sub Vcf4_1::validate_line { my ($self,$line) = @_; if ( !$$self{_contig_validated}{$$line{CHROM}} ) { my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM}); if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); } $$self{_contig_validated}{$$line{CHROM}} = 1; } if ( index($$line{CHROM},':')!=-1 ) { $self->warn("Colons not allowed in chromosome names: $$line{CHROM}\n"); } # Is the ID composed of alphanumeric chars if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); } } sub Vcf4_1::validate_alt_field { my ($self,$values,$ref) = @_; if ( @$values == 1 && $$values[0] eq '.' ) { return undef; } my $ret = $self->_validate_alt_field($values,$ref); if ( $ret ) { return $ret; } my $ref_len = length($ref); my $ref1 = substr($ref,0,1); my @err; my $msg = ''; for my $item (@$values) { if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ ) { if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; } my $rpl; if ( $1 ne '' ) { $rpl = $1; if ( $rpl ne '.' ) { my $rref = substr($rpl,0,1); if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; } } } else { $rpl = $3; if ( $rpl ne '.' 
) { my $rref = substr($rpl,-1,1); if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; } } } my $pos = $2; if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; } if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; } next; } if ( $item=~/^\.[ACTGNactgn]*([ACTGNactgn])$/ ) { next; } elsif ( $item=~/^([ACTGNactgn])[ACTGNactgn]*\.$/ ) { next; } if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; } } if ( !@err ) { return undef; } return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg; } sub Vcf4_1::next_data_hash { my ($self,@args) = @_; my $out = $self->SUPER::next_data_hash(@args); if ( !defined $out or $$self{assume_uppercase} ) { return $out; } # Case-insensitive ALT and REF bases $$out{REF} = uc($$out{REF}); my $nalt = @{$$out{ALT}}; for (my $i=0; $i<$nalt; $i++) { if ( $$out{ALT}[$i]=~/^SUPER::next_data_array(@args); if ( !defined $out or $$self{assume_uppercase} ) { return $out; } # Case-insensitive ALT and REF bases $$out[3] = uc($$out[3]); my $alt = $$out[4]; $$out[4] = ''; my $pos = 0; while ( $pos',$start+1); if ( $end==-1 ) { $self->throw("Could not parse ALT [$alt]\n") } if ( $start>$pos ) { $$out[4] .= uc(substr($alt,$pos,$start-$pos)); } $$out[4] .= substr($alt,$start,$end-$start+1); $pos = $end+1; } if ( $posSUPER::event_type($rec,$allele); } my $c = substr($allele,0,1); if ( $c eq '<' ) { return ('u',0,$allele); } elsif ( $c eq '[' or $c eq ']' or $c eq '.' ) { return 'b'; } $c = substr($allele,-1,1); if ( $c eq '[' or $c eq ']' or $c eq '.' 
) { return 'b'; }
    elsif ( index($allele,'[')!=-1 or index($allele,']')!=-1 ) { return 'b'; }
    return $self->SUPER::event_type($rec,$allele);
}

#------------------------------------------------
# Version 4.2 specific functions

=head1 VCFv4.2

VCFv4.2 specific functions

=cut

package Vcf4_2;
use base qw(Vcf4_1);

sub new
{
    my ($class,@args) = @_;
    my $self = $class->SUPER::new(@args);
    bless $self, ref($class) || $class;
    $$self{version} = '4.2';
    return $self;
}

# Validate the ALT column of a v4.2 record. Same checks as v4.1, but the '*'
# allele (overlapping deletion in the 4.2 spec) is additionally accepted.
# Returns undef on success, or an error string listing unparsable alleles.
sub Vcf4_2::validate_alt_field
{
    my ($self,$values,$ref) = @_;

    # A lone '.' means no alternate allele at this site.
    if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }
    my $ret = $self->_validate_alt_field($values,$ref);
    if ( $ret ) { return $ret; }

    my $ref_len = length($ref);
    my $ref1 = substr($ref,0,1);
    my @err;
    my $msg = '';
    for my $item (@$values)
    {
        # Breakend notation: s[p[ , s]p] , [p[s , ]p]s  (exactly one
        # replacement string, anchored on the matching side to REF).
        if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ )
        {
            if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; }
            my $rpl;
            if ( $1 ne '' )
            {
                # Replacement precedes the bracket: first base must match REF.
                $rpl = $1;
                if ( $rpl ne '.' )
                {
                    my $rref = substr($rpl,0,1);
                    if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; }
                }
            }
            else
            {
                # Replacement follows the bracket: last base must match REF.
                $rpl = $3;
                if ( $rpl ne '.' )
                {
                    my $rref = substr($rpl,-1,1);
                    if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; }
                }
            }
            my $pos = $2;
            if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; }
            if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; }
            next;
        }
        # Single breakends (".ACGT" / "ACGT.") and the '*' allele are accepted.
        if ( $item=~/^\.[ACTGNactgn]*([ACTGNactgn])$/ ) { next; }
        elsif ( $item=~/^([ACTGNactgn])[ACTGNactgn]*\.$/ ) { next; }
        elsif ( $item eq '*' ) { next; }
        # Otherwise: plain sequence or a symbolic <..> allele.
        if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; }
    }
    if ( !@err ) { return undef; }
    return 'Could not parse the allele(s) [' .join(',',@err). ']' .
$msg; }

#------------------------------------------------
# Version 4.3 specific functions

=head1 VCFv4.3

VCFv4.3 specific functions

=cut

package Vcf4_3;
use base qw(Vcf4_2);

sub new
{
    my ($class,@args) = @_;
    my $self = $class->SUPER::new(@args);
    bless $self, ref($class) || $class;
    $$self{version} = '4.3';
    return $self;
}

1;
vcftools-0.1.15/src/perl/VcfStats.pm000066400000000000000000000433431307140004000172550ustar00rootroot00000000000000#
# Author: petr.danecek@sanger
#

=head1 NAME

VcfStats.pm. Module for collecting stats from VCF files.

=head1 SYNOPSIS

    use VcfStats;

    my $vstats = VcfStats->new(file=>'example.vcf.gz');
    while (my $x=$vstats->next_data_hash()) { $vstats->collect_stats($x); }
    $vstats->dump();

=cut

package VcfStats;

use strict;
use warnings;
use Carp;
use Data::Dumper;
use base 'Vcf';

=head2 new

    About   : Creates new VcfStats.
    Usage   : my $vstats = VcfStats->new(file=>'my.vcf');
    Args    : See Vcf.pm

=cut

sub new
{
    my ($class,@args) = @_;
    my $self = $class->SUPER::new(@args);
    # Re-parent to the version-specific class selected by Vcf.pm so its
    # version-specific methods are inherited (string eval needed to extend
    # @ISA at run time).
    for my $version (@{$$self{versions}})
    {
        if ( $self->isa($version) ) { eval "use base '$version'"; }
    }
    bless($self,$class);
    return $self;
}

# Thin pass-through to the parent parser; kept for subclassing hooks.
sub parse_header
{
    my ($self,@args) = @_;
    $self->SUPER::parse_header(@args);
}

=head2 get_stats_key

    About   : Creates relevant stats hash key, used by select_stats
    Usage   :
    Args    [1]: Hash with filter definition (value to match, range, etc.)
            [2]: Prefix of the stat
            [3]: Value of the filter

=cut

sub get_stats_key
{
    my ($self,$filter,$key,$value) = @_;

    my $stat_key;
    if ( $$filter{exact} )
    {
        # NOTE(review): 'next' outside a loop exits the subroutine (with a
        # runtime warning) — presumably relies on the caller's while loop in
        # select_stats; confirm before refactoring.
        if ( $value ne $$filter{value} ) { next; }
        $stat_key = $key.'/'.$value;
    }
    elsif ( $value eq '.'
) { $stat_key = $key.'/.'; } elsif ( $$filter{any} ) { $stat_key = $key.'/'.$value; } elsif ( $$filter{bin} ) { my $bin = int($value/$$filter{bin_size}) * $$filter{bin_size}; if ( $bin>$$filter{max} ) { $bin=">$$filter{max}"; } $stat_key = $key.'/'.$bin; } else { $self->throw("TODO: $key...\n"); } return $stat_key; } =head2 select_stats About : Selects relevant stats hashes Usage : Args [1]: Hash record from next_data_hash [2]: Filters =cut sub select_stats { my ($self,$rec,$filters) = @_; if ( !exists($$self{stats}{all}) ) { $$self{stats}{all}={}; } my @mandatory = ( $$self{stats}{all} ); my %samples; for my $sample (keys %{$$rec{gtypes}}) { if ( !exists($$self{stats}{samples}{$sample}) ) { $$self{stats}{samples}{$sample} = {}; } push @{$samples{$sample}}, $$self{stats}{samples}{$sample}; } if ( !defined $filters ) { return (\@mandatory,\%samples); } while (my ($key,$filter) = each %$filters) { if ( $key eq 'FILTER' ) { for my $value (@{$$rec{FILTER}}) { my $stats_key = $self->get_stats_key($filter,$key,$value); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } } elsif ( $key eq 'QUAL' ) { my $stats_key = $self->get_stats_key($filter,$key,$$rec{QUAL}); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } elsif ( $key=~m{^INFO/} ) { if ( $$filter{is_flag} ) { if ( $$filter{value} && !exists($$rec{INFO}{$$filter{tag}}) ) { next; } elsif ( !$$filter{value} && exists($$rec{INFO}{$$filter{tag}}) ) { next; } if ( !exists($$self{stats}{$key}) ) { $$self{stats}{$key}={}; } push @mandatory, $$self{stats}{$key}; next; } elsif ( exists($$rec{INFO}{$$filter{tag}}) ) { my $stats_key = $self->get_stats_key($filter,$key,$$rec{INFO}{$$filter{tag}}); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } } elsif ( $key=~m{^FORMAT/([^/]+)$} ) { while (my ($sample,$hash) = each 
%{$$rec{gtypes}}) { if ( !exists($$hash{$1}) ) { next; } my $stats_key = $self->get_stats_key($filter,$1,$$hash{$1}); if ( !exists($$self{stats}{samples}{$sample}{user}{$stats_key}) ) { $$self{stats}{samples}{$sample}{user}{$stats_key}={}; } push @{$samples{$sample}}, $$self{stats}{samples}{$sample}{user}{$stats_key}; } } elsif ( $key=~m{^SAMPLE/([^/]+)/([^/]+)$} ) { if ( !exists($$rec{gtypes}{$1}{$2}) ) { next; } my $stats_key = $self->get_stats_key($filter,$2,$$rec{gtypes}{$1}{$2}); if ( !exists($$self{stats}{samples}{$1}{user}{$stats_key}) ) { $$self{stats}{samples}{$1}{user}{$stats_key}={} } push @{$samples{$1}}, $$self{stats}{samples}{$1}{user}{$stats_key}; } else { $self->throw("The feature currently not recognised: $key.\n"); } } return (\@mandatory,\%samples); } =head2 collect_stats About : Collect stats Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats($x); Args : =cut sub collect_stats { my ($self,$rec,$filters) = @_; # Ts/Tv and custom numbers based on INFO, QUAL etc. for the mandatory columns my ($mandatory_stats,$sample_stats) = $self->select_stats($rec,$filters); $self->collect_stats_mandatory($rec,$mandatory_stats); # Ts/Tv for samples while (my ($sample,$stats) = each %$sample_stats) { $self->collect_stats_sample($rec,$sample,$stats); } my %type_keys = ( r=>'ref', s=>'snp', i=>'indel' ); # Private calls and the number of shared SNPs. 
Check if: # - there is a nonref variant present only in this sample (samples->sample_name->private) # - there is a nonref variant in N samples (samples->all->shared) # - there is a non-empty call (samples->sample_name->count) my $shared = 0; my $sample_name; while (my ($sample,$stats) = each %$sample_stats) { my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample); if ( $is_empty ) { next; } my $is_hom=1; my %types; my $is_ref = 1; for my $al (@$alleles) { if ( $$alleles[0] ne $al ) { $is_hom=0; } my ($type,$len,$ht) = $self->event_type($rec,$al); $types{$type} = 1; if ( $type eq 'r' ) { next; } $is_ref = 0; } for my $stat (@$stats) { $$stat{count}++; } for my $type (keys %types) { my $key = exists($type_keys{$type}) ? $type_keys{$type} : 'other'; $key .= '_count'; for my $stat (@$stats) { $$stat{$key}++; } } my $key; if ( exists($types{r}) ) { if ( $is_hom ) { $key='hom_RR'; } else { $key='het_RA' } } elsif ( $is_hom ) { $key='hom_AA'; } else { $key='het_AA'; } $key .= '_count'; for my $stat (@$stats) { $$stat{$key}++; } $key = $is_phased ? 'phased' : 'unphased'; for my $stat (@$stats) { $$stat{$key}++; } if ( $is_ref ) { next; } $shared++; if ( !defined $sample_name ) { $sample_name = $sample; } } $$self{stats}{all}{shared}{$shared}++; if ( $shared==1 ) { for my $stat (@{$$sample_stats{$sample_name}}) { $$stat{private}++; } } } =head2 collect_stats_mandatory About : Collect stats based on mandatory columns Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats_mandatory($x); Args : =cut sub collect_stats_mandatory { my ($self,$rec,$stats) = @_; # How many mono,bi,tri-allelic etc sites are there my $nalt = 0; if ( !scalar keys %{$$rec{gtypes}} ) { $nalt = scalar @{$$rec{ALT}}; if ( $nalt==1 && $$rec{ALT}[0] eq '.' 
) { $nalt=0 } } elsif ( exists($$rec{INFO}{AC}) ) { for my $ac (split(/,/,$$rec{INFO}{AC})) { if ( $ac ) { $nalt++; } } } else { my ($an,$ac,$acs) = $self->calc_an_ac($$rec{gtypes}); for my $ac (@$acs) { if ( $ac ) { $nalt++; } } } my %types; for my $alt (@{$$rec{ALT}}) { if ( $alt eq '.' ) { $alt=$$rec{REF}; } my $type = $self->add_variant($rec,$alt,$stats); $types{$type} = 1; } # Increment counters for my $stat (@$stats) { $$stat{'nalt_'.$nalt}++; $$stat{count}++; for my $type (keys %types) { $$stat{$type.'_count'}++; } } } =head2 collect_stats_sample About : Collect stats for given sample Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats_sample($x,'NA0001'); Args [1] hash row from next_data_hash [2] sample name [3] stats to collect =cut sub collect_stats_sample { my ($self,$rec,$sample,$stats) = @_; my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample); if ( @$alleles > 2 ) { $self->throw("FIXME: currently handling diploid data only (easy to fix)\n"); } my $prev; for my $al (@$alleles) { if ( !defined $prev or $prev ne $al ) { # Only heterozygous SNPs will be counted twice $self->add_variant($rec,$al,$stats); } $prev = $al; } } =head2 add_variant About : Register mutation type in the selected pool Usage : $vstats->add_variant('A','AT',$stats); $vstats->add_variant($rec,'AT',$stats); Args [1] Reference haplotype or VCF data line parsed by next_data_hash [2] Variant haplotype [3] Array of hash stats Returns : The event type (snp,indel,ref) =cut sub add_variant { my ($self,$ref,$alt,$stats) = @_; my $key_type = 'other'; my %key_subt; if ( $alt eq '.' ) { $key_type = 'missing'; } else { my ($type,$len,$ht) = $self->event_type($ref,$alt); if ( $type eq 's' ) { $key_type = 'snp'; # The SNP can be encoded for example as GTTTTTTT>CTTTTTTT my $ref_str = ref($ref) eq 'HASH' ? 
$$ref{REF} : $ref; my $ref_len = length($ref_str); if ( $ref_len>1 ) { for (my $i=0; $i<$ref_len; $i++) { my $ref_nt = substr($ref_str,$i,1); my $alt_nt = substr($alt,$i,1); if ( $ref_nt ne $alt_nt ) { $key_subt{$ref_nt.'>'.$alt_nt}++; } } } else { $key_subt{$ref_str.'>'.$alt}++; } } elsif ( $type eq 'i' ) { $key_type = 'indel'; $key_subt{$len}++; } elsif ( $type eq 'r' ) { $key_type = 'ref'; } } for my $stat (@$stats) { if ( %key_subt ) { while (my ($subt,$value)=each %key_subt) { $$stat{$key_type}{$subt}+=$value; } } else { $$stat{$key_type}++; } } return $key_type; } =head2 dump About : Produce Data::Dumper dump of the collected stats Usage : Args : Returns : The dump. =cut sub dump { my ($self) = @_; return Dumper($$self{stats}); } sub _calc_tstv { my ($self,$stat) = @_; my $ts = 0; for my $mut (qw(A>G G>A C>T T>C)) { if ( exists($$stat{$mut}) ) { $ts += $$stat{$mut}; } } my $tv = 0; for my $mut (qw(A>C C>A G>T T>G A>T T>A C>G G>C)) { if ( exists($$stat{$mut}) ) { $tv += $$stat{$mut}; } } my $ratio = $tv ? $ts/$tv : 0; return ($ts,$tv,$ratio); } =head2 dump_tstv About : Calculate transitions/transversions ratio and output string Usage : Args : Returns : Formatted string =cut sub dump_tstv { my ($self,$stats) = @_; my $out = "#Transitions\tTransversions\tts/tv\tSample\n"; for my $key (sort keys %$stats) { if ( !exists($$stats{$key}{snp}) ) { next; } my $stat = $$stats{$key}{snp}; my ($ts,$tv,$ratio) = $self->_calc_tstv($stat); $out .= sprintf "%d\t%d\t%.2f\t%s\n", $ts,$tv,$ratio,$key; } return $out; } =head2 dump_qual_tstv About : Calculate marginal transitions/transversions ratios for QUAL/* stats Usage : Args : Returns : Formatted string =cut sub dump_qual_tstv { my ($self,$file) = @_; my @values; for my $stat (keys %{$$self{stats}}) { if ( !($stat=~m{^QUAL/(.+)}) ) { next; } my $qual = $1; # The quality record can be also of the form ">200". 
Exclude these from numeric comparison if ( !($qual=~/^[0-9.]+$/) ) { $qual = "#$qual"; } my $count = $$self{stats}{$stat}{count}; if ( !exists($$self{stats}{$stat}{snp}) ) { next; } my ($ts,$tv,$ratio) = $self->_calc_tstv($$self{stats}{$stat}{snp}); push @values, [$qual,$count,$ratio]; } my @svalues = sort { if ($$a[0]=~/^#/ or $$b[0]=~/^#/) { return $$a[0] cmp $$b[0]; } return $$a[0] <=> $$b[0]; } @values; my $out = "#Quality\tMarginal count\tMarginal Ts/Tv\n"; for my $val (@svalues) { if ( $$val[0]=~/^#/ ) { $out .= sprintf "%s\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2]; } else { $out .= sprintf "%.2f\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2]; } } return $out; } =head2 dump_counts About : Usage : Args : Returns : Formatted string =cut sub dump_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{count}) ) { next; } $out .= sprintf "%d\t%s\n", $$self{stats}{$key}{count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{count}) ) { next; } $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{count},$key; } return $out; } sub dump_snp_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{snp_count}) ) { next; } $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{snp_count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{snp_count}) ) { next; } $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{snp_count},$key; } return $out; } sub dump_indel_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{indel_count}) ) { next; } $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{indel_count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{indel_count}) ) { next; } $out .= 
sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{indel_count},$key;
    }
    return $out;
}

# Table of how many sites carry a non-reference allele shared by 0,1,2,...
# samples (collected in $$self{stats}{all}{shared}).
sub dump_shared_counts
{
    my ($self) = @_;
    my $out = "#Shared SNPs\tFrequency\n";
    for my $key (sort {$a<=>$b} keys %{$$self{stats}{all}{shared}})
    {
        $out .= sprintf "%d\t%s\n", $key,$$self{stats}{all}{shared}{$key};
    }
    return $out;
}

# Table of the number of private SNPs per sample; samples without private
# variants are omitted.
sub dump_private_counts
{
    my ($self) = @_;
    my $out = "#Private SNPs\tSample\n";
    for my $key (sort keys %{$$self{stats}{samples}})
    {
        if ( !exists($$self{stats}{samples}{$key}{private}) ) { next; }
        $out .= sprintf "%d\t%s\n", $$self{stats}{samples}{$key}{private},$key;
    }
    return $out;
}

# Create the output directory implied by the prefix (if any) and return the
# possibly amended prefix. This will populate dir and prefix, for example
#   prefix  ->  dir     prefix
#   ----------------------------
#   out                 out.dump
#   out/        out/    out/out.dump
#   out/xxx     out/    out/xxx.dump
sub _init_path
{
    my ($self,$prefix) = @_;
    if ( $prefix=~m{/} )
    {
        my $dir = '';
        if ( $prefix=~m{/[^/]+$} ) { $dir=$`; }
        elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; }
        elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; }
        if ( $dir && ! -d $dir )
        {
            # Use the core File::Path module instead of shelling out to
            # `mkdir -p`: immune to shell metacharacters in the prefix and
            # dies with a clear message on failure instead of silently
            # ignoring it.
            require File::Path;
            File::Path::make_path($dir);
        }
    }
    return $prefix;
}

# Human-readable description of the counters produced by collect_stats.
sub legend
{
    my ($self) = @_;
    return q[
count           Number of positions with known genotype
nalt_X          Number of monoallelic (X=0), biallelic (X=1), etc. sites
ref, ref_count  Number of sites containing reference allele
shared          Number of sites having a non-reference allele in 0,1,2,etc samples
snp_count       Number of positions with SNPs
];
}

=head2 save_stats

    About   : Save all collected stats to files
    Usage   :
    Args    : The prefix of output files. Non-existent directories will be created.
Returns : N/A =cut sub save_stats { my ($self,$prefix) = @_; if ( !defined $prefix ) { print $self->dump(); return; } my $path = $self->_init_path($prefix); $self->_write_file($path.'.legend', $self->legend()); $self->_write_file($path.'.dump', $self->dump()); $self->_write_file($path.'.tstv', $self->dump_tstv($$self{stats})); $self->_write_file($path.'.counts', $self->dump_counts()); $self->_write_file($path.'.snps', $self->dump_snp_counts()); $self->_write_file($path.'.indels', $self->dump_indel_counts()); $self->_write_file($path.'.qual-tstv',$self->dump_qual_tstv); $self->_write_file($path.'.shared',$self->dump_shared_counts()); $self->_write_file($path.'.private',$self->dump_private_counts()); if ( exists($$self{stats}{samples}) ) { $self->_write_file($path.'.samples-tstv',$self->dump_tstv($$self{stats}{samples})); } } sub _write_file { my ($self,$fname,$text) = @_; open(my $fh,'>',$fname) or $self->throw("$fname: $!"); print $fh $text; close($fh); } 1; vcftools-0.1.15/src/perl/fill-aa000077500000000000000000000137301307140004000164120ustar00rootroot00000000000000#!/usr/bin/env perl # # Notes: # * The AA files can be downloaded from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments # * The program runs samtools, therefore the AA files must be gzipped (not b2zipped). # # support: pd3@sanger use strict; use warnings; use Carp; use Vcf; use FindBin; use lib "$FindBin::Bin"; use FaSlice; my $opts = parse_params(); fill_aa($opts,$$opts{aa_file}); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: This script fills ancestral alleles into INFO column of VCF files. It depends on samtools,\n", " therefore the fasta sequence must be gzipped (not bgzipped!) 
and indexed by samtools faidx.\n", " The AA files can be downloaded from\n", " ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments\n", " and processed as shown in the example below. This is because the sequences in the original files\n", " are named as 'ANCESTOR_for_chromosome:NCBI36:1:1:247249719', but the underlying FaSplice.pm\n", " requires names as 'chr1' or '1'.\n", "Usage: fill-aa [OPTIONS] < in.vcf >out.vcf\n", "Options:\n", " -a, --ancestral-allele Prefix to ancestral allele chromosome files.\n", " -t, --type Variant types to process: all,indel,ref,snp. [all]\n", " -h, -?, --help This help message.\n", "Example:\n", " # Get the files ready: compress by gzip and index by samtools faidx. Either repeat the\n", " # following command for each file manually\n", " bzcat human_ancestor_1.fa.bz2 | sed 's,^>.*,>1,' | gzip -c > human_ancestor_1.fa.gz\n", " samtools faidx human_ancestor_1.fa.gz\n", " \n", " # .. or use this loop (tested in bash shell)\n", " ls human_ancestor_*.fa.bz2 | while read IN; do\n", " OUT=`echo \$IN | sed 's,bz2\$,gz,'`\n", " CHR=`echo \$IN | sed 's,human_ancestor_,, ; s,.fa.bz2,,'`\n", " bzcat \$IN | sed \"s,^>.*,>\$CHR,\" | gzip -c > \$OUT\n", " samtools faidx \$OUT\n", " done\n", " \n", " # After this has been done, the following command should return 'TACGTGGcTGCTCTCACACAT'\n", " samtools faidx human_ancestor_1.fa.gz 1:1000000-1000020\n", " \n", " # Now the files are ready to use with fill-aa. 
Note that the VCF file\n", " # should be sorted (see vcf-sort), otherwise the performance would be seriously\n", " # affected.\n", " cat file.vcf | fill-aa -a human_ancestor_ 2>test.err | gzip -c >out.vcf.gz \n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-a' || $arg eq '--ancestral-allele' ) { $$opts{aa_file} = shift(@ARGV); next } if ( $arg eq '-t' || $arg eq '--type' ) { my %known = ( snp=>'s', indel=>'i', all=>'a', ref=>'r' ); my $types = shift(@ARGV); for my $t (split(/,/,$types)) { if ( !(exists($known{$t})) ) { error("Unknown type [$t] with -t [$types]\n"); } $$opts{types}{$known{$t}} = 1; } if ( exists($$opts{types}{a}) ) { $$opts{types}{s} = 1; $$opts{types}{i} = 1; $$opts{types}{r} = 1; } next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{aa_file}) ) { error("Missing the -a option.\n") } return $opts; } sub fill_aa { my ($opts,$aa_fname) = @_; my $n_unknown = 0; my $n_filled_sites = 0; my $n_filled_bases = 0; my $vcf = Vcf->new(fh=>\*STDIN, assume_uppercase=>1); $vcf->parse_header(); $vcf->add_header_line({key=>'INFO',ID=>'AA',Number=>1,Type=>'String', Description=>'Ancestral Allele, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments/README'}); print $vcf->format_header(); my %chr2fa = (); my $nskipped = 0; while (my $line = $vcf->next_line() ) { my $rec = $vcf->next_data_array($line); my $chr = $$rec[0]; my $pos = $$rec[1]; my $ref = $$rec[3]; if ( !exists($chr2fa{$chr}) ) { my $fname = $aa_fname; if ( ! 
-e $fname ) { if ( -e "$fname$chr.fa.gz" ) { $fname = "$fname$chr.fa.gz"; } else { error(qq[Neither "$fname" nor "$fname$chr.fa.gz" exists.\n]); } } $chr2fa{$chr} = FaSlice->new(file=>$fname, size=>100_000); } my $fa = $chr2fa{$chr}; my $ref_len = length($ref); if ( exists($$opts{types}) && !exists($$opts{types}{a}) ) { my $ok = 0; for my $alt (split(/,/,$$rec[4])) { my ($type,$len,$ht) = $vcf->event_type($ref,$alt); if ( exists($$opts{types}{$type}) ) { $ok=1; last; } } if ( !$ok ) { print $line; $nskipped++; next; } } my $aa = $ref_len==1 ? $fa->get_base($chr,$pos) : $fa->get_slice($chr,$pos,$pos+$ref_len-1); if ( $aa ) { $$rec[7] = $vcf->add_info_field($$rec[7],'AA'=>$aa); $n_filled_sites++; $n_filled_bases+=$ref_len; } else { $$rec[7] = $vcf->add_info_field($$rec[7],'AA'=>'.'); $n_unknown++; } print join("\t",@$rec),"\n"; } print STDERR "AA sites filled .. $n_filled_sites\n", "AA bases filled .. $n_filled_bases\n", "No AAs .. $n_unknown\n", "Lines skipped .. $nskipped\n"; } vcftools-0.1.15/src/perl/fill-an-ac000077500000000000000000000023551307140004000170110ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); fill_an_ac($$opts{file}); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: fill-an-ac [OPTIONS] < in.vcf >out.vcf\n", "Options:\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( -e $arg ) { $$opts{file} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub fill_an_ac { my ($file) = @_; my $vcf = $file ? 
Vcf->new(file=>$file) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); $vcf->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); print $vcf->format_header(); $vcf->recalc_ac_an(2); while (my $rec=$vcf->next_data_hash()) { print $vcf->format_line($rec); } } vcftools-0.1.15/src/perl/fill-fs000077500000000000000000000176041307140004000164450ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); flanking_sequence($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Annotate VCF with flanking sequence (INFO/FS tag)\n", "Usage: fill-fs [OPTIONS] file.vcf\n", "Options:\n", " -b, --bed-mask Regions to mask (tabix indexed), multiple files can be given\n", " -c, --cluster Do self-masking of clustered variants within this range.\n", " -l, --length Flanking sequence length [100]\n", " -m, --mask-char The character to use or \"lc\" for lowercase. This option must preceed\n", " -b, -v or -c in order to take effect. 
With multiple files works\n", " as a switch on the command line, see the example below [N]\n", " -r, --refseq The reference sequence.\n", " -v, --vcf-mask Mask known variants in the flanking sequence, multiple files can be given (tabix indexed)\n", " -h, -?, --help This help message.\n", "Example:\n", " # Mask variants from the VCF file with N's and use lowercase for the bed file regions\n", " fill-fs file.vcf -v mask.vcf -m lc -b mask.bed\n", "\n"; } sub parse_params { my $opts = { length=>100, mask=>[], cluster=>0 }; my $mask = $$opts{mask_char}{default} = 'N'; my $mask_changed = 0; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-c' || $arg eq '--cluster' ) { $$opts{cluster}=shift(@ARGV); $$opts{mask_char}{default}=$mask; $mask_changed=0; next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-l' || $arg eq '--length' ) { $$opts{length}=shift(@ARGV); next; } if ( $arg eq '-m' || $arg eq '--mask' ) { $mask=shift(@ARGV); check_mask_char($mask); $mask_changed=1; next; } if ( $arg eq '-b' || $arg eq '--bed-mask' ) { $arg=shift(@ARGV); push @{$$opts{bed_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-v' || $arg eq '--vcf-mask' ) { $arg=shift(@ARGV); push @{$$opts{vcf_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !($$opts{length}=~/^\d+$/) ) { error("Expected integer after -l, got $$opts{length}\n"); } if ( !exists($$opts{refseq}) ) { error("Missing the -r option.\n"); } if ( $mask_changed ) { error("The -m parameter must preceed -b, -v, or the file in order to take effect.\n"); } return $opts; } sub check_mask_char { my ($mask) = @_; if ( $mask eq 'lc' ) { return; } if ( length($mask) eq 1 ) { return; } error("Currently only \"lc\" or one-character mask is supported, got \"$mask\".\n"); } sub flanking_sequence { my ($opts) = @_; $$opts{faref} = FaSlice->new(file=>$$opts{refseq},size=>1_024,oob=>'N'); my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header; $vcf->add_header_line({key=>'INFO',ID=>'FS',Number=>1,Type=>'String',Description=>'Flanking sequence'}); print $vcf->format_header; my (@lines,@mask); while (my $line=$vcf->next_data_array) { my $chr = $$line[0]; my $pos = $$line[1]; my $ref = $$line[3]; my $alt = $$line[4]; my $off; $alt =~ s/,.+$//; # first allele is used at multiallelic sites ($off,$ref,$alt) = $vcf->normalize_alleles_pos($ref,$alt); $pos += $off; push @lines, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line }; push @mask, { chr=>$chr, pos=>$pos, ref=>$ref }; flush_buffers($opts,\@lines,\@mask); } flush_buffers($opts,\@lines,\@mask,1); } sub flush_buffers { my ($opts,$lines,$mask,$force) = @_; if ( !@$lines ) { return; } if ( !$$opts{cluster} ) { shift(@$mask); output_line($opts,shift(@$lines),$mask); return; } while ( @$lines && ($force or $$mask[0]{chr} ne $$lines[-1]{chr} or $$mask[0]{pos}+2*$$opts{cluster}<=$$lines[-1]{pos}) ) { output_line($opts,$$lines[0],$mask); shift(@$lines); while ( @$mask && @$lines && ($$mask[0]{chr} ne $$lines[0]{chr} or $$mask[0]{pos}+$$opts{cluster}<=$$lines[0]{pos}) ) { shift(@$mask); } } } sub output_line { my ($opts,$hline,$mask) = @_; my $chr = $$hline{chr}; my $pos = $$hline{pos}; my $ref = $$hline{ref}; my $alt = 
$$hline{alt}; my $line = $$hline{line}; my $seq_pos = $$opts{length}; my $reflen = length($ref); my $from = $pos-$$opts{length}; my $to = $pos+($reflen-1)+$$opts{length}; my $seq = $$opts{faref}->get_slice($chr,$from,$to); $seq = mask_sequence($opts,$seq,$chr,$from,$to,$mask); my $reflen_ori = $reflen; my ($len,$indel,$off) = $$opts{vcf}->is_indel($ref,$alt); if ( $len<0 ) { $seq_pos += $off; $ref = $indel; $reflen = abs($len); $alt = '-'; } elsif ( $len>0 ) { $seq_pos += $off; $ref = '-'; $alt = $indel; $reflen = $off-1; } substr($seq,$seq_pos,$reflen,"[$ref/$alt]"); if ( $reflen_ori - $reflen > 0 ) { # for redundant pad bases which cannot be removed without changing the position, e.g. ACGT AC $seq = substr($seq,$reflen_ori-$reflen); } if ( $$line[7] eq '.' or !defined $$line[7] ) { $$line[7] = ''; } else { $$line[7] .= ';'; } $$line[7] .= "FS=$seq"; print join("\t",@$line),"\n"; } sub mask_sequence { my ($opts,$seq,$chr,$from,$to,$mask) = @_; for my $m (@$mask) { my $reflen = length($$m{ref}); if ( $$m{chr} ne $chr or $$m{pos}+$reflen<$from or $$m{pos}>$to ) { next; } apply_mask($opts,\$seq,$$m{pos}-$from,$$m{ref},$$opts{mask_char}{default}); } for my $file (@{$$opts{vcf_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my $items = $$opts{vcf}->split_mandatory($ret); # In different situations one may want to treat indels differently. For # now, mask the whole REF string as for primer design it is safer to # mask the whole thing; for example, a 2bp deletion can be reported by # samtools as REF=GACACACA ALT=GACACA, the script will mask it all. 
apply_mask($opts,\$seq,$$items[1]-$from,$$items[3],$$opts{mask_char}{$file}); } } for my $file (@{$$opts{bed_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my @items = split(/\t/,$ret); apply_mask($opts,\$seq,$items[1]-$from+1,$items[2]-$from,$$opts{mask_char}{$file}); } } return $seq; } sub apply_mask { my ($opts,$seq,$from,$ref,$mask_char) = @_; if ( $from<0 ) { $from=0; } my $ref_len = $ref=~/^\d+$/ ? $ref-$from+1 : length($ref); my $seq_len = length($$seq); if ( $from+$ref_len>=$seq_len ) { $ref_len = $seq_len - $from; } if ( $ref_len<0 ) { return; } if ( $ref_len==1 ) { my $rpl = substr($$seq,$from,1); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,1)) : $mask_char; substr($$seq,$from,1,$rpl); return; } my $rpl = substr($$seq,$from,$ref_len); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,$ref_len)) : ($mask_char x $ref_len); substr($$seq,$from,$ref_len,$rpl); } vcftools-0.1.15/src/perl/fill-ref-md5000077500000000000000000000146471307140004000173000ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use IPC::Open2; use Vcf; my $opts = parse_params(); fill_ref_md5($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: The script computes MD5 sum of the reference sequence and inserts\n", " 'reference' and 'contig' tags into header as recommended by VCFv4.1.\n", " The VCF file must be compressed and tabix indexed, as it takes advantage\n", " of the lightning fast tabix reheader functionality.\n", "Usage: fill-ref-md5 [OPTIONS] in.vcf.gz out.vcf.gz\n", "Options:\n", " -d, --dictionary Where to read/write computed MD5s. 
Opened in append mode, existing records are not touched.\n", " -i, --info Optional info on reference assembly (AS), species (SP), taxonomy (TX)\n", " -r, --refseq The reference sequence in fasta format indexed by samtools faidx\n", " -h, -?, --help This help message.\n", "Examples:\n", " fill-ref-md5 -i AS:NCBIM37,SP:\"Mus\\ Musculus\" -r NCBIM37_um.fa -d NCBIM37_um.fa.dict in.vcf.gz out.vcf.gz\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-i' || $arg eq '--info' ) { $$opts{info}=shift(@ARGV); next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-d' || $arg eq '--dictionary' ) { $$opts{dictionary}=shift(@ARGV); next; } if ( -e $arg && !exists($$opts{file}) ) { $$opts{file} = $arg; next } if ( exists($$opts{file}) && !exists($$opts{outfile}) ) { $$opts{outfile} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\" or non-existent file. 
Run -h for help.\n"); } if ( !exists($$opts{refseq}) && !exists($$opts{dictionary}) ) { error("Expected one of -d or -r options\n"); } if ( !exists($$opts{file}) ) { error("No input VCF file given.\n"); } if ( !exists($$opts{outfile}) ) { error("No output VCF file given.\n"); } return $opts; } sub read_dict { my ($dict) = @_; my $out = {}; if ( !$dict or !-e $dict ) { return $out } open(my $fh,'<',$dict) or error("$dict: $!"); my $line=<$fh>; if ( $line ne "\@HD\tVN:1.0\tSO:unsorted\n" ) { error("Could not parse $dict: $line"); } while (my $line=<$fh>) { chomp($line); # @SQ SN:5 LN:152537259 UR:file:/lustre/scratch102/projects/mouse/ref/NCBIM37_um.fa M5:f90804fb8fe9cb06076d51a710fb4563 my @items = split(/\t/,$line); if ( @items != 5 ) { error("Could not parse $dict: $line"); } my $item = shift(@items); if ( $item ne '@SQ' ) { next; } my $rec = {}; for my $item (@items) { if ( !($item=~/^([^:]+):(.+)$/) ) { error("Could not parse $dict: [$item] [$line]"); } $$rec{$1} = $2; } if ( !exists($$rec{SN}) ) { error("No SN in [$dict] [$line]?"); } $$out{$$rec{SN}} = $rec; } close($fh); return $out; } sub add_to_dictionary { my ($opts,$dict,$chr) = @_; if ( !exists($$opts{refseq}) ) { error("The chromosome [$chr] not present in the dictionary and no reference sequence given.\n"); } my($md5_in,$md5_out,$ok,$len); eval { open2($md5_out,$md5_in,'md5sum'); $ok=1; }; if ( !$ok ) { error("md5sum: $!"); } my $cmd = "samtools faidx $$opts{refseq} $chr"; open(my $refseq,"$cmd |") or error("$cmd: $!"); # get rid of the first ">$chr" line. 
<$refseq>; while (my $line=<$refseq>) { chomp($line); print $md5_in $line; $len += length($line); } close($refseq); close($md5_in); my @md5 = <$md5_out>; close($md5_out); $md5[0] =~ s/\s+.*$//; chomp($md5[0]); if ( !$len ) { error("The sequence [$chr] not present in $$opts{refseq}\n"); } $$dict{$chr} = { dirty=>1, SN=>$chr, LN=>$len, UR=>'file://'.$$opts{refseq}, M5=>$md5[0] }; $$dict{dirty} = 1; } sub write_dictionary { my ($opts,$dict) = @_; if ( !$$dict{dirty} or !exists($$opts{dictionary}) ) { return } my $needs_header = !-e $$opts{dictionary} ? 1 : 0; open(my $fh,'>>',$$opts{dictionary}) or error("$$opts{dictionary}: $!"); print $fh "\@HD\tVN:1.0\tSO:unsorted\n" unless !$needs_header; for my $key (sort keys %$dict) { if ( ref($$dict{$key}) ne 'HASH' or !$$dict{$key}{dirty} ) { next; } my $sn = $$dict{$key}{SN}; my $ln = $$dict{$key}{LN}; my $ur = $$dict{$key}{UR}; my $m5 = $$dict{$key}{M5}; print $fh "\@SQ\tSN:$sn\tLN:$ln\tUR:$ur\tM5:$m5\n"; } close($fh); } sub write_header { my ($opts,$dict,$chroms) = @_; my %info; if ( exists($$opts{info}) ) { $$opts{info} =~ s/AS:/assembly:/; $$opts{info} =~ s/SP:/species:/; $$opts{info} =~ s/TX:/taxonomy:/; for my $item (split(/,/,$$opts{info})) { my ($key,$value) = split(/:/,$item); if ( !defined $value ) { error("Could not parse the info: [$item] [$$opts{info}]"); } $info{$key} = $value; } } my $vcf = Vcf->new(file=>$$opts{file}); $vcf->parse_header(); my $uri = $$opts{refseq}=~m{^[^/:]+:} ? '' : 'file:'; $vcf->add_header_line({key=>'reference', value=>"$uri$$opts{refseq}"}); for my $chrom (@$chroms) { my %line = ( key => 'contig', ID => $$dict{$chrom}{SN}, length => $$dict{$chrom}{LN}, md5 => $$dict{$chrom}{M5}, %info ); $vcf->add_header_line(\%line); } open(my $out,'>',"$$opts{outfile}.header") or error("$$opts{outfile}.header: $!"); print $out $vcf->format_header(); close($out); } sub fill_ref_md5 { my ($opts) = @_; # List chromosomes my @chroms = `tabix -l $$opts{file}`; if ( $? 
) { error("The command failed: tabix -l $$opts{file}\n"); } # Read dictionary my $dict = read_dict($$opts{dictionary},\@chroms); for my $chr (@chroms) { chomp($chr); if ( !exists($$dict{$chr}) ) { add_to_dictionary($opts,$dict,$chr); } } write_dictionary($opts,$dict); write_header($opts,$dict,\@chroms); `tabix -r $$opts{outfile}.header $$opts{file} > $$opts{outfile}`; } vcftools-0.1.15/src/perl/tab-to-vcf000077500000000000000000000044221307140004000170450ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); tab_to_vcf(); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: tab-to-vcf [OPTIONS]\n", "Options:\n", " -i, --id The column ID.\n", " -r, --ref The reference sequence (optional).\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-i' || $arg eq '--id' ) { $$opts{id} = shift(@ARGV); next } if ( $arg eq '-r' || $arg eq '--ref' ) { $$opts{refseq} = shift(@ARGV); next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{id}) ) { error("Missing the -i option.\n") } return $opts; } sub tab_to_vcf { my ($data,$prefix) = @_; my $refseq = $$opts{refseq} ? 
FaSlice->new(file=>$$opts{refseq},size=>1_000_000) : undef; my $id = $$opts{id}; my $vcf_out = Vcf->new(); $vcf_out->add_columns($id); $vcf_out->add_header_line({key=>'FORMAT',ID=>'GT',Number=>'1',Type=>'String',Description=>"Genotype"}); print $vcf_out->format_header(); while (my $line=) { if ( $line=~/^#/ ) { next; } # 11 86881024 CT my @items = split(/\t/,$line); if ( $items[2] eq '*' ) { next; } my $chr = $items[0]; my $pos = $items[1]; my $snp = $items[2]; if ( !($pos=~/^\d+$/) ) { error("Could not parse the line: $line"); } if ( !($snp=~/^([ACGT])([ACGT])$/) ) { error("Could not parse the line: $line"); } $snp = "$1/$2"; my %out; $out{CHROM} = $chr; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{REF} = $refseq->get_base($chr,$pos); $out{QUAL} = '.'; $out{FILTER} = ['.']; $out{FORMAT} = ['GT']; $out{gtypes}{$id}{GT} = $snp; $vcf_out->format_genotype_strings(\%out); print $vcf_out->format_line(\%out); } } vcftools-0.1.15/src/perl/test.t000066400000000000000000000436031307140004000163250ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # # Usage: test.t [-d] # use strict; use warnings; use Carp; use IPC::Open2; use FindBin; use lib "$FindBin::Bin"; use Vcf; BEGIN { use Test::Most tests => 75; } my $path = $FindBin::RealBin; my $debug = ($ARGV[0] && $ARGV[0] eq '-d') ? 
1 : 0; test_bgzip_and_tabix("$path/../examples/merge-test-a.vcf"); test_validator($path,"$path/../examples/valid-3.3.vcf"); test_validator($path,"$path/../examples/valid-4.0.vcf"); test_validator($path,"$path/../examples/valid-4.1.vcf"); test_validator($path,"$path/../examples/floats.vcf"); test_format_validation($path,'3.3'); test_format_validation($path,'4.0'); test_format_validation($path,'4.1'); test_parse($path); test_vcf_stats($path,"$path/../examples/valid-4.0.vcf"); test_empty_cols($path,'4.0'); test_merge($path,'merge-test.vcf.out','merge-test-a.vcf','merge-test-b.vcf','merge-test-c.vcf'); test_compare($path,'cmp-test-a.vcf','cmp-test-b.vcf','cmp-test.out'); test_isec($path,'-n +2','isec-n2-test.vcf.out','merge-test-a.vcf','merge-test-b.vcf','merge-test-c.vcf'); test_query_vcf("$path/../examples/",'cmp-test-a.vcf','query-test.out','%CHROM:%POS\tref=%REF\talt=%ALT\tqual=%QUAL\t%INFO/DP[\t%SAMPLE=%GT]\n'); test_shuffle("$path/../examples/",'cmp-test-a.vcf','shuffle-test.vcf'); test_concat("$path/../examples/",'concat.out','concat-a.vcf','concat-b.vcf','concat-c.vcf'); test_annotate("$path/../examples/",'-c FROM,TO,CHROM,-,-,-,INFO/HM2,INFO/GN,INFO/DP -d key=INFO,ID=HM2,Number=0,Type=Flag,Description="HapMap2 membership" -d key=INFO,ID=GN,Number=1,Type=String,Description="Gene Name" -d key=INFO,ID=DP,Number=0,Type=Integer,Description="Depth,etc"','annotate.out','concat-a.vcf','annotate.txt'); test_annotate("$path/../examples/",'-c FROM,TO,CHROM,ID,REF,ALT,INFO/HM2,INFO/GN,INFO/DP -d key=INFO,ID=HM2,Number=0,Type=Flag,Description="HapMap2 membership" -d key=INFO,ID=GN,Number=1,Type=String,Description="Gene Name" -d key=INFO,ID=DP,Number=0,Type=Integer,Description="Depth,etc"','annotate3.out','concat-a.vcf','annotate.txt'); test_annotate("$path/../examples/",'-f +/D=34/c=2,3','annotate2.out','annotate-test.vcf'); test_fill_an_ac("$path/../examples/",'fill-an-ac.out','concat-a.vcf'); 
test_indel_stats("$path/../examples/",'indel-stats.out','indel-stats.vcf','indel-stats.tab'); test_consensus("$path/../examples/",'','consensus.out','consensus.vcf','consensus.fa'); test_consensus("$path/../examples/",'-s NA001','consensus.out2','consensus.vcf','consensus.fa'); test_contrast("$path/../examples/",'-n +D -A,B,C -d 10','contrast.out','contrast.vcf'); test_ploidy("$path/../examples/",'fix-ploidy'); test_api_event_type([qw(A C),'s 1 C'],[qw(A ACGT),'i 3 CGT'],[qw(ACGT A),'i -3 CGT'],[qw(ACGT ACT),'i -1 G'], [qw(ACGT AAA),'o 3 AAA'],[qw(A .),'r 0 A'],[qw(A ),'u 0 '],[qw(ACG AGC),'s 2 AGC'], [qw(A .A),'b'], [qw(A A.),'b']); test_api(); exit; #-------------------------------------- sub test_bgzip_and_tabix { my ($file) = @_; my $cmd; $cmd = "cat $file | bgzip -c > $file.gz"; system($cmd); is($?,0,"Is bgzip OK? .. $cmd"); $cmd = "tabix $file.gz"; system($cmd); is($?,0,"Is tabix OK? .. $cmd"); } sub test_validator { my ($path,$fname) = @_; my $cmd = "perl -I$path -MVcf -e validate $fname"; my @out = `$cmd 2>&1`; my @exp = (); is_deeply(\@out,\@exp,"Testing validator .. 
$cmd"); } sub test_format_validation { my ($path,$version) = @_; my ($chld_in,$chld_out); my $cmd = "perl -I$path -MVcf -e validate 2>&1"; my $pid = open2($chld_out, $chld_in, $cmd); my $vcf = Vcf->new(version=>$version); $vcf->recalc_ac_an(2); $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO', ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); if ( $version >= 4.0 ) { $vcf->add_header_line({key=>'ALT',ID=>'DEL:ME:ALU', Description=>'Deletion of ALU element'}); } if ( $version >= 4.1 ) { $vcf->add_header_line({key=>'reference',value=>'file:/some/file.fa'}); $vcf->add_header_line({key=>'contig',ID=>'1',length=>12345,md5=>'f126cdf8a6e0c7f379d618ff66beb2da',assembly=>'E.T.'}); } $vcf->add_columns('NA0001','NA0002'); print $vcf->format_header() unless !$debug; print $chld_in $vcf->format_header(); my %rec = ( CHROM=>1, POS=>1, REF=>'A', QUAL=>$$vcf{defaults}{QUAL}, FORMAT=>['GT'] ); $rec{gtypes}{NA0001}{GT} = 'A/A'; $rec{gtypes}{NA0002}{GT} = $$vcf{defaults}{GT}; $vcf->format_genotype_strings(\%rec); print $vcf->format_line(\%rec) unless !$debug; print $chld_in $vcf->format_line(\%rec); $rec{POS} = 2; $rec{gtypes}{NA0002}{GT} = 'IA|D1'; if ( $version >= 4.0 ) { $rec{REF} = 'AC'; $rec{gtypes}{NA0002}{GT} = 'ATC|'; } $vcf->format_genotype_strings(\%rec); print $vcf->format_line(\%rec) unless !$debug; print $chld_in $vcf->format_line(\%rec); close($chld_in); my @exp = (); my @out = (); while (my $line=<$chld_out>) { chomp($line); push @out,$line; } close($chld_out); waitpid $pid, 0; if ( !is_deeply(\@out,\@exp,"Testing formatting followed by validation .. 
$cmd") ) { print STDERR @out; } } sub test_parse { my ($path) = @_; my $vcf = Vcf->new(file=>"$path/../examples/parse-test.vcf"); $vcf->parse_header; my $line; $line = $vcf->next_data_array; is_deeply($$line[4],"G","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],"G,,T,","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],",G,,T","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],",G,,T,","Testing next_data_array"); } sub test_vcf_stats { my ($path,$file) = @_; my $cmd = "perl -I$path -MVcf $path/vcf-stats $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$file.stats") or confess("$file.stats: $!"); my @exp = <$fh>; close($fh); is_deeply(\@out,\@exp,"Testing vcf-stats .. $cmd"); } sub test_empty_cols { my ($path,$version) = @_; my ($header,$vcf,@out,$exp); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns(qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); is_deeply($out[-1],'#'.$exp,"Testing add_columns with genotypes full, $version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns('NA0001'); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); is_deeply($out[-1],'#'.$exp,"Testing add_columns with genotypes brief, $version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns(); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO)); is_deeply($out[-1],'#'.$exp,"Testing add_columns brief, 
$version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns('FORMAT'); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT)); is_deeply($out[-1],'#'.$exp,"Testing add_columns no gtypes, $version."); } sub test_compare { my ($path,$a,$b,$expected) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); for my $file ($a,$b) { `cat $file | bgzip -c > $file.gz`; `tabix -p vcf -f $file.gz`; } my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-compare -g $a.gz $b.gz | grep -v '^# The command'"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$expected") or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-compare .. $cmd"); } sub test_merge { my ($path,$expected,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-merge"; for my $file (@files) { `cat $file | bgzip -c > $file.gz; tabix -f -p vcf $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>/dev/null | grep -v ^##source`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-merge .. $cmd"); } sub test_isec { my ($path,$opts,$expected,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-isec -f $opts"; for my $file (@files) { `cat $file | bgzip -c > $file.gz; tabix -f -p vcf $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>&1 | grep -v ^##source`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-isec .. 
$cmd"); } sub test_query_vcf { my ($path,$file,$expected,$query) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-query -f '$query' $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-query .. $cmd"); } sub test_shuffle { my ($path,$template,$file) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-shuffle-cols -t $template $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',$template) or confess("$template: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-shuffle-cols .. $cmd"); } sub test_concat { my ($path,$out,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-concat -s 3"; for my $file (@files) { `cat $file | bgzip -c > $file.gz`; `tabix -p vcf -f $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-concat .. $cmd"); } sub test_annotate { my ($path,$args,$out,$vcf,$annot) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-annotate $args $vcf"; if ( defined $annot ) { `cat $annot | bgzip -c > $annot.gz`; `tabix -s 3 -b 1 -e 2 -f $annot.gz`; $cmd .= " -a $annot.gz"; } my @out = `$cmd 2>&1 | grep -v ^##source`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-annotate .. 
$cmd"); } sub test_fill_an_ac { my ($path,$out,$vcf) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/fill-an-ac $vcf"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing fill-an-ac .. $cmd"); } sub test_indel_stats { my ($path,$out,$vcf,$tab) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-indel-stats -e $tab < $vcf"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing fill-an-ac .. $cmd"); } sub test_consensus { my ($path,$args,$out,$vcf,$fa) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); `cat $vcf | bgzip -c > $vcf.gz`; `tabix -p vcf -f $vcf.gz`; my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-consensus $args $vcf.gz < $fa"; my @out = `$cmd`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-consensus .. $cmd"); } sub test_contrast { my ($path,$args,$out,$vcf) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-contrast $args $vcf | grep -v ^##source"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-contrast .. $cmd"); } sub test_ploidy { my ($path,$prefix) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "cat $prefix.vcf | perl -I../perl/ -MVcf ../perl/vcf-fix-ploidy -s $prefix.samples -p $prefix.txt 2>/dev/null | vcf-query -f '\%POS[\\t\%SAMPLE \%GTR \%PL]\\n'"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$prefix.out") or confess("$prefix.out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-fix-ploidy .. 
$cmd"); } sub test_api_event_type { my (@subs) = @_; my $vcf = Vcf->new(); for my $mut (@subs) { my $exp = join(' ', $vcf->event_type($$mut[0],$$mut[1])); is_deeply($$mut[2],$exp,"Testing API event_type($$mut[0],$$mut[1]) .. $exp"); } } sub test_api { my $vcf = Vcf->new(); my $ret; my $fmt = 'GT:GL:PL'; $ret = $vcf->get_tag_index($fmt,'GT',':'); is($ret,0,"Testing get_tag_index($fmt,'GT',':')"); $ret = $vcf->get_tag_index($fmt,'GL',':'); is($ret,1,"Testing get_tag_index($fmt,'GL',':')"); $ret = $vcf->get_tag_index($fmt,'PL',':'); is($ret,2,"Testing get_tag_index($fmt,'PL',':')"); $ret = $vcf->remove_field($fmt,0,':'); is($ret,'GL:PL',"Testing get_tag_index($fmt,0,':')"); $ret = $vcf->remove_field($fmt,1,':'); is($ret,'GT:PL',"Testing get_tag_index($fmt,1,':')"); $ret = $vcf->remove_field($fmt,2,':'); is($ret,'GT:GL',"Testing get_tag_index($fmt,2,':')"); $ret = $vcf->replace_field($fmt,'XX',0,':'); is($ret,'XX:GL:PL',"Testing get_tag_index($fmt,'XX',0,':')"); $ret = $vcf->replace_field($fmt,'XX',1,':'); is($ret,'GT:XX:PL',"Testing get_tag_index($fmt,'XX',1,':')"); $ret = $vcf->replace_field($fmt,'XX',2,':'); is($ret,'GT:GL:XX',"Testing get_tag_index($fmt,'XX',2,':')"); $ret = $vcf->replace_field($fmt,'XX',4,':'); is($ret,'GT:GL:PL::XX',"Testing get_tag_index($fmt,'XX',4,':')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'0/1/2|1/0|1|2'); is($ret,'C/G/T|G/C|G|T',"Testing decode_genotype('C',['G','T'],'0/1/2|1/0|1|2')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'2|1'); is($ret,'T|G',"Testing decode_genotype('C',['G','T'],'2|1')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'2'); is($ret,'T',"Testing decode_genotype('C',['G','T'],'2')"); my $info = 'NS=2;HM;AF=0.333;AFA=T;DB'; $ret = $vcf->get_info_field($info,'NS'); is($ret,'2',"Testing get_info_field($info,'NS')"); $ret = $vcf->get_info_field($info,'AF'); is($ret,'0.333',"Testing get_info_field($info,'AF')"); $ret = $vcf->get_info_field($info,'AFA'); is($ret,'T',"Testing get_info_field($info,'AFA')"); $ret = 
$vcf->get_info_field($info,'HM'); is($ret,'1',"Testing get_info_field($info,'HM')"); $ret = $vcf->get_info_field($info,'DB'); is($ret,'1',"Testing get_info_field($info,'DB')"); $ret = $vcf->get_info_field($info,'DBX'); is($ret,undef,"Testing get_info_field($info,'DBX')"); $ret = $vcf->get_info_field('DB','DB'); is($ret,'1',"Testing get_info_field('DB','DB')"); $ret = $vcf->get_info_field('XDB','DB'); is($ret,undef,"Testing get_info_field('XDB','DB')"); my @ret; @ret = $vcf->split_gt('0/1'); is_deeply(\@ret,[0,1],"Testing split_gt('0/1')"); @ret = $vcf->split_gt('0'); is_deeply(\@ret,[0],"Testing split_gt('0')"); my @als; @als = ("TTGGTAT","TTGGTATCTAGTGGTAT,TGGTATCTAGTGGTAT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TTGGTATCTAG","TGGTATCTAG"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("TT","TCTAGTGGTAAT,TCT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TCTAGTGGTAA","TC"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("TGGGGGG","TGGGGGGG"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TG"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("CAAAAAA","CAAAAA"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["CA","C"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("CA","CT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["CA","CT"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("GAACCCACA","GA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[0,"GAACCCAC","G"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("CAGTAAAA","CAGAAAA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[2,"GT","G"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("CAGTAAA","CAGAAAA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[3,"T","A"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("GA","GACC"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[1,"A","ACC"],"Testing 
normalize_alleles_pos(".join(',',@als).")"); } vcftools-0.1.15/src/perl/vcf-annotate000077500000000000000000001440651307140004000175000ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my %filters = ( MinAB => { dflt=>2, usage=>'INT', desc=>'Minimum number of alternate bases (INFO/DP4)', nick=>'a' }, SnpCluster => { dflt=>undef, usage=>'INT1,INT2', desc=>"Filters clusters of 'INT1' or more SNPs within a run of 'INT2' bases", nick=>'c' }, MinDP => { dflt=>2, usage=>'INT', desc=>"Minimum read depth (INFO/DP or INFO/DP4)", nick=>'d' }, MaxDP => { dflt=>10_000_000, usage=>'INT', desc=>"Maximum read depth (INFO/DP or INFO/DP4)", nick=>'D' }, MinMQ => { dflt=>10, usage=>'INT', desc=>"Minimum RMS mapping quality for SNPs (INFO/MQ)", nick=>'q' }, SnpGap => { dflt=>10, usage=>'INT', desc=>"SNP within INT bp around a gap to be filtered", nick=>'w' }, GapWin => { dflt=>3, usage=>'INT', desc=>"Window size for filtering adjacent gaps", nick=>'W' }, StrandBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for strand bias (INFO/PV4)", nick=>'1' }, BaseQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for baseQ bias (INFO/PV4)", nick=>'2' }, MapQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for mapQ bias (INFO/PV4)", nick=>'3' }, EndDistBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for end distance bias (INFO/PV4)", nick=>'4' }, RefN => { dflt=>'', usage=>'', desc=>"Reference base is N", nick=>'r' }, Qual => { dflt=>'10', usage=>'INT', desc=>"Minimum value of the QUAL field", nick=>'Q' }, VDB => { dflt=>'0', usage=>'FLOAT', desc=>"Minimum Variant Distance Bias (INFO/VDB)", nick=>'v' }, HWE => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum P-value for HWE and F<0 (invokes --fill-HWE)", nick=>'H' }, HWE_G3 => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum P-value for HWE and F<0 (INFO/HWE and INFO/G3)", nick=>'HG' }, HWE2 => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum 
P-value for HWE (plus F<0) (INFO/AC and INFO/AN or --fill-AC-AN)", nick=>'H2' }, ); my $opts = parse_params(); annotate($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } my @filters; for my $key (sort {lc($filters{$a}{nick}) cmp lc($filters{$b}{nick})} keys %filters) { push @filters, sprintf("\t%s, %-25s\t\t%s [%s]\n", $filters{$key}{nick},$key.' '.$filters{$key}{usage},$filters{$key}{desc},defined($filters{$key}{dflt})? $filters{$key}{dflt} : ''); } print "About: Annotates VCF file, adding filters or custom annotations. Requires tabix indexed file with annotations.\n", " Currently it can annotate ID, QUAL, FILTER and INFO columns, but will be extended on popular demand.\n", " For examples of user-defined filters see online documentation or examples/filters.txt in vcftools distribution.\n", "Usage: cat in.vcf | vcf-annotate [OPTIONS] > out.vcf\n", "Options:\n", " -a, --annotations The tabix indexed file with the annotations: CHR\\tFROM[\\tTO][\\tVALUE]+.\n", " -c, --columns The list of columns in the annotation file, e.g. CHROM,FROM,TO,-,QUAL,INFO/STR,INFO/GN. The dash\n", " in this example indicates that the third column should be ignored. If TO is not\n", " present, it is assumed that TO equals to FROM. When REF and ALT columns are present, only\n", " matching lines are annotated.\n", " -d, --description Header annotation, e.g. key=INFO,ID=HM2,Number=0,Type=Flag,Description='HapMap2 membership'.\n", " The descriptions can be read from a file, one annotation per line.\n", " --fill-AC-AN (Re)Calculate AC and AN tags\n", " --fill-HWE (Re)Calculate HWE, AC and AN tags\n", " --fill-ICF (Re)Calculate Inbreeding Coefficient F, HWE, AC and AN\n", " --fill-type Annotate INFO/TYPE with snp,del,ins,mnp,complex\n", " -f, --filter Apply filters, list is in the format flt1=value/flt2/flt3=value/etc. If argument to -f is a file,\n", " user-defined filters be applied. 
See User Defined Filters below.\n", " -H, --hard-filter Remove lines with FILTER anything else than PASS or \".\"\n", " -n, --normalize-alleles Make REF and ALT alleles more compact if possible (e.g. TA,TAA -> T,TA).\n", " -r, --remove Comma-separated list of tags to be removed (e.g. ID,INFO/DP,FORMAT/DP,FILTER).\n", " -h, -?, --help This help message.\n", "Filters:\n", sprintf("\t+ %-25s\t\tApply all filters with default values (can be overriden, see the example below).\n",''), sprintf("\t-X %-25s\t\tExclude the filter X\n",''), join('',@filters), "Examples:\n", " zcat in.vcf.gz | vcf-annotate -a annotations.gz -d descriptions.txt -c FROM,TO,CHROM,ID,INFO/DP | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -f +/-a/c=3,10/q=3/d=5/-D -a annotations.gz -d key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name' | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -a dbSNPv132.tab.gz -c CHROM,POS,REF,ALT,ID,-,-,- | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -r FILTER/MinDP | bgzip -c >out.vcf.gz \n", "Where descriptions.txt contains:\n", " key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name'\n", " key=INFO,ID=STR,Number=1,Type=Integer,Description='Strand'\n", "The file dbSNPv132.tab.gz with dbSNP IDs can be downloaded from\n", " ftp://ftp.sanger.ac.uk/pub/1000genomes/pd3/dbSNP/\n", "\n"; exit -1; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args=>[$0, @ARGV], }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-d' || $arg eq '--description' ) { my $desc = shift(@ARGV); if ( -e $desc ) { open(my $fh,'<',$desc) or error("$desc: $!"); while (my $line=<$fh>) { if ( $line=~/^\s*$/ or $line=~/^#/ ) { next; } chomp($line); push @{$$opts{desc}}, $line; } close($fh); } else { push @{$$opts{desc}}, $desc; } next; } if ( $arg eq '-f' || $arg eq '--filter' ) { my $filter = shift(@ARGV); parse_filters($opts,$filter); next; } if ( $arg eq '-c' || $arg eq '--columns' ) { my $cols = shift(@ARGV); 
$$opts{cols} = [ split(/,/,$cols) ]; next; } if ( $arg eq '-r' || $arg eq '--remove' ) { my $tags = shift(@ARGV); my @tags = split(/,/,$tags); for my $tag (@tags) { my ($col,$tag) = split(m{/},$tag); if ( !defined $tag ) { if ( $col eq 'ID' ) { $$opts{remove}{$col}=1; next; } if ( $col eq 'QUAL' ) { $$opts{remove}{$col}=1; next; } if ( $col eq 'FILTER' ) { $$opts{remove}{$col}=1; next; } $$opts{remove}{INFO}{$col} = 1; $$opts{remove}{FORMAT}{$col} = 1; } elsif ( $col eq 'FILTER' ) { $$opts{remove}{$col}{$tag} = 0; } else { $$opts{remove}{$col}{$tag} = 1; } } next; } if ( $arg eq '-n' || $arg eq '--normalize-alleles' ) { $$opts{normalize} = 1; next } if ( $arg eq '-a' || $arg eq '--annotations' ) { $$opts{annotations} = shift(@ARGV); next } if ( $arg eq '--fill-type' ) { $$opts{fill_type}=1; $$opts{fill}=1; next } if ( $arg eq '--fill-AC-AN' ) { $$opts{fill_ac_an} = 1; $$opts{fill}=1; next } if ( $arg eq '--fill-HWE' ) { $$opts{fill_ac_an} = $$opts{fill_hwe} = 1; $$opts{fill}=1; next } if ( $arg eq '--fill-ICF' ) { $$opts{fill_ac_an} = $$opts{fill_hwe} = $$opts{fill_icf} = 1; $$opts{fill}=1; next } if ( $arg eq '-t' || $arg eq '--tag' ) { $$opts{tag} = shift(@ARGV); next } if ( $arg eq '-H' || $arg eq '--hard-filter' ) { $$opts{hard_filter} = 1; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) ) { if ( !exists($$opts{annotations}) && !exists($$opts{remove}) && !exists($$opts{fill}) && !exists($$opts{normalize}) && !exists($$opts{hard_filter}) ) { error("Missing one of the -a, -f, -n, -r or --fill-* options.\n") } } if ( exists($$opts{annotations}) && !exists($$opts{cols}) ) { error("Missing the -c option.\n"); } return $opts; } sub parse_user_defined_filters { my ($opts,$str) = @_; my $filters = [ do $str ]; if ( $@ ) { error("do $str: $@"); } for my $filter (@$filters) { if ( !exists($$filter{tag}) ) { error("Missing 'tag' key for one of the filters in $str\n"); } if ( $$filter{tag}=~m{^INFO/(.+)$} ) { $$filter{info_tag} = $1; } elsif ( $$filter{tag}=~m{^FORMAT/(.+)$} ) { $$filter{format_tag} = $1; } elsif ( $$filter{tag} eq 'Dummy' ) { $$filter{any_tag} = $1; $$filter{name} = 'Dummy'; $$filter{desc} = 'Dummy'; } else { error("Currently only INFO, FORMAT and Dummy tags are supported. Could not parse the tag [$$filter{tag}]\n"); } my $name = $$filter{name}; if ( !exists($$filter{name}) ) { error("Missing 'name' key for the filter [$$filter{tag}]\n"); } if ( !exists($$filter{desc}) ) { error("Missing 'desc' key for the filter [$$filter{tag}]\n"); } if ( exists($$filter{header}) ) { push @{$$opts{desc}}, ref($$filter{header}) eq 'ARRAY' ? 
@{$$filter{header}} : $$filter{header}; } elsif ( $$filter{tag} ne 'Dummy' ) { push @{$$opts{desc}}, "key=FILTER,ID=$name,Description='$$filter{desc}'"; } if ( !exists($$filter{apply_to}) or lc($$filter{apply_to}) eq 'all' ) { $$opts{udef_filters}{'all'}{$name} = $filter; $$opts{udef_filters}{'s'}{$name} = $filter; $$opts{udef_filters}{'i'}{$name} = $filter; } elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'snps' ) { $$opts{udef_filters}{'s'}{$name} = $filter; $$opts{udef_filters_typecheck_needed} = 1; } elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'indels' ) { $$opts{udef_filters}{'i'}{$name} = $filter; $$opts{udef_filters_typecheck_needed} = 1; } } } sub parse_filters { my ($opts,$str) = @_; if ( -e $str ) { parse_user_defined_filters($opts,$str); return; } my $has_filters = 0; my $set_defaults = 0; my @filters = split(m{/},$str); for my $fltr (@filters) { if ( $fltr eq '+' ) { $set_defaults=1; last; } } my %mapping; for my $flt (keys %filters) { if ( exists($mapping{$filters{$flt}{nick}}) ) { error("FIXME: the nick $filters{$flt}{nick} is not unique.\n"); } $mapping{$filters{$flt}{nick}} = $flt; if ( !defined($filters{$flt}{dflt}) ) { next; } if ( $set_defaults ) { $$opts{filters}{$flt} = $filters{$flt}{dflt}; } } for my $filter (@filters) { my ($key,$val) = split(/=/,$filter); if ( $key eq '+' ) { next; } my $to_be_deleted = 0; if ( $key=~/^-(.+)$/ ) { $to_be_deleted=1; $key = $1; } if ( !exists($filters{$key}) ) { $key = $mapping{$key}; } if ( !exists($filters{$key}) && !exists($mapping{$key}) ) { error("The filter [$key] not recognised.\n"); } if ( $to_be_deleted ) { delete($$opts{filters}{$key}); next; } if ( $key eq 'c' || $key eq 'SnpCluster' ) { ($$opts{SnpCluster_count},$$opts{SnpCluster_win}) = split(/,/,$val); # Simple sanity check if ( $$opts{SnpCluster_count}>$$opts{SnpCluster_win} ) { error("Did you really mean snpCluster=$$opts{SnpCluster_count},$$opts{SnpCluster_win}? 
The win (INT2) must be bigger or equal to count (INT1)."); } $$opts{SnpCluster_buffer} = []; push @{$$opts{desc}}, "key=FILTER,ID=SnpCluster,Description='$filters{SnpCluster}{desc} [win=$$opts{SnpCluster_win},count=$$opts{SnpCluster_count}]'"; $has_filters = 1; next; } $$opts{filters}{$key} = $val; $has_filters = 1; } for my $key (keys %{$$opts{filters}}) { push @{$$opts{desc}}, "key=FILTER,ID=$key,Description='$filters{$key}{desc}" . (defined $$opts{filters}{$key} ? " [$$opts{filters}{$key}]'" : "'"); } if ( !$has_filters && !scalar keys %{$$opts{filters}} ) { delete($$opts{filters}); } if ( exists($$opts{filters}{HWE}) ) { $$opts{fill_ac_an}=$$opts{fill_hwe}=1; } } # Convert text descriptions given on command line to hashes which will be # passed to Vcf::add_header_line sub parse_descriptions { my ($descs) = @_; my @out; for my $str (@$descs) { my $desc = {}; my $tmp = $str; while ($tmp) { my ($key,$value); if ( $tmp=~/^([^=]+)=["']([^\"]+)["']/ ) { $key=$1; $value=$2; } elsif ( $tmp=~/^([^=]+)=([^,"]+)/ && $1 eq 'Description' ) { # The command line eats the quotes $key=$1; $value=$2.$'; $$desc{$key} = $value; last; } elsif ( $tmp=~/^([^=]+)=([^,"]+)/ ) { $key=$1; $value=$2; } else { error(qq[Could not parse the description: [$str]\n]); } $$desc{$key} = $value; $tmp = $'; if ( $tmp=~/^,/ ) { $tmp = $'; } } if ( !exists($$desc{ID}) ) { error("No ID in description? [$str]\n"); } push @out, $desc; } return \@out; } # Create mapping from the annotation IDs to column indexes. The mapping is used # to determine which columns should be used from the annotation file. The # following structure is returned: # { # CHROM => col_idx, # FROM => col_idx, # TO => col_idx, # annots => # [ # { col=>col_idx, id=>annot_id, vcf_col=>vcf_column, is_flag=>0 }, # ] # } # If {annots}{is_flag} is nonzero, "annot_id" will be written to VCF instead of "annot_id=value". # Currently only one VCF column (INFO) is supported. 
# sub parse_columns { my ($cols,$descriptions) = @_; my %desc = (); my %out = ( annots=>[] ); if ( !defined $cols ) { return \%out; } for my $d (@$descriptions) { $desc{$$d{key}.'/'.$$d{ID}} = $d; } for (my $i=0; $i<@$cols; $i++) { my $col = $$cols[$i]; if ( $col eq '-' ) { next; } elsif ( $col eq 'CHROM' ) { $out{$col}=$i; } elsif ( $col eq 'FROM' ) { $out{$col}=$i; } elsif ( $col eq 'POS' ) { $out{'FROM'}=$i; } elsif ( $col eq 'TO' ) { $out{$col}=$i; } elsif ( $col eq 'ID' ) { $out{$col}=$i; } elsif ( $col eq 'FILTER' ) { $out{$col}=$i; } elsif ( $col eq 'REF' ) { $out{$col}=$i; } elsif ( $col eq 'ALT' ) { $out{$col}=$i; } elsif ( $col eq 'QUAL' ) { $out{$col}=$i; } else { if ( !exists($desc{$col}) && exists($desc{"INFO/$col"}) ) { print STDERR qq[The description for "$col" does not exist, assuming "INFO/$col"\n]; $col = "INFO/$col"; } if ( !exists($desc{$col})) { error("Missing the -d parameter for the column [$col]\n"); } if ( !($col=~m{^(.+)/(.+)$}) ) { error("Could not parse the column [$col].\n"); } my $key = $1; my $id = $2; my $rec = { col=>$i, id=>$id, vcf_col=>$key, is_flag=>($desc{$col}{Type} eq 'Flag' ? 1 : 0) }; push @{$out{annots}}, $rec; if ( $key ne 'INFO' ) { error("TODO: other than INFO columns\n"); } } } if ( !exists($out{CHROM}) ) { $out{CHROM}=0; } if ( !exists($out{FROM}) ) { $out{FROM}=1; } if ( !exists($out{TO}) ) { $out{TO}=$out{FROM}; } if ( exists($out{REF}) && !exists($out{ALT}) or !exists($out{REF}) && exists($out{ALT}) ) { error("Expected both REF and ALT columns in the annotation file.\n"); } return \%out; } sub annotate { my ($opts) = @_; # Init the variables my $descs = parse_descriptions($$opts{desc}); my $cols = parse_columns($$opts{cols},$descs); # Open VCF file and add all required header lines my %args = exists($$opts{file}) ? 
(file=>$$opts{file}) : (fh=>\*STDIN); my $vcf = $$opts{vcf} = Vcf->new(%args); $vcf->parse_header(); if ( exists($$opts{remove}) ) { for my $col (keys %{$$opts{remove}}) { if ( ref($$opts{remove}{$col}) ne 'HASH' ) { # remove all filters at once if ( $col eq 'FILTER' ) { $vcf->remove_header_line(key=>$col); } next; } for my $tag (keys %{$$opts{remove}{$col}}) { $vcf->remove_header_line(key=>$col, ID=>$tag); } } } for my $desc (@$descs) { $vcf->add_header_line($desc,silent=>1); } if ( $$opts{fill_type} ) { $vcf->add_header_line({key=>'INFO',ID=>'TYPE',Number=>'A',Type=>'String',Description=>'Variant type'}); } if ( $$opts{fill_ac_an} ) { $vcf->add_header_line({key=>'INFO',ID=>'AC',Number=>'A',Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); } if ( $$opts{fill_hwe} ) { $vcf->add_header_line({key=>'INFO',ID=>'HWE',Number=>1,Type=>'Float',Description=>'Hardy-Weinberg equilibrium test (PMID:15789306)'}); $vcf->add_header_line({key=>'INFO',ID=>'ICF',Number=>1,Type=>'Float',Description=>'Inbreeding coefficient F'}); } $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print $vcf->format_header(); my ($prev_chr,$prev_pos,$annot_from,$annot_to,$annot_line); my @annots = @{$$cols{annots}}; my $id_col = exists($$cols{ID}) ? $$cols{ID} : undef; my $fltr_col = exists($$cols{FILTER}) ? $$cols{FILTER} : undef; my $from_col = $$cols{FROM}; my $to_col = $$cols{TO}; my $ref_col = exists($$cols{REF}) ? $$cols{REF} : undef; my $alt_col = exists($$cols{ALT}) ? $$cols{ALT} : undef; my $qual_col = exists($$cols{QUAL}) ? 
$$cols{QUAL} : undef; # Initialize the annotation reader my $reader; if ( exists($$opts{annotations}) ) { $reader = Reader->new(file=>$$opts{annotations}); my $line = $vcf->next_line(); if ( !defined $line ) { # VCF file is empty undef $reader; } else { my @rec = split(/\t/,$line); $prev_chr = $rec[0]; $prev_pos = $rec[1]; $vcf->_unread_line($line); $reader->open(region=>"$prev_chr:$prev_pos"); } } while (defined $reader) { # Read next annotation group, i.e. all records with the same position (or overlapping in case of intervals) my (@annot_lines,$annot_prev_from,$annot_prev_to); while ($reader) { my $annot_line = $reader->next_line(); if ( !defined $annot_line ) { last; } my $annot_from = $$annot_line[$from_col]; my $annot_to = $$annot_line[$to_col]; if ( !@annot_lines ) { push @annot_lines, $annot_line; $annot_prev_from = $annot_from; $annot_prev_to = $annot_to; next; } if ( $annot_from <= $annot_prev_to or $annot_to <= $annot_prev_to ) { push @annot_lines, $annot_line; if ( $annot_prev_to < $annot_to ) { $annot_prev_to = $annot_to; } next; } $reader->unread_line($annot_line); last; } # Now loop through the VCF records my $line; while ($line = $vcf->next_line()) { my @rec = split(/\t/,$line); if ( $$opts{normalize} ) { my ($ref,@alts) = $vcf->normalize_alleles($rec[3],$rec[4]); $rec[3] = $ref; $rec[4] = join(',',@alts); } my $chr = $rec[0]; my $pos = $rec[1]; chomp($rec[-1]); if ( $chr ne $prev_chr ) { $vcf->_unread_line($line); $prev_chr = $chr; $prev_pos = $pos; $reader->open(region=>"$prev_chr:$prev_pos"); last; } if ( exists($$opts{remove}) ) { remove_tags($opts,\@rec); } # Quick position-based check: Is there an annotation for this record? if ( !defined $annot_prev_from or $pos < $annot_prev_from ) { output_line($opts,\@rec); next; } if ( $pos > $annot_prev_to ) { $vcf->_unread_line($line); last; } # Initialize the REF,ALT-based check. If there are multiple records with the same # position, they can appear in any order. 
A single ALT allele is expected in the # annot file but multiple ALTs can be present in the VCF. As long as one of them # matches the annot file, the record will be annotated. # The annot file can contain mutliallelic sites too. At least one ALT from the VCF # has to match an ALT from the annot file. my (%ref_alt_pairs); if ( defined $alt_col ) { my $ref = $rec[3]; for my $alt (split(/,/,$rec[4])) { my ($r,@a) = $vcf->normalize_alleles($ref,$alt); $ref_alt_pairs{$r.'-'.$a[0]} = 1; } } # Now fill the annotations; Existing annotations with the same tag will be overwritten my %values = (); my %ids = (); for my $annot_line (@annot_lines) { # Skip annotation lines which are not relevant to this VCF record if ( $$annot_line[$from_col] > $pos or $$annot_line[$to_col] < $pos ) { next; } if ( defined $alt_col && $$annot_line[$ref_col] ne '.' ) { my $alt_match = 0; for my $alt (split(/,/,$$annot_line[$alt_col])) { my ($r,@a) = $vcf->normalize_alleles($$annot_line[$ref_col],$alt); if ( exists($ref_alt_pairs{$r.'-'.$a[0]}) ) { $alt_match = 1; last; } } if ( !$alt_match ) { next; } } for my $info (@annots) { my $val = $$annot_line[$$info{col}]; if ( $val eq '' or $val eq '.' ) { $val=undef; } # Existing annotation should be removed elsif ( $$info{is_flag} ) { if ( $val ) { $val=''; } # Flag annotation should be added else { $val=undef; } # Flag annotation should be removed } # A single undef value can be overriden by other overlapping records (?) 
if ( !defined $val && exists($values{$$info{id}}) ) { next; } elsif ( exists($values{$$info{id}}) && !defined $values{$$info{id}}[0] ) { $values{$$info{id}}[0] = $val; next; } push @{$values{$$info{id}}}, $val; } if ( defined $id_col && $$annot_line[$id_col] ne '' ) { $ids{$$annot_line[$id_col]} = 1; } if ( defined $fltr_col && $$annot_line[$fltr_col] ne '' ) { $rec[6] = $$annot_line[$fltr_col]; } if ( defined $qual_col && $$annot_line[$qual_col] ne '' ) { $rec[5] = $$annot_line[$qual_col]; } } if ( scalar keys %ids ) { $rec[2] = join(';', keys %ids); } if ( scalar keys %values ) { for my $key (keys %values) { # Cannot use join on undef values $values{$key} = scalar @{$values{$key}} == 1 ? $values{$key}[0] : join(',', @{$values{$key}}); } $rec[7] = $vcf->add_info_field($rec[7],%values); } output_line($opts,\@rec); } if ( !defined $line ) { last; } } # Finish the VCF, no annotations for this part while (my $line=$vcf->next_line) { my @rec = split(/\t/,$line); if ( $$opts{normalize} ) { my ($ref,@alts) = $vcf->normalize_alleles($rec[3],$rec[4]); $rec[3] = $ref; $rec[4] = join(',',@alts); } chomp($rec[-1]); if ( exists($$opts{remove}) ) { remove_tags($opts,\@rec); } output_line($opts,\@rec); } # Output any lines left in the buffer output_line($opts); } sub fill_ac_an_hwe { my ($opts,$line) = @_; my $igt = $$opts{vcf}->get_tag_index($$line[8],'GT',':'); if ( $igt==-1 ) { return; } my %counts = ( 0=>0 ); my %dpl_counts = ( 0=>0 ); if ( $$line[4] ne '.' ) { my $idx=0; my $cnt=0; $counts{++$cnt} = 0; while ( ($idx=index($$line[4],',',$idx))>0 ) { $idx++; $counts{++$cnt} = 0; } } my $nhets = 0; my $ngts = 0; my $ncols = @$line; for (my $isample=9; $isample<$ncols; $isample++) { my $gt = $$opts{vcf}->get_field($$line[$isample],$igt); my ($a1,$a2) = $$opts{vcf}->split_gt($gt); if ( $a1 ne '.' ) { $counts{$a1}++ } if ( defined $a2 && $a2 ne '.' ) { $counts{$a2}++; if ( $a1 ne '.' 
) { $dpl_counts{$a1}++; $dpl_counts{$a2}++; if ( $a1 ne $a2 ) { $nhets++ } $ngts++; } } } my $an = 0; my $ac; my $max_ac = 0; for my $key (sort {$a<=>$b} keys %counts) { if ( $key eq 0 ) { $an += $counts{$key}; next; } if ( defined $ac ) { $ac .= ','; } $ac .= $counts{$key}; $an += $counts{$key}; if ( exists($dpl_counts{$key}) && $dpl_counts{$key}>$max_ac ) { $max_ac = $dpl_counts{$key}; } } my %tags = (AN=>$an); if ( defined $ac ) { $tags{AC}=$ac } my $nall = $dpl_counts{0} + $max_ac; if ( scalar keys %counts==2 ) { if ( $$opts{fill_hwe} && $nall && scalar keys %counts==2 ) { my $freq_obs = 2*$nhets/$nall; my $freq_exp = 2*($max_ac/$nall)*(1-($max_ac/$nall)); $$opts{icf} = $freq_exp ? 1-$freq_obs/$freq_exp : 0; $$opts{hwe} = eval_hwe(($max_ac-$nhets)/2,($dpl_counts{0}-$nhets)/2,$nhets ,$line); $tags{HWE} = sprintf "%.6f", $$opts{hwe}; if ( $$opts{fill_icf} ) { $tags{ICF} = sprintf "%.5f", $$opts{icf}; } } } $$line[7] = $$opts{vcf}->add_info_field($$line[7],%tags); } # Wigginton 2005, PMID: 15789306 sub eval_hwe { my ($obs_hom1,$obs_hom2,$obs_hets , $line) = @_; if ( $obs_hom1 + $obs_hom2 + $obs_hets == 0 ) { return 1; } my $obs_homc = $obs_hom1 < $obs_hom2 ? $obs_hom2 : $obs_hom1; my $obs_homr = $obs_hom1 < $obs_hom2 ? 
$obs_hom1 : $obs_hom2; my $rare_copies = 2 * $obs_homr + $obs_hets; my $genotypes = $obs_hets + $obs_homc + $obs_homr; my @het_probs = ((0) x ($rare_copies+1)); # start at midpoint my $mid = int($rare_copies * (2 * $genotypes - $rare_copies) / (2 * $genotypes)); # check to ensure that midpoint and rare alleles have same parity if (($rare_copies & 1) ^ ($mid & 1)) { $mid++; } my $curr_hets = $mid; my $curr_homr = ($rare_copies - $mid) / 2; my $curr_homc = $genotypes - $curr_hets - $curr_homr; $het_probs[$mid] = 1.0; my $sum = $het_probs[$mid]; for ($curr_hets=$mid; $curr_hets > 1; $curr_hets -= 2) { $het_probs[$curr_hets - 2] = $het_probs[$curr_hets] * $curr_hets * ($curr_hets - 1.0) / (4.0 * ($curr_homr + 1.0) * ($curr_homc + 1.0)); $sum += $het_probs[$curr_hets - 2]; # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote $curr_homr++; $curr_homc++; } $curr_hets = $mid; $curr_homr = int(($rare_copies - $mid) / 2); $curr_homc = $genotypes - $curr_hets - $curr_homr; for ($curr_hets = $mid; $curr_hets <= $rare_copies - 2; $curr_hets += 2) { $het_probs[$curr_hets + 2] = $het_probs[$curr_hets] * 4.0 * $curr_homr * $curr_homc /(($curr_hets + 2.0) * ($curr_hets + 1.0)); $sum += $het_probs[$curr_hets + 2]; # add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote $curr_homr--; $curr_homc--; } for (my $i = 0; $i <= $rare_copies; $i++) { $het_probs[$i] /= $sum; } my $p_hwe = 0.0; # p-value calculation for p_hwe for (my $i = 0; $i <= $rare_copies; $i++) { if ($het_probs[$i] > $het_probs[$obs_hets]) { next; } $p_hwe += $het_probs[$i]; } return $p_hwe > 1.0 ? 1.0 : $p_hwe; } sub fill_type { my ($opts,$line) = @_; my @types; for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt); if ( $type eq 'i' ) { push @types, $len>0 ? 'ins' : 'del'; } elsif ( $type eq 's' ) { push @types, $len==1 ? 
'snp' : 'mnp'; } elsif ( $type eq 'o' ) { push @types, 'complex'; } elsif ( $type eq 'b' ) { push @types, 'break'; } elsif ( $type eq 'u' ) { push @types, 'other'; } } $$line[7] = $$opts{vcf}->add_info_field($$line[7],TYPE=>(@types ? join(',',@types) : undef)); } # Stage the lines and then apply filtering if requested, otherwise just print the line sub output_line { my ($opts,$line) = @_; if ( defined $line ) { if ( $$opts{fill_ac_an} ) { fill_ac_an_hwe($opts,$line); } if ( $$opts{fill_type} ) { fill_type($opts,$line); } } if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) ) { # No filters requested, print the line print_line($opts, $line); return; } if ( defined $line ) { # Local filters return the line back immediately if ( scalar keys %{$$opts{filters}} ) { $line = apply_local_filters($opts,$line); } if ( exists($$opts{udef_filters}) ) { $line = apply_user_defined_filters($opts,$line); } } # Staging filters may return nothing or multiple lines. If $line is not defined, they will # empty the buffers my @lines; if ( exists($$opts{filters}{SnpGap}) ) { @lines = apply_snpgap_filter($opts,$line); if ( defined $line && !scalar @lines ) { return; } } elsif ( defined $line ) { @lines=($line); } if ( exists($$opts{filters}{GapWin}) ) { my @tmp; if ( !defined $line ) { push @lines,undef; } for my $line (@lines) { push @tmp, apply_gapwin_filter($opts,$line); } @lines = @tmp; } if ( exists($$opts{SnpCluster_count}) ) { my @tmp; if ( !defined $line ) { push @lines,undef; } for my $line (@lines) { push @tmp, apply_snpcluster_filter($opts,$line); } @lines = @tmp; } for my $line (@lines) { print_line($opts, $line); } } sub remove_tags { my ($opts,$line) = @_; # Remove INFO tags for my $tag (keys %{$$opts{remove}{INFO}}) { my $ifrom=0; my $ito; my $tag_len = length($tag); while (1) { $ifrom = index($$line[7],$tag,$ifrom); if ( $ifrom==-1 ) { last; } if ( $ifrom!=0 && substr($$line[7],$ifrom-1,1) ne ';' ) { $ifrom++; next; } if ( length($$line[7])!=$ifrom+$tag_len ) { 
my $c = substr($$line[7],$ifrom+$tag_len,1); if ( $c ne ';' && $c ne '=' ) { $ifrom+=$tag_len; next; } } $ito = index($$line[7],';',$ifrom+1); last; } if ( !defined $ito ) { next; } # not found my $out; if ( $ifrom>0 ) { $out .= substr($$line[7],0,$ifrom-1); if ( $ito!=-1 ) { $out .= ';'; } } if ( $ito!=-1 ) { $out .= substr($$line[7],$ito+1); } $$line[7] = defined $out ? $out : '.'; } # Remove FORMAT tags for my $tag (keys %{$$opts{remove}{FORMAT}}) { my $idx = $$opts{vcf}->get_tag_index($$line[8],$tag,':'); if ( $idx==-1 ) { next; } for (my $i=8; $i<@$line; $i++) { $$line[$i] = $$opts{vcf}->remove_field($$line[$i],$idx,':'); } } # Remove FILTER if ( exists($$opts{remove}{FILTER}) ) { $$line[6] = ref($$opts{remove}{FILTER}) eq 'HASH' ? $$opts{vcf}->add_filter($$line[6],%{$$opts{remove}{FILTER}}) : 'PASS'; } # Remove ID and QUAL if ( exists($$opts{remove}{ID}) ) { $$line[2] = '.' } if ( exists($$opts{remove}{QUAL}) ) { $$line[5] = '.' } } sub apply_user_defined_filters { my ($opts,$line) = @_; our($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF); $CHROM = $$line[0]; $POS = $$line[1]; $FAIL = 1; $PASS = 0; $RECORD = $line; $VCF = $$opts{vcf}; my %filters = (); if ( $$opts{udef_filters_typecheck_needed} ) { # Check if the line has an indel, SNP or both for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt); if ( exists($$opts{udef_filters}{$type}) ) { %filters = ( %filters, %{$$opts{udef_filters}{$type}} ); } } # Return if the line does not have the wanted variant type if ( !scalar %filters ) { return $line; } } else { %filters = %{$$opts{udef_filters}{all}}; } my %apply; for my $filter (values %filters) { if ( exists($$filter{info_tag}) ) { $MATCH = $$opts{vcf}->get_info_field($$line[7],$$filter{info_tag}); if ( !defined $MATCH ) { next; } } elsif ( exists($$filter{format_tag}) ) { my $idx = $$opts{vcf}->get_tag_index($$line[8],$$filter{format_tag},':'); if ( $idx<0 ) { next; } $MATCH = 
$$opts{vcf}->get_sample_field($line,$idx); } $apply{ $$filter{name} } = &{$$filter{test}} == $PASS ? 0 : 1; } if ( scalar keys %apply ) { $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply); } return $line; } sub apply_local_filters { my ($opts,$line) = @_; if ( !defined $line ) { return; } my $filters = $$opts{filters}; my %apply; if ( exists($$filters{RefN}) ) { $apply{RefN} = ($$line[3]=~/N/) ? 1 : 0; } if ( exists($$filters{Qual}) && $$line[5] ne '.' ) { $apply{Qual} = $$line[5] < $$filters{Qual} ? 1 : 0; } if ( exists($$filters{HWE_G3}) && $$line[7]=~/G3=([^,]+),([^,]+),/ ) { my ($rr,$ra); $rr = $1; $ra = $2; $apply{HWE_G3} = 0; if ( $$line[7]=~/HWE_G3=([^;\t]+)/ && $1<$$filters{HWE_G3} ) { my $p = 2*$rr + $ra; if ( $p>0 && $p<1 && (1-$ra)/($p*(1-$p))<0 ) { $apply{HWE_G3} = 1; } #printf "xHWE: f=%f rr=$rr ra=$ra hwe=$1 p=$p ($$line[1])\n"; } } if ( exists($$filters{HWE}) ) { $apply{HWE} = $$opts{hwe}<$$filters{HWE} && $$opts{icf}<0 ? 1 : 0; } if ( exists($$filters{VDB}) && $$line[7]=~/VDB=([^;,\t]+)/ ) { $apply{VDB} = $1 < $$filters{VDB} ? 1 : 0; } if ( exists($$filters{MinDP}) or exists($$filters{MaxDP}) ) { my $dp; if ( $$line[7]=~/DP=(\d+)/ ) { $dp = $1; } elsif ( $$line[7]=~/DP4=(\d+),(\d+),(\d+),(\d+)/ ) { $dp = $1 + $2 + $3 + $4; } if ( defined $dp ) { if ( exists($$filters{MinDP}) ) { $apply{MinDP} = $dp < $$filters{MinDP} ? 1 : 0; } if ( exists($$filters{MaxDP}) ) { $apply{MaxDP} = $dp > $$filters{MaxDP} ? 1 : 0; } } } if ( exists($$filters{MinAB}) && $$line[7]=~/DP4=\d+,\d+,(\d+),(\d+)/ ) { $apply{MinAB} = $1 + $2 < $$filters{MinAB} ? 1 : 0; } if ( exists($$filters{MinMQ}) && $$line[7]=~/MQ=(\d+)/ ) { $apply{MinMQ} = $1 < $$filters{MinMQ} ? 1 : 0; } if ( (exists($$filters{StrandBias}) or exists($$filters{BaseQualBias}) or exists($$filters{MapQualBias}) or exists($$filters{EndDistBias})) && $$line[7]=~/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ ) { if ( exists($$filters{StrandBias}) ) { $apply{StrandBias} = $1 < $$filters{StrandBias} ? 
1 : 0; } if ( exists($$filters{BaseQualBias}) ) { $apply{BaseQualBias} = $2 < $$filters{BaseQualBias} ? 1 : 0; } if ( exists($$filters{MapQualBias}) ) { $apply{MapQualBias} = $3 < $$filters{MapQualBias} ? 1 : 0; } if ( exists($$filters{EndDistBias}) ) { $apply{EndDistBias} = $4 < $$filters{EndDistBias} ? 1 : 0; } } if ( scalar keys %apply ) { $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply); } return $line; } sub apply_snpgap_filter { my ($opts,$line) = @_; if ( !exists($$opts{SnpGap_buffer}) ) { $$opts{SnpGap_buffer}=[]; } my $vcf = $$opts{vcf}; my $win = $$opts{filters}{SnpGap}; my $buffer = $$opts{SnpGap_buffer}; my ($indel_chr,$indel_pos,$to); if ( defined $line ) { # There may be multiple variants, look for an indel. Anything what is not ref can be filtered. my $is_indel = 0; my $can_be_filtered = 0; for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt); if ( $type eq 'i' ) { $is_indel = 1; $indel_chr = $$line[0]; $indel_pos = $$line[1]+1; } elsif ( $type ne 'r' ) { $can_be_filtered = 1; } } # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not # correct as the indel can begin anywhere in the VCF4.x record with # respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA # ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove # some valid SNPs. $to = $is_indel ? $indel_pos+length($$line[3])-1 : $$line[1]; push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, can_be_filtered=>$can_be_filtered, is_indel=>$is_indel }; } my $n = @$buffer; # Is the new line an indel? 
If yes, check the distance to all previous lines if ( defined $indel_chr ) { for (my $i=0; $i<$n-1; $i++) { my $buf = $$buffer[$i]; if ( $$buf{chr} ne $indel_chr ) { next; } if ( !$$buf{can_be_filtered} ) { next; } if ( $$buf{is_indel} ) { next; } if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; } } } if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} ) { # There are not enough rows in the buffer: the SnpGap window spans them all. Wait until there is more rows # or a new chromosome return (); } # 'Look-behind' filtering was done above, now comes 'look-ahead' filtering my $indel_to; for (my $i=0; $i<$n; $i++) { my $buf = $$buffer[$i]; if ( $$buf{is_indel} ) { $indel_to = $$buf{to}; $indel_chr = $$buf{chr}; next; } if ( !defined $indel_to ) { next; } if ( !$$buf{can_be_filtered} ) { next; } if ( $$buf{chr} ne $indel_chr ) { undef $indel_to; next; } if ( $$buf{from}<=$indel_to+$win-1 ) { $$buf{exclude}=1; } } # Output. If no $line was given, output everything $to = $$buffer[-1]{from}-$win; my $chr = $$buffer[-1]{chr}; my @out; while (@$buffer) { if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; } my $buf = shift(@$buffer); if ( $$buf{exclude} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>1); } else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>0); } push @out,$$buf{line}; } return @out; } sub apply_gapwin_filter { my ($opts,$line) = @_; if ( !exists($$opts{GapWin_buffer}) ) { $$opts{GapWin_buffer}=[]; } my $vcf = $$opts{vcf}; my $win = $$opts{filters}{GapWin}; my $buffer = $$opts{GapWin_buffer}; my $n = @$buffer; my ($indel_chr,$indel_pos,$to); if ( defined $line ) { # There may be multiple variants, only indels can be filtered my $is_indel = 0; my $indel_len = 0; for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt); if ( $type eq 'i' ) { $is_indel = 1; $indel_chr = $$line[0]; $indel_pos = 
$$line[1] + 1; $indel_len = abs($len); # This may remove valid insertions but also artefacts last; } } $to = $$line[1] + $indel_len; my $af = 0; if ( $is_indel ) { # Collect allele frequency to make an educated guess which of the indels to keep $af = $vcf->get_info_field($$line[7],'AF'); if ( !defined $af ) { $af = $vcf->get_info_field($$line[7],'AF1'); # assuming that all records have the same set of annotations, otherwise comparing later AC with AF will be wrong if ( !defined $af ) { $af = $vcf->get_info_field($$line[7],'AC'); } } if ( !defined $af ) { $af=0 } else { $af = $vcf->get_field($af,0,',') } } push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, is_indel=>$is_indel, AF=>$af, exclude=>0 }; # printf "%d-%d\t%d-%d\n", $$buffer[0]{from},$$buffer[0]{to},$$buffer[-1]{from},$$buffer[-1]{to}; # Update the latest gap position and check if the buffer can be flushed if ( !exists($$opts{GapWin_chr}) ) { $$opts{GapWin_chr} = $$line[0]; $$opts{GapWin_to} = $$line[1]; } my $flush = ( $$opts{GapWin_chr} eq $$line[0] && $$line[1]<=$$opts{GapWin_to} ) ? 0 : 1; if ( $is_indel ) { # Check distance to previous indels and set the exclude flags for (my $i=0; $i<$n; $i++) { if ( !$$buffer[$i]{is_indel} ) { next; } if ( $$buffer[$i]{to}>=$indel_pos-$win ) { $$buffer[$i]{exclude}=1; $$buffer[-1]{exclude}=1; } } if ( $$opts{GapWin_chr} ne $$line[0] or $to+$win>$$opts{GapWin_to} ) { $$opts{GapWin_to} = $to+$win; } } $$opts{GapWin_chr} = $$line[0]; if ( !$flush ) { return (); } if ( !$is_indel ) { $$opts{GapWin_to} = 0; } } # Let one of the gaps go through. It may not be the best one, but as there are more # it is likely that at least one of them is real. Better to have the wrong one # than miss it completely. Base the decision on AF. If not present, let the first # one through. 
my $max_af=-1; for (my $i=0; $i<$n; $i++) { if ( !$$buffer[$i]{exclude} ) { next; } if ( $max_af<$$buffer[$i]{AF} ) { $max_af=$$buffer[$i]{AF} } } for (my $i=0; $i<$n; $i++) { if ( !$$buffer[$i]{exclude} ) { next; } if ( $max_af==$$buffer[$i]{AF} ) { $$buffer[$i]{exclude}=0; last; } } my @out; for (my $i=0; $i<$n; $i++) { my $buf = shift(@$buffer); if ( $$buf{exclude} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>1); } else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>0); } push @out,$$buf{line}; } return @out; } sub apply_snpcluster_filter { my ($opts,$line) = @_; my $buffer = $$opts{SnpCluster_buffer}; my $n = @$buffer; # The buffer is empty and the line contains only reference alleles, print directly if ( $n==0 && defined $line && $$line[4] eq '.' ) { $$line[6] = $$opts{vcf}->add_filter($$line[6],'SnpCluster'=>0); return $line; } # Store the line in buffer and check how many lines can be printed my $to; # All lines up to and including this index will be printed my $win = $$opts{SnpCluster_win}; if ( defined $line ) { # Exclude REF (and maybe also other filters?) form SnpCluster my $can_be_filtered = $$line[4] eq '.' ? 0 : 1; push @$buffer, { line=>$line, chr=>$$line[0], pos=>$$line[1], can_be_filtered=>$can_be_filtered, in_cluster=>0 }; $n++; # Does the buffer hold enough lines now? 
my $last_chr = $$buffer[-1]{chr}; my $last_pos = $$buffer[-1]{pos}; for (my $i=$n-1; $i>=0; $i--) { my $buf = $$buffer[$i]; if ( $$buf{chr} ne $last_chr ) { $to=$i; last; } if ( $last_pos - $$buf{pos} >= $win ) { $to=$i; last; } } if ( !defined $to ) { return; } } if ( !defined $to ) { $to=$n-1; } # Calculate the number of variants within the window my $count = 0; my $max_count = $$opts{SnpCluster_count}; my $start_chr = $$buffer[0]{chr}; my $start_pos = $$buffer[0]{pos}; my $idx; for ($idx=0; $idx<$n; $idx++) { my $buf = $$buffer[$idx]; if ( $$buf{chr} ne $start_chr ) { last; } if ( $$buf{pos} - $win >= $start_pos ) { last; } if ( $$buf{can_be_filtered} ) { $count++; } } # If a SNP cluster was found, set the in_cluster flag for all relevant sites. # The buffer will be flushed and the orphans would pass unnoticed. if ( $count>=$max_count ) { for (my $i=0; $i<$idx; $i++) { if ( $$buffer[$i]{can_be_filtered} ) { $$buffer[$i]{in_cluster}=1; } } } # Now output the lines, adding or removing the filter my @out = (); for (my $i=0; $i<=$to; $i++) { my $buf = shift(@$buffer); if ( $$buf{in_cluster} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>1); } else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0); } push @out,$$buf{line}; } # Output all non-variant lines at the beggining of the buffer while (@$buffer) { if ( $$buffer[0]{can_be_filtered} ) { last; } my $buf = shift(@$buffer); $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0); push @out,$$buf{line}; } return @out; } sub print_line { my ($opts, $line) = @_; if ( !defined $line ) { return; } if ( $$opts{hard_filter} && $$line[6] ne '.' && $$line[6] ne 'PASS' ) { return; } print join("\t",@$line) . "\n"; } #--------------------------------- package Reader; use strict; use warnings; use Carp; sub new { my ($class,@args) = @_; my $self = @args ? 
{@args} : {}; bless $self, ref($class) || $class; if ( !$$self{delim} ) { $$self{delim} = qr/\t/; } if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0) if ( !$$self{from} ) { $$self{from} = 1; } # the index of the from column if ( !$$self{to} ) { $$self{to} = 2; } # the index of the to column return $self; } sub throw { my ($self,@msg) = @_; confess @msg; } sub open { my ($self,%args) = @_; if ( !$$self{file} ) { return; } $self->close(); open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!"); } sub close { my ($self) = @_; if ( !$$self{fh} ) { return; } close($$self{fh}); delete($$self{fh}); delete($$self{buffer}); } sub unread_line { my ($self,$line) = @_; unshift @{$$self{buffer}}, $line; return; } sub next_line { my ($self) = @_; if ( !$$self{fh} ) { return undef; } # Run in dummy mode if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); } my $line; # Skip comments while (1) { $line = readline($$self{fh}); if ( !defined $line ) { return undef; } if ( $line=~/^#/ ) { next; } last; } my @items = split($$self{delim},$line); chomp($items[-1]); return \@items; } vcftools-0.1.15/src/perl/vcf-compare000077500000000000000000001600561307140004000173130ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); if ( exists($$opts{plot}) ) { plot_stats($opts); } else { compare_vcfs($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Compare bgzipped and tabix indexed VCF files. (E.g. 
bgzip file.vcf; tabix -p vcf file.vcf.gz)\n", "Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n", " vcf-compare -p plots chr1.cmp chr2.cmp ...\n", "Options:\n", " -a, --apply-filters Ignore lines where FILTER column is anything else than PASS or '.'\n", " -c, --chromosomes Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n", " -d, --debug Debugging information. Giving the option multiple times increases verbosity\n", " -g, --cmp-genotypes Compare genotypes, not only positions\n", " --ignore-indels Exclude sites containing indels from genotype comparison\n", " -m, --name-mapping Use with -g when comparing files with differing column names. The argument to this options is a\n", " comma-separated list or one mapping per line in a file. The names are colon separated and must\n", " appear in the same order as the files on the command line.\n", " --INFO [] Calculate genotype errors by INFO. Use zero based indecies if field has more than one value. Can be\n", " given multiple times.\n", " -p, --plot Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n", " -R, --refseq Compare the actual sequence, not just positions. Use with -w to compare indels.\n", " -r, --regions Process the given regions (comma-separated list or one region per line in a file).\n", " -s, --samples Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n", " -t, --title Title for graphs (see also -p)\n", " -w, --win In repetitive sequences, the same indel can be called at different positions. 
Consider\n", " records this far apart as matching (be it a SNP or an indel).\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], positions => 0, INFOgroup => [ ], INFOgroupIdx => { }, }; while (my $arg=shift(@ARGV)) { if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; } if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; } if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; } if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; } if ( $arg eq '--INFO' ) { # --INFO IMP2 1 (calculate errors by second value of INFO/IMP2 my $infoTag = shift(@ARGV); unshift @{$$opts{INFOgroup}}, $infoTag; if ($ARGV[0] =~ /^\d+$/ ) { $$opts{INFOgroupIdx}{$infoTag} = shift(@ARGV); } next; } if ( $arg eq '--error-by-gl' ) { $$opts{err_by_gl}=1; next; } if ( $arg eq '-a' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; } if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; } if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; } if ( $arg eq '-s' || $arg eq '--samples' ) { my $samples = shift(@ARGV); my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples); $$opts{samples} = \@samples; next; } if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; } if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; } if ( -e $arg ) { push @{$$opts{files}}, $arg; next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{files}) ) { error("What files should be compared?\n") } return $opts; } sub read_list { my ($fname) = @_; my @regions; if ( -e $fname ) { open(my $rgs,'<',$fname) or error("$fname: $!"); while (my $line=<$rgs>) { chomp($line); push @regions, $line; } close($rgs); } else { @regions = split(/,/,$fname); } return (@regions); } sub read_mappings_list { my ($fname,$files) = @_; my @maps = read_list($fname); my %mapping; for my $map (@maps) { my @items = split(/:/,$map); if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); } for (my $i=1; $i<@$files; $i++) { $mapping{$$files[$i]}{$items[$i]} = $items[0]; warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n"); } } return \%mapping; } sub compare_vcfs { my ($opts) = @_; $$opts{match} = {}; $$opts{hapls} = {}; # Open the VCF files and initialize the list of chromosomes my @vcfs; my (@regions,%has_chrom,$mappings); if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); } if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); } print "# This file was generated by vcf-compare.\n"; print "# The command line was: ", join(' ',@{$$opts{args}}), "\n"; print "#\n"; if ( $$opts{debug} ) { print "#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n", "#SD The columns are: \n", "#SD 1 .. chromosome\n", "#SD 2 .. position\n", "#SD 3 .. number of Hom_RR matches\n", "#SD 4 .. number of Het_RA matches\n", "#SD 5 .. number of Hom_AA matches\n", "#SD 6 .. number of Hom_RR mismatches\n", "#SD 7 .. number of Het_RA mismatches\n", "#SD 8 .. number of Hom_AA mismatches\n", "#SD 9 .. site's non-reference discordance rate\n"; print "#AM ALT mismatches. The columns are:\n", "#AM 1 .. chromosome\n", "#AM 2 .. 
position\n", "#AM 3 .. ALT in the first file\n", "#AM 4 .. differing ALT\n"; print "#RM REF mismatches. The columns are:\n", "#RM 1 .. chromosome\n", "#RM 2 .. position\n", "#RM 3 .. REF in the first file\n", "#RM 4 .. differing REF\n"; } my $ifile = 0; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $$vcf{vcf_compare_ID} = $ifile++; $vcf->parse_header(); $vcf->close(); $$vcf{nread} = 0; push @vcfs, $vcf; # Update the list of known chromosomes if ( !exists($$opts{regions_list}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @regions, $chr; } } # Check if column names need to be renamed if ( defined $mappings && exists($$mappings{$$vcf{file}}) ) { $$vcf{_col_mapping} = $$mappings{$$vcf{file}}; for my $name (keys %{$$vcf{_col_mapping}}) { if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); } my $new_name = $$vcf{_col_mapping}{$name}; $$vcf{_col_mapping_rev}{$new_name} = $name; } } } # Include only matching samples in haplotype comparison if ( $$opts{cmp_genotypes} ) { my %all_samples; for my $vcf (@vcfs) { if ( exists $$opts{samples} ) { for my $sample (@{$$opts{samples}}) { if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; } if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ } } } else { my @samples = $vcf->get_samples(); for my $sample (@samples) { if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; } $all_samples{$sample}++ } } } my @include_samples; while (my ($sample,$count)=each %all_samples) { if ( $count != scalar @vcfs ) { next; } push @include_samples, $sample; } if ( !@include_samples ) { error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n"); } $$opts{gt_samples_compared} = scalar @include_samples; for my 
$vcf (@vcfs) { my @include; if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; } else { for my $sample (@include_samples) { push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample } } $vcf->set_samples(include=>\@include); } } # Go through all the files simultaneously and get the stats. for my $region (@regions) { # Open files for my $vcf (@vcfs) { delete($$vcf{last_line}); $vcf->open(region=>$region); delete($$vcf{eof}); } do_region_stats($opts,\@vcfs); } report_stats($opts,\@vcfs); for my $vcf (@vcfs) { if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); } } } sub report_stats { my ($opts,$vcfs) = @_; # if ( $$opts{debug} ) # { # use Data::Dumper; print Dumper($opts); # } my (@counts,%totals); while (my ($key,$num) = each %{$$opts{match}}) { my @files = split(/'/,$key); for my $file (@files) { $totals{$file} += $num; } push @counts, {count=>$num, files=>[@files]}; } print "#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n", "#VN The columns are: \n", "#VN 1 .. number of sites unique to this particular combination of files\n", "#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n"; for my $rec (sort {$$a{count}<=>$$b{count}} @counts) { my $num = $$rec{count}; my $files = $$rec{files}; print "VN\t$num"; for my $file (@$files) { printf "\t$file (%.1f%%)", $num*100.0/$totals{$file}; } print "\n"; } if ( $$opts{refseq} && $$opts{indels} ) { print "#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n", "#IN .. todo\n", "#IN Number of matching indel haplotypes shared across:\n"; while (my ($file,$stat) = each %{$$opts{indels}}) { print "IN\t$file\n"; my $match = $$stat{match} ? $$stat{match} : 0; my $mismatch = $$stat{mismatch} ? 
$$stat{mismatch} : 0; printf "\t\tNumber of matches: %d\n", $match; printf "\t\t mismatches: %d\n", $mismatch; printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch); } } print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"; printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0; printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0; printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0; printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0; printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0; my $out; for my $vcf (@$vcfs) { if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; } if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; } my $diff = $$vcf{nread}-$totals{$$vcf{file}}; my $reported = $totals{$$vcf{file}}; my $total = $$vcf{nread}; $out .= sprintf "SN\tNumber of lost sites:\t%d\t%.1f%%\t%d\t%d\t%s\n", $diff,$diff*100.0/$total,$total,$reported,$$vcf{file}; } if ( $out ) { print "# Number of sites lost due to grouping (e.g. 
duplicate sites): lost, %lost, read, reported, file\n"; print $out; } if ( !$$opts{cmp_genotypes} ) { return; } my %summary; for my $id (keys %{$$opts{hapls}}) { for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; } $$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; } $$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; } $$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; } $$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'}; $summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'}; $summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'}; } for my $key (qw(het_RA_ het_AA_)) { if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; } $$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'}; if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; } $$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'}; if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; } $$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'}; } } print "#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n", "#GS The columns are:\n", "#GS 1 .. variant type\n", "#GS 2 .. number of mismatches\n", "#GS 3 .. number of matches\n", "#GS 4 .. discordance\n"; print_gs($opts,\%summary); print "\n", "#GC Genotype Comparison. 
Use `grep ^GC | cut -f 2-` to extract this part.\n", "#GC The columns are:\n", "#GC 1 .. Sample\n", "#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n", "#GC 7-9 .. Gtype lost: total het_RA het_AA \n", "#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n", "#GC 15-17 .. Phase lost: total het_RA het_AA \n", "#GC 18 .. Phase gained\n", "#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n", "#GC 24 .. Phased matches: het_RA \n", "#GC 25 .. Misphased matches: het_RA \n"; for my $id (keys %{$$opts{hapls}}) { print "GC\t$id"; for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; } for my $key (qw(total_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; } for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; } for my $key (qw(total_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; } if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; } print "\t",$$opts{hapls}{$id}{phase_gained}; for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; } for my $key (qw(het_RA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; } for my $key (qw(het_RA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; } print "\n"; } print "#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n", "#AF The columns are:\n", "#AF 1 .. Non-ref allele count\n", "#AF 2 .. Hom(RR) matches\n", "#AF 3 .. Het(RA) matches\n", "#AF 4 .. Hom(AA) matches\n", "#AF 5 .. Het(AA) matches\n", "#AF 6 .. Hom(RR) mismatches\n", "#AF 7 .. Het(RA) mismatches\n", "#AF 8 .. Hom(AA) mismatches\n", "#AF 9 .. 
Het(AA) mismatches\n"; for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}}) { print "AF\t$ac"; for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0; } for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0; } print "\n"; } for my $infoTag ( @{$$opts{INFOgroup}} ) { print "#INFO/".$infoTag." Number of matching and mismatching genotypes vs INFO/". $infoTag. (exists($$opts{INFOgroupIdx}{$infoTag}) ? "[".$$opts{INFOgroupIdx}{$infoTag}."]" : ""). ". Use `^INFO/".$infoTag." | cut -f 2-` to extract this part.\n", "#INFO/".$infoTag." The columns are:\n", "#INFO/".$infoTag." 1 .. INFO/". $infoTag. (exists($$opts{INFOgroupIdx}{$infoTag}) ? "[".$$opts{INFOgroupIdx}{$infoTag}."]\n" : "\n"), "#INFO/".$infoTag." 2 .. Hom(RR) matches\n", "#INFO/".$infoTag." 3 .. Het(RA) matches\n", "#INFO/".$infoTag." 4 .. Hom(AA) matches\n", "#INFO/".$infoTag." 5 .. Het(AA) matches\n", "#INFO/".$infoTag." 6 .. Hom(RR) mismatches\n", "#INFO/".$infoTag." 7 .. Het(RA) mismatches\n", "#INFO/".$infoTag." 8 .. Hom(AA) mismatches\n", "#INFO/".$infoTag." 9 .. Het(AA) mismatches\n", "#INFO/".$infoTag." 10 .. Non-reference Discordance Rate\n"; for my $info (sort {$a<=>$b} keys %{$$opts{counts_by_INFO}{$infoTag}}) { print "INFO/".$infoTag."\t$info"; my $nonRefMatches=-$$opts{counts_by_INFO}{$infoTag}{$info}{"hom_RR_"}{matches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{"hom_RR_"}{matches} : 0; my $mismatches=0; for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} : 0; $nonRefMatches += $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} ? 
$$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} : 0; } for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} : 0; $mismatches += $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} : 0; } printf "\t%.2f\n",$mismatches*100.0/($mismatches+$nonRefMatches); } } print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n"; print "#DP The columns are:\n"; print "#DP 1 .. depth\n"; print "#DP 2 .. RR matches\n"; print "#DP 3 .. RA matches\n"; print "#DP 4 .. AA matches\n"; print "#DP 5 .. RR -> RA mismatches\n"; print "#DP 6 .. RR -> AA mismatches\n"; print "#DP 7 .. RA -> RR mismatches\n"; print "#DP 8 .. RA -> AA mismatches\n"; print "#DP 9 .. AA -> RR mismatches\n"; print "#DP 10 .. AA -> RA mismatches\n"; for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}}) { print "DP\t$dp"; for my $type (qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)) { printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0; } print "\n"; } if ( exists($$opts{counts_by_gl}) ) { print "#EQ Errors by quality. Use `grep ^EQ | cut -f 2-` to extract this part.\n"; print "#EQ The columns are:\n"; print "#EQ 1 .. GL\n"; print "#EQ 2 .. number of matches\n"; print "#EQ 3 .. 
number of mismatches\n"; for my $qual (sort {$a<=>$b} keys %{$$opts{counts_by_gl}}) { printf "EQ\t%s\t%d\t%d\n", $qual, $$opts{counts_by_gl}{$qual}{match}?$$opts{counts_by_gl}{$qual}{match}:0, $$opts{counts_by_gl}{$qual}{mismatch}?$$opts{counts_by_gl}{$qual}{mismatch}:0; } } if ( $$opts{debug} ) { print "#MT Mismatch Types\n"; for my $t1 (keys %{$$opts{mismatch_types}}) { for my $t2 (keys %{$$opts{mismatch_types}{$t1}}) { print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n"; } } } } sub print_gs { my ($opts,$stats) = @_; my ($ndr_ms,$ndr_m,@summary); for my $key (qw(hom_RR het_RA hom_AA het_AA)) { my $m = $$stats{"${key}_"}{match}; my $ms = $$stats{"${key}_"}{mismatch}; if ( !$m ) { $m=0; } if ( !$ms ) { $ms=0; } my $err = $m?$ms*100.0/($m+$ms):0; printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$err; $ndr_ms += $ms; $ndr_m += $key eq 'hom_RR' ? 0 : $m; if ( $key eq 'het_AA' ) { next; } if ( $key=~/_(.+)$/ ) { push @summary, sprintf "%s %.2f", $1,$err; } } my $err = $ndr_m+$ndr_ms ? $ndr_ms*100.0/($ndr_m+$ndr_ms) : 0; unshift @summary, sprintf "NDR %.2f", $err; printf "SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $err; print "SN\tSummary:\t", join(', ', @summary), "\n"; } sub read_stats { my ($stats,$file) = @_; open(my $fh,'<',$file) or error("$file: $!"); while (my $line=<$fh>) { if ( $line=~/^#/ ) { next; } my @items = split(/\t/,$line); chomp($items[-1]); if ( $items[0] eq 'DP' ) { my $dp = $items[1]; $$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10]; $$stats{dp}{RR}{RR}{$dp} += $items[2]; $$stats{dp}{RA}{RA}{$dp} += $items[3]; $$stats{dp}{AA}{AA}{$dp} += $items[4]; $$stats{dp}{RR}{RA}{$dp} += $items[5]; $$stats{dp}{n}{RR}{RA} += $items[5]; $$stats{dp}{RR}{AA}{$dp} += $items[6]; $$stats{dp}{n}{RR}{AA} += $items[6]; $$stats{dp}{RA}{RR}{$dp} += $items[7]; $$stats{dp}{n}{RA}{RR} += $items[7]; $$stats{dp}{RA}{AA}{$dp} += $items[8]; $$stats{dp}{n}{RA}{AA} += $items[8]; 
$$stats{dp}{AA}{RR}{$dp} += $items[9]; $$stats{dp}{n}{AA}{RR} += $items[9]; $$stats{dp}{AA}{RA}{$dp} += $items[10]; $$stats{dp}{n}{AA}{RA} += $items[10]; } elsif ( $items[0] eq 'AF' ) { my $af = $items[1]; $$stats{af}{RR}{$af}{matches} += $items[2]; $$stats{af}{RA}{$af}{matches} += $items[3]; $$stats{af}{AA}{$af}{matches} += $items[4]; $$stats{af}{RR}{$af}{mismatches} += $items[6]; $$stats{af}{RA}{$af}{mismatches} += $items[7]; $$stats{af}{AA}{$af}{mismatches} += $items[8]; } elsif ( $items[0] eq 'GS' ) { my $type = $items[1]; $$stats{gs}{$type.'_'}{mismatch} += $items[2]; $$stats{gs}{$type.'_'}{match} += $items[3]; } elsif ( $items[0] eq 'EQ' ) { my $gl = $items[1]; $$stats{counts_by_gl}{$gl}{mismatch} += $items[2]; $$stats{counts_by_gl}{$gl}{match} += $items[3]; } } close($fh); } sub make_dir { my ($prefix) = @_; if ( $prefix=~m{/} ) { # A directory should be created. This will populate dir and prefix, for example # prefix -> dir prefix # ---------------------------- # out out.dump # out/ out/ out/out.dump # out/xxx out/ out/xxx.dump # my $dir = ''; if ( $prefix=~m{/[^/]+$} ) { $dir=$`; } elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; } elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; } if ( $dir ) { `mkdir -p $dir`; } } return $prefix; } sub plot_stats { my ($opts) = @_; my $stats = {}; for my $file (@{$$opts{files}}) { read_stats($stats,$file); plot_site_ndr($opts,$file); } make_dir($$opts{plot}); plot_dp($opts,$$stats{dp}); plot_af($opts,$$stats{af}); plot_ndr($opts,$$stats{af}); plot_dp_ndr($opts,$$stats{dp}); plot_eq($opts,$$stats{counts_by_gl}); print_gs($opts,$$stats{gs}); } sub plot { my ($file) = @_; system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file"); } sub plot_site_ndr { my ($opts,$file) = @_; my ($fname,$gp,$start_chr,@counts); my $start_pos = -1; my $count = 0; my $numerator = 0; my $denominator = 0; open(my $fh,'<',$file) or error("$file: $!"); while (my $line=<$fh>) { if ( 
!($line=~/^SD/) ) { next; } my @items = split(/\t/,$line); my $chr = $items[1]; my $pos = $items[2]; if ( !defined $gp ) { $fname = "$$opts{plot}-ndr-$chr-$pos.gp"; open($gp,'>',$fname) or error("$fname: $!"); print $gp q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-ndr-$chr-$pos.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Alternate allele frequency" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set y2tics set xtic rotate by -45 plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; } if ( $start_pos==-1 ) { $start_pos = $pos; $start_chr = $chr; } $numerator += $items[6] + $items[7] + $items[8]; $denominator += $items[6] + $items[7] + $items[8] + $items[4] + $items[5]; $count += $denominator; if ( $start_pos+50_000 > $pos && $start_chr eq $chr ) { next; } printf $gp "$start_pos\t%.2f\n", $denominator ? $numerator*100.0/$denominator : 0; push @counts, "$start_pos\t$count\n"; $numerator = 0; $denominator = 0; $count = 0; $start_pos = $pos; $start_chr = $chr; } close($fh); # Was the ^SD section found? if ( !@counts ) { return; } push @counts, "$start_pos\t$count\n"; printf $gp "$start_pos\t%.2f\n", $denominator ? 
$numerator*100.0/$denominator : 0; print $gp "end\n"; print $gp join('',@counts), "end\n"; close($gp); plot("$fname"); } sub plot_dp_ndr { my ($opts,$stats) = @_; my ($numerator,$denominator); for my $agt (keys %$stats) { if ( $agt eq 'n' or $agt eq 'ndist' ) { next; } for my $bgt (keys %{$$stats{$agt}}) { if ( $bgt eq 'n' ) { next; } if ( $agt eq 'RR' && $bgt eq 'RR' ) { next; } for my $dp (keys %{$$stats{$agt}{$bgt}}) { if ( $agt ne $bgt ) { $$numerator{$dp} += $$stats{$agt}{$bgt}{$dp}; $$denominator{$dp} += $$stats{$agt}{$bgt}{$dp}; } else { $$denominator{$dp} += $$stats{$agt}{$bgt}{$dp}; } } } } open(my $fh,'>',"$$opts{plot}-dp-ndr.gp") or error("$$opts{plot}-dp-ndr.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 600,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-dp-ndr.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Depth" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set y2tics plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; for my $dp (sort {$a<=>$b} keys %$denominator) { printf $fh "%d\t%.2f\n", $dp,$$denominator{$dp} ? $$numerator{$dp}*100.0/$$denominator{$dp} : 0; } print $fh "end\n"; for my $dp (sort {$a<=>$b} keys %$denominator) { printf $fh "%d\t%d\n", $dp,$$denominator{$dp}; } print $fh "end\n"; close($fh); plot("$$opts{plot}-dp-ndr.gp"); } sub plot_dp { my ($opts,$stats) = @_; my $out; my @plots; for my $agt (sort keys %$stats) { if ( $agt eq 'n' or $agt eq 'ndist' ) { next; } for my $bgt (sort keys %{$$stats{$agt}}) { if ( $bgt eq 'n' ) { next; } if ( $agt eq $bgt ) { next; } for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}}) { $out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100.0/$$stats{n}{$agt}{$bgt} : 0) . 
"\n"; } $out .= "end\n"; push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"]; } } open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!"); print $fh q[ set terminal png size 600,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-dp.png" . q[" set ylabel 'Fraction of GTs [%]' set y2label 'Number of GTs total' set y2tics set ytics nomirror set xlabel 'Depth' set xrange [:20] ]; if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n]; print $fh $out; for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}}) { print $fh "$dp\t$$stats{ndist}{$dp}\n"; } print $fh "end\n"; close($fh); plot("$$opts{plot}-dp.gp"); } sub plot_af { my ($opts,$stats) = @_; open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-af.png" . q[" set grid back lc rgb "#dddddd" set xlabel "Non-reference allele frequency" set ylabel "Concordance" set y2label "Number of genotypes" set yrange [0.0:1.0] set y2tics set key center plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \ '-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \ '-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \ '-' with points pt 20 lc rgb "red" title "HomRef", \ '-' with points pt 20 lc rgb "green" title "Het", \ '-' with points pt 20 lc rgb "blue" title "HomAlt" ]; for my $type (qw(RR RA AA)) { for my $af (sort {$a<=>$b} keys %{$$stats{$type}}) { print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n"; } print $fh "end\n"; } for my $type (qw(RR RA AA)) { for my $af (sort {$a<=>$b} keys %{$$stats{$type}}) { my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}; print $fh "$af\t" . ($n ? 
1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n"; } print $fh "end\n"; } close($fh); plot("$$opts{plot}-af.gp"); } sub plot_ndr { my ($opts,$stats) = @_; open(my $fh,'>',"$$opts{plot}-ndr.gp") or error("$$opts{plot}-ndr.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-ndr.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Alternate allele frequency" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set xrange [0.0:1.0] set y2tics plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; my $afs; for my $type (qw(RA AA)) { for my $af (keys %{$$stats{$type}}) { $$afs{$af}{m} += $$stats{$type}{$af}{matches}; $$afs{$af}{mi} += $$stats{$type}{$af}{mismatches}; } } for my $type (qw(RR)) { for my $af (keys %{$$stats{$type}}) { $$afs{$af}{mi} += $$stats{$type}{$af}{mismatches}; } } my @afs = sort { $a<=>$b } keys %$afs; my $iafs = 0; my $bin_size = 0.02; my @dp; for (my $i=0; $i<=1/$bin_size; $i++) { my $from = $i*$bin_size; my $to = ($i+1)*$bin_size; my ($m,$mi,$af) = (0,0,0); while ( $iafs<@afs && $afs[$iafs]>=$from && $afs[$iafs]<$to ) { $af = $afs[$iafs]; $m += $$afs{$af}{m}; $mi += $$afs{$af}{mi}; $iafs++; } if ( !($m+$mi) ) { next; } printf $fh "$af\t%.2f\n", $m+$mi ? $mi*100.0/($m+$mi) : 0; push @dp, sprintf "$af\t%d",$m+$mi; } print $fh "end\n"; print $fh join("\n",@dp), "\nend\n"; close($fh); plot("$$opts{plot}-ndr.gp"); } sub plot_eq { my ($opts,$stats) = @_; if ( !scalar keys %$stats ) { return; } open(my $fh,'>',"$$opts{plot}-eq.gp") or error("$$opts{plot}-eq.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . 
"$$opts{plot}-eq.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "GL" set ylabel "Number of matches (log)" set y2label "Number of mismatches (log)" set y2tics set ytics nomirror set log y set log y2 plot '-' with lines lw 1 title "Matches", \ '-' axes x1y2 with lines lw 1 title "Mismatches" ]; for my $gl (sort {$a<=>$b} keys %$stats) { print $fh "$gl\t$$stats{$gl}{match}\n"; } print $fh "end\n"; for my $gl (sort {$a<=>$b} keys %$stats) { print $fh "$gl\t$$stats{$gl}{mismatch}\n"; } print $fh "end\n"; close($fh); plot("$$opts{plot}-eq.gp"); } sub do_region_stats { my ($opts,$vcfs) = @_; my $refseq; if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); } my $nvcfs = scalar @$vcfs; my $debug = $$opts{debug} ? $$opts{debug} : 0; my $match = $$opts{match}; my $win = $$opts{win} ? $$opts{win} : 0; while (1) { my $grp = read_next_group($opts,$vcfs,$win); if ( !$grp || !scalar @$grp ) { last } if ( $debug>1 ) { print "Group:\n"; for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; } print "\n"; } my %files; for my $rec (@$grp) { $files{$$rec{vcf}{file}} = 1; } my $key = join(q['],sort(keys %files)); $$match{$key}++; my $npresent = scalar keys %files; if ( $npresent == $nvcfs ) { ref_alt_stats($opts,$grp); } if ( $npresent>1 && defined $refseq ) { cmp_sequence($opts,$grp,$refseq); } if ( $$opts{cmp_genotypes} ) { # Check that in the group there is one record for each file if ( $npresent==$nvcfs && scalar @$grp==$nvcfs ) { cmp_genotypes($opts,$grp); } } } } sub cmp_sequence { my ($opts,$grp,$fa_refseq) = @_; # Detailed comparison will be performed only if there are indels or complex # substitutions, SNPs are interesting only in their presence. There can be # more events from the same file present simultaneously and at multiple # positions. 
They all are treated as separate variants and if any of them # yields a haplotype present in all files, match is reported. # Note that the original version of the code expected all alternate # variants to be present on a single VCF line and was able to compare # consecutive non-overlapping events as one sequence. However, because the # the major producer of indel calls (Dindel) does report one variant per # line, this idea was abandoned. # Check if there are any interesting events. my %has_indels; my %events_per_file; my $vcf = $$grp[0]{vcf}; for (my $igrp=0; $igrp<@$grp; $igrp++) { my $rec = $$grp[$igrp]; my $ifile = $$rec{vcf}{vcf_compare_ID}; my $ref_len = length($$rec{ref}); my @alts = split(/,/,$$rec{alt}); for my $alt (@alts) { if ( $alt eq '.' ) { next; } if ( $alt=~/^$$rec{pos}, alt=>$alt, ref_len=>$ref_len }; # Do complex checking of event type only if it is still not certain if this is waste of time or not if ( exists($has_indels{$ifile}) ) { next; } if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; } elsif ( $ref_len>1 ) { my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt); if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; } } } } # Return if there is nothing interesting if ( scalar keys %has_indels < 2 ) { return; } for my $ifile (keys %events_per_file) { if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); } } # Cache the reference sequence chunk my $ref_from = $$grp[0]{pos} - $$opts{win}; my $ref_to = $$grp[-1]{pos} + $$opts{win}; my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to); # For each file get all possible sequences for my $events (values %events_per_file) { for my $variant (@$events) { my $pos = $$variant{pos}; my $len = $pos - $ref_from; my $seq = $len>0 ? 
substr($refseq,0,$len) : ''; $seq .= $$variant{alt}; $pos += $$variant{ref_len}; if ( $pos<=$ref_to ) { $seq .= substr($refseq,$pos-$ref_from); } $$variant{seq} = $seq; $$variant{length} = length($seq); } } # Now compare the variants: is there a sequence shared across all files? my $match = 1; my @keys = keys %events_per_file; for (my $ikey=0; $ikey<@keys; $ikey++) { my $ivars = $events_per_file{$ikey}; for (my $jkey=0; $jkey<$ikey; $jkey++) { my $jvars = $events_per_file{$jkey}; my $found = 0; for my $ivar (@$ivars) { for my $jvar (@$jvars) { if ( $$ivar{length} != $$jvar{length} ) { next; } if ( $$ivar{seq} ne $$jvar{seq} ) { next; } $found=1; last; } } if ( !$found ) { $match=0; last; } } if ( !$match ) { last; } } my $key = join(q['],sort(values %has_indels)); if ( $match ) { $$opts{indels}{$key}{match}++; } else { $$opts{indels}{$key}{mismatch}++; } } sub ref_alt_stats { my ($opts,$grp) = @_; my $ref = $$grp[0]{ref}; my $alt = join(',',sort split(/,/,$$grp[0]{alt})); my $alt_mismatch; for (my $i=1; $i<@$grp; $i++) { my $rec = $$grp[$i]; if ( $ref ne $$rec{ref} ) { $$opts{ref_mismatch}++; if ( $$opts{debug} ) { print "RM\t$$grp[0]{chr}\t$$grp[0]{pos}\t$$grp[0]{ref}\t$$rec{ref}\n"; } return; } my $tmp = join(',',sort split(/,/,$$rec{alt})); if ( $alt ne $tmp ) { $alt_mismatch = $tmp; } } if ( $alt ne '.' ) { if ( defined $alt_mismatch ) { $$opts{alt_mismatch}++; if ( $$opts{debug} ) { print "AM\t$$grp[0]{chr}\t$$grp[0]{pos}\t$alt\t$alt_mismatch\n"; } } else { $$opts{alt_match}++; } } $$opts{ref_match}++; } sub snp_type { my ($als,$ref) = @_; if ( @$als==1 ) { return $$als[0] eq $ref ? 
'hom_RR_' : 'hom_AA_'; } # Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA) if ( $$als[0] eq $$als[1] ) { if ( $$als[0] eq $ref ) { return 'hom_RR_'; } else { return 'hom_AA_'; } } else { if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; } else { return 'het_AA_'; } } } sub cmp_genotypes { my ($opts,$grp) = @_; my $nrecs = @$grp; my $hapls = $$opts{hapls}; # Break the VCF lines into hashes (required by parse_haplotype) for my $grp_rec (@$grp) { $$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line}); if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; } if ( exists($$grp_rec{vcf}{_col_mapping}) ) { my %new_cols; while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}}) { $new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori}; delete($$grp_rec{rec}{gtypes}{$name_ori}); } while (my ($name,$hash) = each %new_cols) { $$grp_rec{rec}{gtypes}{$name} = $hash; } } } if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); } my $ref = $$grp[0]{rec}{REF}; my %gtype_matches = (); my %gtype_mismatches = (); my $min_dp; my $ndp3 = 0; for my $id (keys %{$$grp[0]{rec}{gtypes}}) { my (@sorted_als1,$nploid,$type,$max_gl); my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id); if ( !$is_empty1 ) { @sorted_als1 = sort @$als1; $nploid = scalar @sorted_als1; $type = snp_type($als1,$ref); } if ( exists($$opts{high_confidence_gls}) ) { my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL}); if ( @gls!=3 or $gls[0] eq '.' ) { next; } @gls = sort {$b<=>$a} @gls; if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; } } if ( exists($$opts{err_by_gl}) && exists($$grp[0]{rec}{gtypes}{$id}{GL}) ) { for my $gl (split(/,/,$$grp[0]{rec}{gtypes}{$id}{GL})) { if ( !defined $max_gl or $gl>$max_gl ) { $max_gl = $gl; } } } # There may be multiple files entering the comparison. 
Report match only if all are present and all match. # Report mismatch if all are present and they do not match. Otherwise report lost/gained event. my $phase_match = 1; my $phase_mismatch = 0; my $gtype_match = 1; my $gtype_lost = 0; my $gtype_gained = 0; my $phase_lost = 0; my $phase_gained = 0; my $type2; for (my $i=1; $i<$nrecs; $i++) { my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id); if ( $is_empty1 ) { $gtype_match = 0; if ( !$is_empty2 ) { $gtype_gained = 1; $type = snp_type($als2,$ref); } if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; } last; } elsif ( $is_empty2 ) { $gtype_match = 0; $gtype_lost = 1; last; } if ( $is_phased1 ) { if ( !$is_phased2 ) { $phase_lost = 1; $phase_match = 0; } } elsif ( $is_phased2 ) { $phase_gained = 1; $phase_match = 0; } else { $phase_match = 0; } # Consider different number of alleles as mismatch (C vs C/C) if ( scalar @$als1 != scalar @$als2 ) { $gtype_match = 0; if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ } last; } if ( exists($$opts{err_by_gl}) && exists($$grp[$i]{rec}{gtypes}{$id}{GL}) ) { for my $gl (split(/,/,$$grp[$i]{rec}{gtypes}{$id}{GL})) { if ( !defined $max_gl or $gl>$max_gl ) { $max_gl = $gl; } } } my @sorted_als2 = sort @$als2; for (my $ial=0; $ial<$nploid; $ial++) { if ( $sorted_als1[$ial] ne $sorted_als2[$ial] ) { $gtype_match = 0; if ( $$opts{debug} ) { my $type2 = snp_type($als2,$ref); $$opts{mismatch_types}{$type}{$type2}++; } last; } } if ( !$gtype_match ) { if ( !defined $type2 && !$is_empty2 ) { $type2 = snp_type($als2,$ref); } last; } # They match, check also if their phase agrees if ( $phase_match && $is_phased1 && $is_phased2 ) { for (my $ial=0; $ial<$nploid; $ial++) { if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_mismatch=1; last; } } } } if ( $gtype_gained ) { $$hapls{$id}{$type.'gtype_gained'}++; if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ } next; } if ( $gtype_lost ) { 
$$hapls{$id}{$type.'gtype_lost'}++; next; } if ( $phase_mismatch ) { $$hapls{$id}{$type.'phase_mismatch'}++; } if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ } elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ } my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1; if ( $gtype_match ) { $$hapls{$id}{$type.'gtype_match'}++; if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ } $gtype_matches{$type}++; $$opts{counts_by_dp}{$dp}{$type.'-'.$type}++; if ( defined $max_gl ) { my $gl = sprintf "%.2f", $max_gl; $$opts{counts_by_gl}{$gl}{match}++; } } elsif ( defined $type ) { $$hapls{$id}{$type.'gtype_mismatch'}++; $gtype_mismatches{$type}++; $$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++; if ( defined $max_gl ) { my $gl = sprintf "%.2f", $max_gl; $$opts{counts_by_gl}{$gl}{mismatch}++; } } } $$opts{hapls_ncmp}++; my %infoGroup; for my $infoTag ( @{$$opts{INFOgroup}} ) { if ( exists($$grp[1]{rec}{INFO}{$infoTag}) ) { if( exists($$opts{INFOgroupIdx}{$infoTag}) ) { my @arr = split(/,/,$$grp[1]{rec}{INFO}{$infoTag}); $infoGroup{$infoTag} = sprintf "%.2f", $arr[$$opts{INFOgroupIdx}{$infoTag}]; } else { $infoGroup{$infoTag} = sprintf "%.2f", $$grp[1]{rec}{INFO}{$infoTag}; } } } # Store the number of matching types by AC my $af; if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) ) { $af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1}; } elsif ( !$$opts{all_samples_af} ) { my $ac = 0; my $an = 0; if ( exists($gtype_matches{hom_AA_}) ) { $ac += 2*$gtype_matches{hom_AA_}; $an += 2*$gtype_matches{hom_AA_}; } if ( exists($gtype_mismatches{hom_AA_}) ) { $ac += 2*$gtype_mismatches{hom_AA_}; $an += 2*$gtype_mismatches{hom_AA_}; } if ( exists($gtype_matches{het_RA_}) ) { $ac += $gtype_matches{het_RA_}; $an += 2*$gtype_matches{het_RA_}; } if ( exists($gtype_mismatches{het_RA_}) ) { $ac += $gtype_mismatches{het_RA_}; $an += 2*$gtype_mismatches{het_RA_}; } if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; } if 
( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; } $af = sprintf "%.2f", $an>0 ? $ac/$an : 0; } else { my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes}); $af = sprintf "%.2f", $an>0 ? $ac/$an : 0; } for my $type (keys %gtype_matches) { for my $infoTag ( @{$$opts{INFOgroup}} ) { $$opts{counts_by_INFO}{$infoTag}{$infoGroup{$infoTag}}{$type}{matches} += $gtype_matches{$type}; } $$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type}; $$opts{gtypes_cmp_total} += $gtype_matches{$type}; } for my $type (keys %gtype_mismatches) { for my $infoTag ( @{$$opts{INFOgroup}} ) { $$opts{counts_by_INFO}{$infoTag}{$infoGroup{$infoTag}}{$type}{mismatches} += $gtype_mismatches{$type}; } $$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type}; $$opts{gtypes_cmp_total} += $gtype_mismatches{$type}; } if ( $$opts{debug} ) { my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0; my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0; my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0; my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0; my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0; my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0; my $denom = $het_ra_m+$hom_aa_m+$hom_rr_mm+$het_ra_mm+$hom_aa_mm; my $ndr = sprintf "%.2f", $denom ? 
($hom_rr_mm+$het_ra_mm+$hom_aa_mm)*100.0/$denom : 0; print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$hom_rr_m\t$het_ra_m\t$hom_aa_m\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$ndr\n"; } } sub read_next_group { my ($opts,$vcfs,$win) = @_; my @grp; my $prev_vcf; my $start; while (1) { my $min_vcf = get_min_position($opts,$vcfs); if ( !$min_vcf ) { last; } if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; } $prev_vcf = $$min_vcf{buf}[0]; if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} ) { my $rec = shift(@{$$min_vcf{buf}}); push @grp,$rec; $start = $$rec{pos}; next; } } return \@grp; } sub get_min_position { my ($opts,$vcfs) = @_; my ($min_pos,$min_vcf); for my $vcf (@$vcfs) { # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($opts,$vcf); } if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; } my $line = $$vcf{buf}[0]; # Designate this position as the minimum of all the files if: # .. is this the first file? if ( !$min_pos ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } # .. has this file lower position? if ( $min_pos>$$line{pos} ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } } return $min_vcf; } sub read_line { my ($opts,$vcf) = @_; if ( $$vcf{eof} ) { return; } my @items; my $line; while ( !defined $line ) { $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } @items = split(/\t/,$line); if ( $$opts{apply_filters} ) { if ( $items[6] ne 'PASS' && $items[6] ne '.' 
) { $line = undef; next; } } } $$vcf{nread}++; my $chr = $items[0]; my $pos = $items[1]; my $ref = uc($items[3]); my $alt = uc($items[4]); if ( $$vcf{buf} && @{$$vcf{buf}} ) { my $prev = $$vcf{buf}[-1]; if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); } } push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf }; return; } vcftools-0.1.15/src/perl/vcf-concat000077500000000000000000000253101307140004000171250ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); if ( $$opts{check_columns} ) { check_columns($opts); } elsif ( !exists($$opts{sort}) ) { concat($opts); } else { concat_merge($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Convenience tool for concatenating VCF files (e.g. VCFs split by chromosome).\n", " In the basic mode it does not do anything fancy except for a sanity check that all\n", " files have the same columns. When run with the -s option, it will perform a partial\n", " merge sort, looking at limited number of open files simultaneously.\n", "Usage: vcf-concat [OPTIONS] A.vcf.gz B.vcf.gz C.vcf.gz > out.vcf\n", "Options:\n", " -c, --check-columns Do not concatenate, only check if the columns agree.\n", " -f, --files Read the list of files from a file.\n", " -p, --pad-missing Write '.' in place of missing columns. 
Useful for joining chrY with the rest.\n", " -s, --merge-sort Allow small overlaps in N consecutive files.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { files=>[] }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-p' || $arg eq '--pad-missing' ) { $$opts{pad_missing}=1; next; } if ( $arg eq '-s' || $arg eq '--merge-sort' ) { $$opts{sort}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--check-columns' ) { $$opts{check_columns}=1; next; } if ( $arg eq '-f' || $arg eq '--files' ) { my $files = shift(@ARGV); open(my $fh,'<',$files) or error("$files: $!"); while (my $line=<$fh>) { chomp($line); push @{$$opts{files}},$line; } close($fh); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{files}},$arg; next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( ! @{$$opts{files}} ) { error("No files to concat?\n") } return $opts; } sub can_be_padded { my ($opts,$cols1,$cols2) = @_; if ( @$cols1<@$cols2 ) { error(sprintf "Not ready for this, sorry, expected fewer columns (%d!<%d)", @$cols1,@$cols2); } my $has1 = {}; my $has2 = {}; for (my $i=0; $i<@$cols1; $i++) { $$has1{$$cols1[$i]} = $i; } for (my $i=0; $i<@$cols2; $i++) { if ( !exists($$has1{$$cols2[$i]}) ) { error("The column [$$cols2[$i]] not seen previously."); } $$has2{$$cols2[$i]} = $i; } my @map; for (my $i=0; $i<@$cols1; $i++) { my $cname = $$cols1[$i]; push @map, exists($$has2{$cname}) ? 
$$has2{$cname} : -1; } return \@map; } sub check_columns { my ($opts) = @_; my @columns; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); if ( @columns ) { my $different_order; my $different_columns; if ( @columns != @{$$vcf{columns}} ) { warn("Different number of columns in [$file].\n"); } if ( $$opts{pad_missing} && can_be_padded($opts,\@columns,$$vcf{columns}) ) { next; } for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { if ( !exists($$vcf{has_column}{$columns[$i]}) ) { warn("The column names do not match; the column \"$columns[$i]\" no present in [$file].\n"); $different_columns = $columns[$i]; } elsif ( !defined $different_order ) { $different_order = $columns[$i]; } } } if ( defined $different_order && !defined $different_columns ) { warn("The columns ordered differently in [$file]. Use vcf-shuffle-cols to reorder.\n"); } } else { @columns = @{$$vcf{columns}}; } $vcf->close(); } } sub concat { my ($opts) = @_; my @columns; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); my $map; if ( @columns ) { if ( @columns != @{$$vcf{columns}} ) { if ( !$$opts{pad_missing} ) { error(sprintf "Different number of columns in [%s], expected %d, found %d\n", $file,scalar @columns,scalar @{$$vcf{columns}}); } $map = can_be_padded($opts,\@columns,$$vcf{columns}); } else { my $different_order; for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { if ( !exists($$vcf{has_column}{$columns[$i]}) ) { error("The column names do not match; the column \"$columns[$i]\" no present in [$file].\n"); } elsif ( !defined $different_order ) { $different_order = $columns[$i]; } } } if ( defined $different_order ) { error("The columns ordered differently in [$file]. 
Use vcf-shuffle-cols to reorder.\n"); } } } else { @columns = @{$$vcf{columns}}; print $vcf->format_header(); } while (my $line=$vcf->next_line()) { if ( defined $map ) { my @line = split(/\t/,$line); chomp($line[-1]); my @out; for my $idx (@$map) { if ( $idx==-1 ) { push @out,'.'; } else { push @out,$line[$$map[$idx]] } } print join("\t",@out),"\n"; } else { print $line; } } } } sub get_chromosomes { my ($files) = @_; my @out; my %has_chrm; for my $file (@$files) { my $vcf = Vcf->new(file=>$file); my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrm{$chr}) ) { next; } $has_chrm{$chr} = 1; push @out,$chr; } } return \@out; } sub concat_merge { my ($opts) = @_; my $header_printed = 0; my $chroms = get_chromosomes($$opts{files}); for my $chr (@$chroms) { my $reader = Reader->new(files=>$$opts{files},nsort=>$$opts{sort},seq=>$chr,header_printed=>$header_printed); $header_printed = 1; $reader->open_next(); while (1) { my $line = $reader->next_line(); if ( !defined $line ) { if ( !$reader->open_next() ) { last; } next; } print $line; } } if ( !$header_printed ) { my $vcf = Vcf->new(file=>$$opts{files}[0]); $vcf->parse_header(); print $vcf->format_header(); } } #--------------------------------- package Reader; use strict; use warnings; use Carp; use Vcf; sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( !$$self{files} ) { $self->throw("Expected the files option.\n"); } if ( !$$self{nsort} ) { $$self{nsort} = 2; } if ( $$self{nsort}>@{$$self{files}} ) { $$self{nsort} = scalar @{$$self{files}}; } $$self{idxs} = undef; $$self{vcfs} = undef; return $self; } sub throw { my ($self,@msg) = @_; confess @msg; } sub print_header { my ($self,$vcf) = @_; if ( $$self{header_printed} ) { return; } print $vcf->format_header(); $$self{header_printed} = 1; } # Open VCF, parse header, check column names and when callled for the first time, output the VCF header. 
sub open_vcf { my ($self,$file) = @_; my $vcf = Vcf->new(file=>$file,region=>$$self{seq},print_header=>1); $vcf->parse_header(); if ( !exists($$self{columns}) ) { $$self{columns} = [ @{$$vcf{columns}} ]; } else { if ( @{$$self{columns}} != @{$$vcf{columns}} ) { $self->throw("Different number of columns in [$file].\n"); } for (my $i=0; $i<@{$$self{columns}}; $i++) { if ( $$vcf{columns}[$i] ne $$self{columns}[$i] ) { $self->throw("The column names do not agree in [$file].\n"); } } } $self->print_header($vcf); return $vcf; } sub open_next { my ($self) = @_; if ( !defined $$self{idxs} ) { for (my $i=0; $i<$$self{nsort}; $i++) { $$self{idxs}[$i] = $i; } } else { my $prev = $$self{idxs}[-1]; shift(@{$$self{idxs}}); shift(@{$$self{vcfs}}); if ( $prev+1 < @{$$self{files}} ) { # New file to be opened push @{$$self{idxs}}, $prev+1; } } for (my $i=0; $i<@{$$self{idxs}}; $i++) { if ( exists($$self{vcfs}[$i]) ) { next; } my $idx = $$self{idxs}[$i]; $$self{vcfs}[$i] = $self->open_vcf($$self{files}[$idx]); } if ( !@{$$self{idxs}} ) { return 0; } return 1; } sub next_line { my ($self) = @_; my $min = $$self{vcfs}[0]->next_line(); if ( !defined $min ) { return undef; } if ( !($min=~/^(\S+)\t(\d+)/) ) { $self->throw("Could not parse the line: $min\n"); } my $min_chr = $1; my $min_pos = $2; my $min_vcf = $$self{vcfs}[0]; for (my $i=1; $i<@{$$self{vcfs}}; $i++) { if ( !exists($$self{vcfs}[$i]) ) { next; } my $line = $$self{vcfs}[$i]->next_line(); if ( !defined $line ) { next; } if ( !($line=~/^(\S+)\t(\d+)/) ) { $self->throw("Could not parse the line: $line\n"); } my $chr = $1; my $pos = $2; if ( $chr ne $min_chr ) { $self->throw("FIXME: When run with the -s option, only one chromosome can be present.\n"); } if ( $min_pos > $pos ) { $min_pos = $pos; $min_vcf->_unread_line($min); $min_vcf = $$self{vcfs}[$i]; $min = $line; } else { $$self{vcfs}[$i]->_unread_line($line); } } return $min; } 
vcftools-0.1.15/src/perl/vcf-consensus000077500000000000000000000220121307140004000176720ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); do_consensus($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: cat ref.fa | vcf-consensus [OPTIONS] in.vcf.gz > out.fa\n", "Options:\n", " -h, -?, --help This help message.\n", " -H, --haplotype Apply only variants for the given haplotype (1,2)\n", " -i, --iupac-codes Apply variants in the form of IUPAC ambiguity codes\n", " -s, --sample If not given, all variants are applied\n", "Examples:\n", " # Get the consensus for one region. The fasta header lines are then expected\n", " # in the form \">chr:from-to\".\n", " samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n", "\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-s' || $arg eq '--sample' ) { $$opts{sample}=shift(@ARGV); next; } if ( $arg eq '-i' || $arg eq '--iupac-codes' ) { $$opts{iupac}=1; next; } if ( $arg eq '-H' || $arg eq '--haplotype' ) { $$opts{haplotype}=shift(@ARGV); next; } if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( $$opts{iupac} ) { delete($$opts{iupac}); $$opts{iupac}{CT} = 'Y'; $$opts{iupac}{TC} = 'Y'; $$opts{iupac}{AG} = 'R'; $$opts{iupac}{GA} = 'R'; $$opts{iupac}{AT} = 'W'; $$opts{iupac}{TA} = 'W'; $$opts{iupac}{GC} = 'S'; $$opts{iupac}{CG} = 'S'; $$opts{iupac}{TG} = 'K'; $$opts{iupac}{GT} = 'K'; $$opts{iupac}{CA} = 'M'; $$opts{iupac}{AC} = 'M'; $$opts{iupac}{AA} = 'A'; $$opts{iupac}{CC} = 'C'; $$opts{iupac}{GG} = 'G'; $$opts{iupac}{TT} = 'T'; } if ( exists($$opts{haplotype}) && !exists($$opts{sample}) ) { error("Expected -s option with -H.\n"); } return $opts; } sub do_consensus { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcf_file}); $vcf->parse_header; if ( exists($$opts{sample}) ) { if ( !exists($$vcf{has_column}{$$opts{sample}}) ) { error("No such sample: $$opts{sample}"); } $$opts{vcf} = $vcf; $$opts{sample_col} = $$vcf{has_column}{$$opts{sample}}; } my $chrs = $vcf->get_chromosomes(); my %chrs = map { $_=>0 } @$chrs; my ($chr,$vcf_pos,$warned,$vcf_line); while (my $line=) { if ( $line=~/^>([^:\s]+)/ ) { $chr = $1; for my $line (@{$$vcf{buffer}}) { apply_variant($opts,$line); } flush_fa_buffer($opts,0); my $rest = $'; if ( $rest=~/^:(\d+)-\d+$/ ) { print STDERR "Looks as fasta file snippet, the sequence $chr starts at position $1\n"; $$opts{fa_pos} = $1; } else { $$opts{fa_pos} = 1; } $$opts{fa_idx} = 0; $$opts{fa_frz} = 0; if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; } my $region = $$opts{fa_pos} > 1 ? "$chr:$$opts{fa_pos}" : $chr; $vcf->open(region=>$region); print $line; next; } chomp($line); if ( !$$opts{case_known} ) { if ( uc($line) eq $line ) { $$opts{case_known} = 'u'; } elsif ( lc($line) eq $line ) { $$opts{case_known} = 'l'; } else { $$opts{case_known} = 'u'; } } $$opts{fa_buf} .= $line; $$opts{fa_len} += length($line); while ( defined($vcf_line = $vcf->next_data_array()) ) { # can the beginning of the buffer be printed? 
if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1] ) { $vcf->_unread_line($vcf_line); flush_fa_buffer($opts,60); last; } # is the buffer long enough? if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1]+length($$vcf_line[3]) ) { $vcf->_unread_line($vcf_line); last; } apply_variant($opts,$vcf_line); } if ( !defined $vcf_line ) { flush_fa_buffer($opts,60); } } flush_fa_buffer($opts,0); for my $chr (keys %chrs) { if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); } } } sub flush_fa_buffer { my ($opts,$len) = @_; while ( $$opts{fa_len} && $$opts{fa_len}>=60 ) { print substr($$opts{fa_buf},0,60,''), "\n"; $$opts{fa_len} -= 60; $$opts{fa_pos} += 60 - $$opts{fa_idx}; $$opts{fa_idx} = 0; } if ( $len or !$$opts{fa_len} ) { return; } print $$opts{fa_buf},"\n"; $$opts{fa_pos} += $$opts{fa_len}-$$opts{fa_idx}; $$opts{fa_len} = 0; $$opts{fa_buf} = ''; $$opts{fa_idx} = 0; } sub apply_variant { my ($opts,$vline) = @_; if ( $$vline[4] eq '.' ) { return; } my $hap = exists($$opts{haplotype}) ? $$opts{haplotype} : 0; my $alt; if ( !exists($$opts{sample_col}) ) { # No sample requested, applying all sites, first ALT my $idx; $alt = ($idx=index($$vline[4],','))==-1 ? 
$$vline[4] : substr($$vline[4],0,$idx); if ( exists($$opts{iupac}) && length($$vline[3])==1 && length($alt)==1 ) { $alt = uc($$vline[3].$alt); if ( !exists($$opts{iupac}{$alt}) ) { error("No IUPAC code for \"$alt\"\n"); } $alt = $$opts{iupac}{$alt}; } } else { my $igt = $$opts{vcf}->get_tag_index($$vline[8],'GT',':'); if ( $igt==-1 ) { return; } my $gt = $$opts{vcf}->get_field($$vline[$$opts{sample_col}-1],$igt); my @als = $$opts{vcf}->split_gt($gt); if ( $hap ) { # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect if ( $hap <= @als && $als[$hap-1] ne '0' ) { $alt = $$opts{vcf}->get_field($$vline[4],$als[$hap-1]-1,','); } } else { if ( exists($$opts{iupac}) && length($$vline[3])==1 ) # only for SNPs and with -i { my @alts; for my $al (@als) { if ( $al eq '.' ) { last; } if ( $al eq '0' ) { push @alts,uc($$vline[3]); } else { $alt = $$opts{vcf}->get_field($$vline[4],$al-1,','); push @alts, uc($alt); if ( length($alt)!=1 ) { last; } } } if ( @alts==2 ) { if ( !exists($$opts{iupac}{$alts[0].$alts[1]}) ) { error("No IUPAC code for \"$alts[0]/$alts[1]\"\n"); } $alt = $$opts{iupac}{$alts[0].$alts[1]}; } elsif ( length($alts[0])==1 ) { if ( !exists($$opts{iupac}{$alts[0].$alts[0]}) ) { error("No IUPAC code for \"$alts[0]/$alts[0]\"\n"); } $alt = $$opts{iupac}{$alts[0].$alts[0]}; } } else { for my $al (@als) { if ( $al eq '0' or $al eq '.' ) { next; } $alt = $$opts{vcf}->get_field($$vline[4],$al-1,','); last; } } } if ( !defined $alt or $alt eq $$vline[3] ) { return; } } if ( $$vline[1] <= $$opts{fa_frz} ) { print STDERR "Note: Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n"; return; } my $pos = $$vline[1] - $$opts{fa_pos} + $$opts{fa_idx}; if ( $pos<0 or $pos>=$$opts{fa_len} ) { error("FIXME: $$vline[0]:$$vline[1] .. 
$$opts{fa_pos},$pos,$$opts{fa_len},$$opts{fa_frz}\n"); } # Sanity check my $ref_len = length($$vline[3]); if ( $$vline[3] ne uc(substr($$opts{fa_buf},$pos,$ref_len)) ) { error(sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n", substr($$opts{fa_buf},$pos,$ref_len), substr($$opts{fa_buf},$pos+1,$ref_len+5), $$vline[3], $$opts{fa_frz}?$$opts{fa_frz}:0); } if ( $$opts{case_known} eq 'l' ) { $alt = lc($alt); } my $alt_len = length($alt); substr($$opts{fa_buf},$pos,$ref_len,$alt); $$opts{fa_len} += $alt_len - $ref_len; $$opts{fa_pos} += $ref_len; # position with respect to the original reference sequence $$opts{fa_idx} += $alt_len; # position in the modified sequence $$opts{fa_frz} = $$vline[1] + $ref_len - 1; # freeze changes until this position } vcftools-0.1.15/src/perl/vcf-contrast000077500000000000000000000211351307140004000175140ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); query_vcf($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "About: Finds differences amongst samples adding NOVELGT, NOVELAL and NOVELTY annotations to INFO field.\n", " Note that haploid genotypes are internally treated as homozygous diploid genotypes, therefore\n", " \"0/1\" and \"1\" are considered different genotypes.\n", "Usage: vcf-contrast + - [OPTIONS] file.vcf.gz\n", "Options:\n", " + List of samples where unique variant is expected\n", " - List of background samples\n", " -d, --min-DP Minimum depth across all - samples\n", " -f, --apply-filters Skip sites with FILTER column different from PASS or \".\"\n", " -n, --novel-sites Print only records with novel genotypes\n", " -h, -?, --help This help message.\n", "Example:\n", " # Test if any of the samples A,B is different from all C,D,E\n", " vcf-contrast +A,B -C,D,E -m file.vcf.gz\n", "\n", " # Same 
as above but printing only sites with novel variants and table output\n", " vcf-contrast -n +A,B -C,D,E -m file.vcf.gz | vcf-query -f '\%CHROM \%POS\\t\%INFO/NOVELTY\\t\%INFO/NOVELAL\\t\%INFO/NOVELGT[\\t\%SAMPLE \%GTR \%PL]\\n'\n", "\n", " # Similar to above but require minimum mapping quality of 20\n", " vcf-annotate -f MinMQ=20 file.vcf.gz | vcf-contrast +A,B,C -D,E,F -f\n", "\n"; exit -1; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args=>[$0, @ARGV], }; while (defined(my $arg=shift(@ARGV))) { if ( -e $arg ) { $$opts{vcf}=$arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-d' || $arg eq '--min-DP' ) { $$opts{min_dp}=shift(@ARGV); next; } if ( $arg eq '-n' || $arg eq '--novel-sites' ) { $$opts{novel_only}=1; next; } if ( $arg eq '-f' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; } if ( $arg=~/^\+/ && !exists($$opts{var_samples}) ) { @{$$opts{var_samples}}=split(/,/,$'); next } if ( $arg=~/^-/ && !exists($$opts{bg_samples}) ) { @{$$opts{bg_samples}}=split(/,/,$'); next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{var_samples}) ) { error("Missing the list of variant samples (+).\n") } if ( !exists($$opts{bg_samples}) ) { error("Missing the list of background samples (-).\n") } return $opts; } sub init_columns { my ($vcf,@samples) = @_; my @out; for my $sample (@samples) { push @out, $vcf->get_column_index($sample); } return \@out; } sub query_vcf { my ($opts) = @_; my $vcf = exists($$opts{vcf}) ? 
Vcf->new(file=>$$opts{vcf}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header; $vcf->add_header_line({key=>'INFO',ID=>'NOVELAL',Number=>'.',Type=>'String',Description=>'List of samples with novel alleles'}); $vcf->add_header_line({key=>'INFO',ID=>'NOVELGT',Number=>'.',Type=>'String',Description=>'List of samples with novel genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'NOVELTY',Number=>'1',Type=>'Integer',Description=>'vcf-contrast novelty score'}); $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print $vcf->format_header(); $$opts{var_cols} = init_columns($vcf,@{$$opts{var_samples}}); $$opts{bg_cols} = init_columns($vcf,@{$$opts{bg_samples}}); while (my $rec=$vcf->next_data_array) { if ( $$opts{apply_filters} && $$rec[6] ne '.' && $$rec[6] ne 'PASS' ) { next; } if ( $$rec[4] eq '.' ) { next; } my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my ($novel,$novelal,$novelgt) = contrast($opts,$vcf,$rec); if ( $novel ) { my %info = ( NOVELTY=>$novel ); if ( scalar keys %$novelal ) { my @tmp; for my $col (keys %$novelal) { push @tmp, $$vcf{columns}[$col]; } $info{NOVELAL} = join(',',@tmp); } elsif ( scalar keys %$novelgt ) { my @tmp; for my $col (keys %$novelgt) { push @tmp, $$vcf{columns}[$col]; } $info{NOVELGT} = join(',',@tmp); } $$rec[7]=$vcf->add_info_field($$rec[7],%info); } elsif ( $$opts{novel_only} ) { next; } print $vcf->format_line($rec); } } sub contrast { my ($opts,$vcf,$rec) = @_; my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my $has_PL = $ipl<0 ? 
0 : 1; my $igt; if ( !$has_PL ) { $igt = $vcf->get_tag_index($$rec[8],'GT',':'); if ( $igt<0 ) { error("GT not available: $$rec[0]:$$rec[1]\n"); } } my $idp; if ( exists($$opts{min_dp}) ) { $idp = $vcf->get_tag_index($$rec[8],'DP',':'); if ( $idp<0 ) { error("todo: DP not available"); } } my @x = split(/,/, $$rec[4]); my $n_als = 1 + scalar @x; my (@bg_pls, @bg_als, @bg_gts, @var_pls,@var_gts, $min_dp); for my $bg_col (@{$$opts{bg_cols}}) { if ( defined $idp ) { my $dp = $vcf->get_field($$rec[$bg_col],$idp); if ( !defined $min_dp or $min_dp>$dp ) { $min_dp=$dp; } } my @gt; if ( $has_PL ) { my $pl = $vcf->get_field($$rec[$bg_col],$ipl); ($pl, @gt) = likely_gt($pl, $n_als); push @bg_pls, $pl; } else { my $gt = $vcf->get_field($$rec[$bg_col],$igt); @gt = $vcf->split_gt($gt); } push @bg_als, \@gt; push @bg_gts, join('/',sort(@gt)); } if ( defined $min_dp && $min_dp<$$opts{min_dp} ) { return undef; } my %novel_gt; my %novel_al; my $min_score; for my $var_col (@{$$opts{var_cols}}) { my (@var_als,$var_pl); if ( $has_PL ) { $var_pl = $vcf->get_field($$rec[$var_col],$ipl); ($var_pl,@var_als) = likely_gt($var_pl, $n_als); @var_als = sort @var_als; push @var_pls, $var_pl; } else { my $gt = $vcf->get_field($$rec[$var_col],$igt); @var_als = sort($vcf->split_gt($gt)); } my $var_gt = join('/',sort(@var_als)); push @var_gts, $var_gt; my $bg_score; my %als; for (my $i=0; $i<@{$$opts{bg_cols}}; $i++) { my $score; if ( $has_PL ) { if ( $var_pls[0] eq '.' or substr($bg_pls[$i],0,1) eq '.' ) { next; } $score = same_pls($var_pl, $bg_pls[$i]); } else { if ( $var_als[0] eq '.' or $bg_als[$i][0] eq '.' 
) { next; } $score = same_gts(\@var_als, $bg_als[$i]); } if ( !defined $bg_score or $score<$bg_score ) { $bg_score = $score; } for my $al (@{$bg_als[$i]}) { $als{$al} = 1; } if ( $var_gt ne $bg_gts[$i] ) { $novel_gt{$var_col} = 1; } } if ( !$bg_score ) { next; } if ( !defined $min_score or $min_score>$bg_score ) { $min_score = $bg_score; } for my $al (@var_als) { if ( !exists($als{$al}) ) { $novel_al{$var_col} = 1; } } } if ( !$min_score ) { return undef; } if ( !scalar keys %novel_gt && !scalar keys %novel_al ) { return undef; } return ($min_score,\%novel_al,\%novel_gt); } sub likely_gt { my ($pl, $nals) = @_; my @pls = split(/,/,$pl); my ($min,$imin,$jmin); if ( $nals==@pls ) { # haploid: treat as fake diploid my @out_pls; $min = $pls[0]; $imin = 0; for (my $i=1; $i<@pls; $i++) { if ( $min>$pls[$i] ) { $min = $pls[$i]; $imin = $i; } } for (my $i=0; $i<$nals; $i++) { for (my $j=0; $j<$i; $j++) { push @out_pls,255; } push @out_pls, $pls[$i]; } return (join(',',@out_pls), $imin,$imin); } # diploid my $idx=0; my $i = 0; while ($idx<@pls) { if ( $pls[$idx] eq '.' 
) { return '.'; } for (my $j=0; $j<=$i; $j++) { if ( $idx>=@pls ) { error("Unexpected number of PL values with n_als=$nals: $pl\n"); } if ( !defined $min or $min>$pls[$idx] ) { $min=$pls[$idx]; $imin=$i; $jmin=$j; } $idx++; } $i++; } return ($pl,$jmin,$imin); } sub same_pls { my ($pla,$plb) = @_; my @pla = split(/,/,$pla); my @plb = split(/,/,$plb); my $min; my $imin; for (my $i=0; $i<@pla; $i++) { if ( !defined $min or $pla[$i]+$plb[$i]<$min ) { $min=$pla[$i]+$plb[$i]; $imin=$i; } } return $min; } sub same_gts { my ($gta,$gtb) = @_; if ( @$gta != @$gtb ) { return 255; } for (my $i=0; $i<@$gta; $i++) { if ( $$gta[$i] ne $$gtb[$i] ) { return 255; } } return 0; } vcftools-0.1.15/src/perl/vcf-convert000077500000000000000000000140331307140004000173360ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; use FindBin; use lib "$FindBin::Bin"; use FaSlice; my $opts = parse_params(); convert_file($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Convert between VCF versions.\n", "Usage: cat in.vcf | vcf-convert [OPTIONS] > out.vcf\n", "Options:\n", " -r, --refseq The reference sequence in samtools faindexed fasta file. (Not required with SNPs only.)\n", " -v, --version 4.0, 4.1, 4.2\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { version=>'4.2' }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-v' || $arg eq '--version' ) { $$opts{version}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } return $opts; } sub convert_file { my ($opts) = @_; # How to recognise a number my $FLOAT_RE = qr/^\-?\d+\.?\d*(?:[eE][+-]\d+)?$/; my $vcf_in = Vcf->new(fh=>\*STDIN); my $vcf_out = Vcf->new(version=>$$opts{version}); if ( $$opts{version} < $$vcf_in{version} ) { warn("Downgrading of VCF versions is experimental: expect troubles!\n"); } # Convert the header $vcf_in->parse_header(); for (my $i=1; $i<@{$$vcf_in{header_lines}}; $i++) { $vcf_out->add_header_line($$vcf_in{header_lines}[$i]); } $vcf_out->add_columns(@{$$vcf_in{columns}}); print $vcf_out->format_header(); # Convert each data line my $fa; while (my $x=$vcf_in->next_data_hash()) { # Convert missing (default) FORMAT values for my $gt (values %{$$x{gtypes}}) { for my $field (@{$$x{FORMAT}}) { # Skip the GT tag, so that ploidy information is not lost ("./." would become ".") if ( $field eq 'GT' ) { next; } if ( $field eq 'FT' && $$gt{$field} eq $$vcf_in{filter_passed} ) { $$gt{$field}=$$vcf_out{filter_passed}; } if ( exists($$vcf_in{defaults}{$field}) && $$vcf_in{defaults}{$field} eq $$gt{$field} ) { $$gt{$field} = $$vcf_out{defaults}{$field}; next; } if ( !exists($$gt{$field}) ) { next; } if ( exists($$vcf_in{header}{FORMAT}{$field}{default}) && $$vcf_in{header}{FORMAT}{$field}{default} eq $$gt{$field} ) { delete($$gt{$field}); next; } } } # Change missing QUAL: In case they are numbers, do numeric comparison, as -1.0 is sometimes used instead of -1 if ( $$x{QUAL} eq $$vcf_in{defaults}{QUAL} or ($$x{QUAL}=~$FLOAT_RE && $$vcf_in{defaults}{QUAL}=~$FLOAT_RE && $$x{QUAL}==$$vcf_in{defaults}{QUAL}) ) { $$x{QUAL} = $$vcf_out{defaults}{QUAL}; } for (my $i=0; $i<@{$$x{FILTER}}; $i++) { if ( $$x{FILTER}[$i] eq $$vcf_in{filter_passed} ) { $$x{FILTER}[$i] = $$vcf_out{filter_passed}; } } # Parse the ALT column and see if there are indels my $has_indel = 0; for my $alt (@{$$x{ALT}}) { my ($type,$len,$ht) = $vcf_in->event_type($x,$alt); if ( $type eq 's' or $type eq 'r' ) { next; } if ( $type ne 'i' ) 
{ error("FIXME: expected indel at $$x{CHROM}:$$x{POS}\n"); } $has_indel = 1; } # If there is an indel, new REF and ALT must be changed if ( $has_indel ) { my $map = {}; my $alt_to_mapref = {}; for my $alt (@{$$x{ALT}}) { my ($type,$len,$ht) = $vcf_in->event_type($x,$alt); if ( $type eq 's' or $type eq 'r' ) { $$alt_to_mapref{$alt} = { ref=>$$x{REF}, alt=>$alt }; $$map{$$x{REF}}{$alt} = 1; next; } if ( $type eq 'i' && $len>0 ) { my $tmp = $$x{REF}.$ht; $$alt_to_mapref{$alt} = { ref=>$$x{REF}, alt=>$tmp }; $$map{$$x{REF}}{$tmp} = 1; next; } elsif ( $type eq 'i' && $len<0 ) { if ( !$fa ) { if ( !$$opts{refseq} ) { error("Indels present, missing the -r option.\n"); } $fa = FaSlice->new(file=>$$opts{refseq},size=>1_000_000); } my $ref = $fa->get_slice($$x{CHROM},$$x{POS},$$x{POS}+abs($len)); my $nref1 = uc(substr($ref,0,1)); my $oref1 = uc(substr($$x{REF},0,1)); # Sanity check if ( $oref1 ne $nref1 ) { error("Sanity check failed: the ref does not agree at $$x{CHROM}:$$x{POS} .. [$nref1] in .fa, [$oref1] in .vcf\n"); } $$alt_to_mapref{$alt} = { ref=>$ref, alt=>$nref1 }; $$map{$ref}{$nref1} = 1; next; } else { error("Uh, FIXME: $$x{CHROM}:$$x{POS} [$type] [$len] [$ht]\n"); } } $$x{REF} = $vcf_out->fill_ref_alt_mapping($map); if ( !defined $$x{REF} ) { error("Failed on line $$x{CHROM}:$$x{POS}\n"); } for (my $i=0; $i<@{$$x{ALT}}; $i++) { my $ori_ref = $$alt_to_mapref{$$x{ALT}[$i]}{ref}; my $ori_alt = $$alt_to_mapref{$$x{ALT}[$i]}{alt}; $$x{ALT}[$i] = $$map{$ori_ref}{$ori_alt}; } } print $vcf_out->format_line($x); } } vcftools-0.1.15/src/perl/vcf-fix-newlines000077500000000000000000000043721307140004000202730ustar00rootroot00000000000000#!/usr/bin/env perl # # Authors: Adam Auton, Petr Danecek # (C) 2011 use strict; use warnings; use Carp; my $opts = parse_params(); fix_file($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Reads in a VCF file with any (commonly used) newline representation and 
outputs with the\n", " current system's newline representation.\n", "Usage: vcf-fix-newlines [OPTIONS]\n", "Options:\n", " -i, --info Report if the file is consistent with the current platform based.\n", " -h, -?, --help This help message.\n", "Example:\n", " vcf-fix-newlines -i file.vcf\n", " vcf-fix-newlines file.vcf.gz > out.vcf\n", " cat file.vcf | vcf-fix-newlines > out.vcf\n", "\n"; } sub parse_params { my $opts = {}; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-i' || $arg eq '--info' ) { $$opts{info}=1; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( !exists($$opts{vcf}) && -e $arg ) { $$opts{vcf}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub fix_file { my ($opts) = @_; my $fh = \*STDIN; if ( $$opts{vcf} ) { if ( $$opts{vcf}=~/\.gz/i ) { open($fh,"gunzip -c $$opts{vcf} |") or error("gunzip -c $$opts{vcf}: $!\n"); } else { open($fh,'<',$$opts{vcf}) or error("$$opts{vcf}: $!\n"); } } # Read a small 1kb sample binmode $fh or error("binmode: $!"); local $/ = \1024; my $buf = <$fh>; if ( !defined $buf ) { error("No data read.\n"); } # Check the origin my ($in,$nl); if ( $buf=~/\015\012/ ) { $in = 'Windows'; $nl=$&; } elsif ( $buf=~/\015/ && !($buf=~/\012/) ) { $in = 'Old Mac'; $nl=$&; } elsif ( $buf=~/\012/ && !($buf=~/\015/) ) { $in = 'UNIX'; $nl=$&; } else { error("FIXME: Unable to determine the system which produced the file.\n"); } if ( defined $in ) { warn("The file was generated on $in compatible system.\n"); } if ( $$opts{info} ) { close($fh); return; } if ( $nl eq "\n" ) { warn("No conversion needed.\n"); return; } # Read the file and do the conversion local $/ = $nl; $buf .= <$fh>; $buf =~ s/$nl/\n/g; print $buf; while($buf = <$fh>) { $buf =~ s/$nl/\n/g; print $buf; } close($fh); } vcftools-0.1.15/src/perl/vcf-fix-ploidy000077500000000000000000000244651307140004000177540ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; 
use warnings; use Carp; use Vcf; my $opts = parse_params(); fix_ploidy($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "Usage: cat broken.vcf | vcf-fix-ploidy [OPTIONS] > fixed.vcf\n", "Options:\n", " -a, --assumed-sex M or F, required if the list is not complete in -s\n", " -l, --fix-likelihoods Add or remove het likelihoods (not the default behaviour)\n", " -p, --ploidy Ploidy definition. The default is shown below.\n", " -s, --samples List of sample sexes (sample_name [MF]).\n", " -h, -?, --help This help message.\n", "Default ploidy definition:\n", " ploidy =>\n", " {\n", " X =>\n", " [\n", " # The pseudoautosomal regions 60,001-2,699,520 and 154,931,044-155,270,560 with the ploidy 2\n", " { from=>1, to=>60_000, M=>1 },\n", " { from=>2_699_521, to=>154_931_043, M=>1 },\n", " ],\n", " Y =>\n", " [\n", " # No chrY in females and one copy in males\n", " { from=>1, to=>59_373_566, M=>1, F=>0 },\n", " ],\n", " MT =>\n", " [\n", " # Haploid MT in males and females\n", " { from=>1, to => 16_569, M=>1, F=>1 },\n", " ],\n", " }\n", "\n"; exit -1; } sub parse_params { my $opts = { ploidy => { X => [ { from=>1, to=>60_000, M=>1 }, { from=>2_699_521, to=>154_931_043, M=>1 }, ], Y => [ { from=>1, to=>59_373_566, M=>1, F=>0 }, ], MT => [ { from=>1, to => 16_569, M=>1, F=>1 }, ], }, }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-p' || $arg eq '--ploidy' ) { my $file=shift(@ARGV); my $x=do $file; $$opts{ploidy}=$x; next } if ( $arg eq '-s' || $arg eq '--samples' ) { $$opts{samples}=shift(@ARGV); next } if ( $arg eq '-a' || $arg eq '--assumed-sex' ) { $$opts{assumed_sex}=shift(@ARGV); next } if ( $arg eq '-l' || $arg eq '--fix-likelihoods' ) { $$opts{fix_likelihoods}=1; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{samples}) ) { error("Missing the -s option.\n") } return $opts; } sub fix_ploidy { my ($opts) = @_; my $vcf = $$opts{vcf} = Vcf->new(fh=>\*STDIN); $vcf->parse_header(); init_regions($opts); print $vcf->format_header; my @samples = $vcf->get_samples; my ($prev_chr,$prev_pos,$regions,$iregion,$nregions); my %nchanged; while (my $line = $vcf->next_line) { my $rec = $vcf->next_data_array($line); if ( !defined $prev_chr or $$rec[0] ne $prev_chr ) { $prev_chr = $$rec[0]; $prev_pos = $$rec[1]; if ( exists($$opts{regions}{$prev_chr}) ) { $regions = $$opts{regions}{$prev_chr}; $iregion = 0; $nregions = @$regions; } else { $regions = undef; } } $prev_chr = $$rec[0]; $prev_pos = $$rec[1]; my $samples; if ( defined $regions ) { if ( $prev_pos >= $$regions[$iregion]{from} && $prev_pos <= $$regions[$iregion]{to} ) { $samples = $$regions[$iregion]{samples}; } else { while ( $iregion<$nregions && $$regions[$iregion]{to}<$prev_pos ) { $iregion++; } if ( $iregion>=$nregions ) { undef $regions; } elsif ( $prev_pos >= $$regions[$iregion]{from} && $prev_pos <= $$regions[$iregion]{to} ) { $samples = $$regions[$iregion]{samples}; } } } if ( !defined $samples ) { print $line; next; } my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my $igl = $vcf->get_tag_index($$rec[8],'GL',':'); if ( $igt==-1 ) { print $line; next; } my @alt = split(/,/,$$rec[4]); my $nals = $alt[0] eq '.' ? 
1 : 1 + scalar @alt; my $changed = 0; my $nrec = @$rec; for (my $isample=9; $isample<$nrec; $isample++) { my $sample = $samples[$isample-9]; if ( !exists($$samples{$sample}) ) { next; } my $gt = $vcf->get_field($$rec[$isample],$igt); my ($pl,$gl); if ( $$opts{fix_likelihoods} && $ipl != -1 ) { $pl = $vcf->get_field($$rec[$isample],$ipl); } if ( $$opts{fix_likelihoods} && $igl != -1 ) { $gl = $vcf->get_field($$rec[$isample],$igl); } my ($new_gt, $new_pl, $new_gl); if ( !$$samples{$sample} ) { # missing genotype - leave it as it is unless it must be removed if ( $gt ne '.' && $gt ne './.' ) { my (@als) = $vcf->split_gt($gt); if ( defined $pl && $pl ne '.' ) { ($new_pl) = reploid_g($rec, 1, $nals, $pl, scalar @als, 1); } if ( defined $gl && $gl ne '.' ) { ($new_gl) = reploid_g($rec, -1, $nals, $gl, scalar @als, 1); } $new_gt = '.'; $nchanged{removed}{$sample}++; } } else { my (@als) = $vcf->split_gt($gt); if ( $$samples{$sample} != @als ) { $new_gt = join('/',($als[0]) x $$samples{$sample}); if ( defined $pl && $pl ne '.' ) { ($new_pl,$new_gt) = reploid_g($rec, 1, $nals, $pl, scalar @als, $$samples{$sample}); } if ( defined $gl && $gl ne '.' 
) { ($new_gl,$new_gt) = reploid_g($rec, -1, $nals, $gl, scalar @als, $$samples{$sample}); } } } if ( defined $new_gt ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_gt,$igt,':'); $changed++; } if ( defined $new_pl ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_pl,$ipl,':'); $changed++; } if ( defined $new_gl ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_gl,$igl,':'); $changed++; } } if ( $changed ) { print join("\t",@$rec),"\n"; } else { print $line; } } # Output stats for my $key (sort keys %nchanged) { for my $sample (sort keys %{$nchanged{$key}}) { print STDERR "$sample\t$$opts{samples}{$sample}\t$key\t$nchanged{$key}{$sample}\n"; } } } sub reploid_g { my ($rec, $extr,$nals,$str,$n,$m) = @_; my @vals = split(/,/,$str); if ( $n==2 && $m==1 ) { my @out; my $d = 1; my $k = 0; my ($imin,$min); for (my $i=0; $i<$nals; $i++) { if ( $k>=@vals ) { error("Cannot reploid $$rec[0]:$$rec[1], too few values in $str: $nals, $n->$m ($i,$d,$k)\n"); } if ( $vals[$k] ne '.' && (!defined $min or $min>$extr*$vals[$k]) ) { $min = $extr*$vals[$k]; $imin = $i; } push @out, $vals[$k]; $d++; $k += $d; } my $gt = defined $imin ? $imin : 0; return (join(',',@out), $gt); } elsif ( $n==1 && $m==2 ) { my @out; my ($imin,$min); for (my $i=0; $i<$nals; $i++) { for (my $j=0; $j<=$i; $j++) { push @out, $i==$j ? $vals[$i] : '.'; if ( $vals[$i] ne '.' && (!defined $min or $min>$extr*$vals[$i]) ) { $min = $extr*$vals[$i]; $imin = $i; } } } my $gt = defined $imin ? 
$imin : 0; return (join(',',@out), "$gt/$gt" ); } else { error("Only diploid/haploid cases handled in this version, sorry."); } } sub init_regions { my ($opts) = @_; open(my $fh,'<',$$opts{samples}) or error("$$opts{samples}: $!"); my (%sexes,%samples); while (my $line=<$fh>) { $line =~ s/^\s*//; $line =~ s/\s*$//; if ( !($line=~/^(\S+)\s+(\S+)$/) ) { error("Could not parse the sample file $$opts{sample}, the offending line was: $line"); } push @{$sexes{$2}}, $1; $samples{$1} = $2; } close($fh); $$opts{samples} = \%samples; for my $sample ($$opts{vcf}->get_samples()) { if ( !exists($samples{$sample}) ) { if ( !exists($$opts{assumed_sex}) ) { error("Could not determine the sex of the sample \"$sample\". Would the -a option help here?\n"); } $samples{$sample} = $$opts{assumed_sex}; push @{$sexes{$$opts{assumed_sex}}}, $sample; } } # Create a quick look-up structure for my $chr (keys %{$$opts{ploidy}}) { if ( ref($$opts{ploidy}{$chr}) ne 'ARRAY' ) { error("Uh, expected list reference for $chr regions.\n"); } my $prev; for my $reg (sort { $$a{from}<=>$$b{to} } @{$$opts{ploidy}{$chr}}) { my $from = $$reg{from}; my $to = $$reg{to}; if ( defined $prev && $prev>=$from ) { error("FIXME: Overlapping regions $chr:$prev>=$from\n"); } $prev = $to; my $region; for my $sex (keys %sexes) { if ( !exists($$reg{$sex}) ) { next; } for my $sample (@{$sexes{$sex}}) { $$region{samples}{$sample} = $$reg{$sex}; } } if ( !defined $region ) { next; } $$region{from} = $from; $$region{to} = $to; push @{$$opts{regions}{$chr}}, $region; } } } vcftools-0.1.15/src/perl/vcf-haplotypes000066400000000000000000000430141307140004000200440ustar00rootroot00000000000000#!/usr/bin/env perl # Authors: gevorgyana@MedImmune.com, slidelt@medimmune.com use strict; use warnings; #no warnings "experimental::smartmatch"; use Carp; use Vcf; my $opts = parse_params(); do_haplotypes($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: cat ref.fa | 
vcf-consensus [OPTIONS] in.vcf.gz \n", "Options:\n", " -h, -?, --help This help message.\n", " -H, --haplotypes [] Apply variants for the given haplotypes (1,2). If not given, both haplotypes are applied.\n", " -g, --gff filename Reposition the features in the given gff file with indels for each haplotypes\n", " -s, --samples [] Apply variants for the given samples. If not given, all variants are applied\n", " -o, --output [] Write output files to the given directory. If not given, set to './'\n", " -g, --gff [] GFF input file (optional).", "Examples:\n", " # Get the consensus for one region. The fasta header lines are then expected\n", " # in the form \">chr:from-to\".\n", " samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n", "\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-s' || $arg eq '--samples' ) { $$opts{samples} = shift(@ARGV); next; } if ( $arg eq '-H' || $arg eq '--haplotypes' ) { $$opts{haplotypes} = shift(@ARGV); next; } if ( $arg eq '-o' || $arg eq '--output' ) { $$opts{output} = shift(@ARGV); next; } if ( $arg eq '-g' || $arg eq '--gff' ) { $$opts{gff_file} = shift(@ARGV); next; } if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } return $opts; } sub any (&@) { my ($f, $records) = @_; foreach ( @$records ) { return 1 if $f->(); } return 0; } sub all (&@) { my ($f, $records) = @_; foreach ( @$records ) { return 0 unless $f->(); } return 1; } sub do_haplotypes { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcf_file}); $vcf->parse_header; parse_options( $opts, $vcf ); my $records = []; open_records($opts, $records); my $chrs = $vcf->get_chromosomes(); my %chrs = map { $_=>0 } @$chrs; my $gff; if ($$opts{gff_file}){ $gff = Gff->new(file=>$$opts{gff_file}); my $lines = $gff->read_header(); $_->print_gff ( @$lines ) for @$records; } my ($chr,$vcf_pos,$warned,$vcf_line,$case_known); while (my $line=) { next if ( $line =~ /\A#/ ); if ( $line=~/^>([^:\s]+)/ ) { $_->flush_fa_buffer(0) for @$records; $chr = $1; my $rest = $'; my $fa_pos = ($rest=~/^:(\d+)-(\d+)$/) ? $1 : 1; if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; } my $region = $fa_pos > 1 ? "$chr:$fa_pos" : $chr; $vcf->open(region=>$region); $_->do_header($fa_pos, ">$chr\n") for @$records; $gff->open($region) if defined $gff; next; } chomp($line); unless ( defined $case_known ) { if ( uc($line) eq $line ) { $case_known = 'u'; } elsif ( lc($line) eq $line ) { $case_known = 'l'; } else { $case_known = 'u'; } } $_->add_line($line) for @$records; read_features($gff, $records) if defined $gff; while ( defined($vcf_line = $vcf->next_data_array()) ) { #can the beginning of the buffer be printed? if ( all {$_->buf_end_pos() <= $$vcf_line[1]} $records ){ $vcf->_unread_line($vcf_line); flush ($records, 60); last; } # is the buffer long enough? 
if ( any {$_->buf_end_pos() < $$vcf_line[1]+length($$vcf_line[3]) } $records ) { $vcf->_unread_line($vcf_line); last; } apply_variants($vcf,$records,$vcf_line,$case_known) } flush ($records, 60) if ( !defined $vcf_line ); } flush ($records, 0); for my $chr (keys %chrs) { if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); } } } sub flush { my ( $records, $len ) = @_; for ( @$records ) { $_->flush_fa_buffer($len); $_->flush_features() if $$_{gff}; }; } sub parse_options { my ( $opts, $vcf ) = @_; my @samples; if ( exists( $$opts{samples} ) ) { @samples = split( ",\\s*", $$opts{samples} ); foreach my $sample (@samples) { if ( !exists( $$vcf{has_column}{$sample} ) ) { error("No such sample: $sample"); } push @{ $$opts{sample_col} }, $$vcf{has_column}{$sample}; } } else { @samples = splice @{$$vcf{columns}}, 9; my @columns = @{$$vcf{has_column}}{@samples}; $$opts{sample_col} = \@columns; } $$opts{samples} = \@samples; my @haplotypes; my @allowed = qw/1 2/; if ( exists( $$opts{haplotypes} )) { @haplotypes = split( ",\\s*", $$opts{haplotypes} ); for my $h (@haplotypes){ unless (($h ~~ @allowed)){ error("Illegal haplotype number: \"$h\". Only 1 and 2 are allowed."); } } } @haplotypes = @allowed unless @haplotypes; $${opts{haplotypes}} = \@haplotypes; $$opts{output} = './' unless exists $$opts{output}; } sub open_records { my ( $opts, $records ) = @_; foreach my $sample ( @{ $$opts{samples} } ) { my $sample_col = shift @{ $$opts{sample_col} }; foreach my $haplotype ( @{ $$opts{haplotypes} } ) { my $name = $sample."_".$haplotype; my $record = Record->new( name => $name, hap => $haplotype, col => $sample_col, fastafile => "$$opts{output}/$name.fa", gff => $$opts{gff_file} ); push @$records, $record; } } } sub apply_variants { my ($vcf, $records, $vline, $case_known) = @_; if ( $$vline[4] eq '.' 
) { return; } my $igt = $vcf->get_tag_index($$vline[8],'GT',':'); if ( $igt==-1 ) { return; } for my $record ( @$records ) { my $hap = $$record{hap}; my $alt; my $gt = $vcf->get_field($$vline[$$record{col}-1],$igt); my @als = $vcf->split_gt($gt); # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect if ( $hap <= @als && $als[$hap-1] ne '0' ) { $alt = $vcf->get_field($$vline[4],$als[$hap-1]-1,','); } next if ( !defined $alt ); if ( $case_known eq 'l' ) { $alt = lc($alt); } $record->apply_variant($vline, $alt); } } sub read_features { my ($gff, $records) = @_; while ( my $feature = $gff->next_feature() ){ if ( all { $_->buf_end_pos() <= $$feature{start} } $records ) { $gff->unread($feature); last; } $_->add_feature($feature) for @$records; } } package Record; use List::Util qw (first max); sub new { my ($class,@args) = @_; my $self = {@args}; bless $self, ref($class) || $class; open my $fh, '>', $$self{fastafile} or die "Cannot open $$self{fastafile}: $!"; $$self{fh} = $fh; if ($$self{gff}){ (my $gff_file = $$self{fastafile}) =~ s/\.fa$/.gff/; open my $gh, '>', $gff_file or die "Cannot open $gff_file: $!"; $$self{gh} = $gh; $$self{features} = []; } return $self; } sub flush_fa_buffer { my ($self, $len) = @_; while ( $$self{fa_len} && $$self{fa_len}>=60 ) { $self->print_fasta(substr($$self{fa_buf},0,60,''), "\n"); $$self{fa_len} -= 60; $$self{fa_pos} += 60 - $$self{fa_idx}; $$self{fa_idx} = 0; } if ( $len or !$$self{fa_len} ) { return; } $self->print_fasta($$self{fa_buf}, "\n"); $$self{fa_pos} += $$self{fa_len}-$$self{fa_idx}; $$self{fa_len} = 0; $$self{fa_buf} = ''; $$self{fa_idx} = 0; } sub do_header { my ($self, $fa_pos, $header) = @_; $$self{fa_buf} = ''; $$self{fa_pos} = $fa_pos; $$self{fa_idx} = 0; $$self{fa_frz} = 0; $self->print_fasta($header); $$self{gff_pos} = $fa_pos; $$self{full_fa_len} = $fa_pos; $$self{gff_indel} = 0; $$self{gff_start} = $fa_pos - 1; } sub add_line { my ($self, $line) = @_; $$self{fa_buf} .= $line; my $len = 
length($line); $$self{fa_len} += $len; if ( $$self{gff} ){ $$self{full_fa_len} += $len; $#{$$self{features}} = max ( $#{$$self{features}}, $$self{full_fa_len} - 1 - $$self{gff_pos} ); } } sub buf_end_pos { my ($self) = @_; return $$self{fa_pos} + $$self{fa_len} - $$self{fa_idx}; } sub apply_variant { my ($self, $vline, $alt) = @_; if ( $alt =~ /<.*>/ ){ $self->print_warning("Imprecise structural variant at $$vline[1]: $alt, cannot apply.\n"); return; } if ( $$vline[1] <= $$self{fa_frz} ) { $self->print_warning("Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n"); return; } my $pos = $$vline[1] - $$self{fa_pos} + $$self{fa_idx}; if ( $pos<0 or $pos>=$$self{fa_len} ) { die("FIXME: $$vline[0]:$$vline[1] .. $$self{fa_pos},$pos,$$self{fa_len},$$self{fa_frz}\n"); } # Sanity check my $ref_len = length($$vline[3]); if ( $$vline[3] ne uc(substr($$self{fa_buf},$pos,$ref_len)) ) { die (sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n", substr($$self{fa_buf},$pos,$ref_len), substr($$self{fa_buf},$pos+1,$ref_len+5), $$vline[3], $$self{fa_frz}?$$self{fa_frz}:0); } my $alt_len = length($alt); if ( $$self{gff} && $alt_len != $ref_len ){ $self->apply_indel_to_features($$vline[1], $ref_len, $alt_len); } substr($$self{fa_buf},$pos,$ref_len,$alt); $$self{fa_len} += $alt_len - $ref_len; $$self{fa_pos} += $ref_len; # position with respect to the original reference sequence $$self{fa_idx} += $alt_len; # position in the modified sequence $$self{fa_frz} = $$vline[1] + $ref_len - 1; # freeze changes until this position } sub print_warning { my ($self, @args) = @_; print STDERR $$self{name}, "\t", @args; } sub print_fasta { my ($self, @args) = @_; my $fh = $$self{fh}; print $fh @args; } sub add_feature { my ( $self, $feature ) = @_; my $startRef = { type => 's', feature => $feature }; my $endRef = { type => 'e', feature => $feature }; my $indel = - $$self{gff_pos} + $$self{gff_indel}; push 
@{$$self{features}[$$feature{start} + $indel]}, $startRef; push @{$$self{features}[$$feature{end} + $indel]}, $endRef; } sub apply_indel_to_features { my ( $self, $vpos, $ref_len, $alt_len ) = @_; my $pos = $vpos - $$self{gff_pos} + $$self{gff_indel}; my $l = $alt_len - $ref_len; if ( $l < 0 ) { $self->apply_deletion_to_features($pos + $alt_len, $l); } else { $self->apply_insertion_to_features($pos + $ref_len, $l); } $$self{full_fa_len} += $l; $$self{gff_indel} += $l; } sub remove_feature { my ( $self, $pos, $featRef ) = @_; my $index = first { $$self{features}[$pos][$_] == $featRef } 0 .. $#{ $$self{features}[$pos] }; # Remove the position from the array splice( @{ $$self{features}[$pos] }, $index, 1 ); # No more features here so undef it undef $$self{features}[$pos] unless $$self{features}[$pos]; } sub apply_deletion_to_features { my ( $self, $s, $l ) = @_; my $e = $s - $l - 1; my %olFeatRefs = (); for ( my $i = $s ; $i <= $e ; $i++ ) { next unless defined $$self{features}[$i]; while ( my $featRef = shift @{ $$self{features}[$i] } ) { my $startFeatRef = $olFeatRefs{ $$featRef{feature} }; if ( defined $startFeatRef) { #Overlaps whole feature - delete it $self->remove_feature( $e + 1, $startFeatRef ); $self->print_warning( "Feature deleted: ", "$$startFeatRef{feature}{feature} ", "($$startFeatRef{feature}{start}-$$startFeatRef{feature}{end})", "$$startFeatRef{feature}{rest}\n" ); } elsif ( $$featRef{type} eq 's' ) { push @{$$self{features}[$e + 1]}, $featRef; $olFeatRefs{ $$featRef{feature} } = $featRef; } elsif ( $$featRef{type} eq 'e' ) { push @{$$self{features}[$s - 1]}, $featRef; } } } splice( @{$$self{features}}, $s, $e - $s + 1 ); } sub apply_insertion_to_features { my ( $self, $s, $l ) = @_; my @repl; $#repl = $l - 1; splice( @{$$self{features}}, $s, 0, @repl ); } sub flush_features { my ($self, @args) = @_; my $end = $self->buf_end_pos() + $$self{gff_indel} - $$self{gff_pos}; my %ranges = (); for my $i ( 0..$end ) { my $featureRefs = $$self{features}[$i]; 
next unless $featureRefs; for my $featureRef ( @{$featureRefs} ){ my ( $type, $feature ) = @{$featureRef}{ 'type', 'feature' }; if ($type eq 's'){ @{$ranges{$feature}}{$type, 'feature'} = ($i, $feature); } elsif ( exists $ranges{$feature} ) { #type is 'e' $ranges{$feature}{$type} = $i; } } } #sort by start and end (if existing) positions my @sorted = sort { my $c = $$a{s} <=> $$b{s}; if ( ! $c && exists $$a{e} && exists $$b{e} ){ $c = $$a{e} <=> $$b{e}; } $c; } values %ranges; #first range whose end position has not been read yet my $first_unflushable = first { ! defined $$_{e} } @sorted; #its index, or end of buffer if all ranges have end positions my $first_unflushable_idx = defined $first_unflushable ? $$first_unflushable{s} : $end; #flush the features that can be flushed, sorted by start and end $self->print_feature($_) for grep { $$_{s} < $first_unflushable_idx } @sorted; #remove the features flushed splice @{ $$self{features} }, 0, $first_unflushable_idx; #store the offset of the feature list $$self{gff_pos} += $first_unflushable_idx; } sub print_feature { my ( $self, $range ) = @_; my ( $s, $e, $feature ) = @{$range}{ qw/ s e feature / }; $s += $$self{gff_pos} - $$self{gff_start}; $e += $$self{gff_pos} - $$self{gff_start}; my $line = join "\t", (@{$feature}{ qw/ seqname source feature / }, $s, $e, $$feature{rest}); $self->print_gff($line, "\n"); } sub print_gff { my ($self, @args) = @_; my $gh = $$self{gh}; print $gh @args; } package Gff; sub new { my ($class,@args) = @_; my $self = {@args}; bless $self, ref($class) || $class; $$self{buffer} = []; $$self{fieldnames} = [ qw/ seqname source feature start end rest / ]; return $self; } sub open : method { my ($self, $region) = @_; $self->close(); open my $fh, "tabix $$self{file} $region |" or die "Cannot read tabix header from $$self{file}: $!"; $$self{fh} = $fh; } sub next_feature { my ($self) = @_; return shift(@{$$self{buffer}}) if @{$$self{buffer}}; my $line = readline($$self{fh}); return undef unless $line; 
chomp ($line); my %feature; @feature{@{$$self{fieldnames}}} = split /\t/, $line, @{$$self{fieldnames}}; return \%feature; } sub unread { my ($self,$feature) = @_; unshift @{$$self{buffer}}, $feature; } sub close : method { my ($self) = @_; close delete $$self{fh} if $$self{fh}; } sub read_header { my ($self) = @_; open TABIX, "tabix -h $$self{file} '' |" or die "Cannot read tabix header from $$self{file}: $!"; my @lines = ; close TABIX; return \@lines; } vcftools-0.1.15/src/perl/vcf-indel-stats000077500000000000000000000104301307140004000201020ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); do_stats($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Currently calculates in-frame ratio.\n", "Usage: vcf-indel-stats [OPTIONS] < in.vcf > out.txt\n", "Options:\n", " -h, -?, --help This help message.\n", " -e, --exons Tab-separated file with exons (chr,from,to; 1-based, inclusive)\n", " -v, --verbose\n", "\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-e' || $arg eq '--exons' ) { $$opts{exons}=shift(@ARGV); next; } if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose}=1; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub init_regions { my ($opts) = @_; my $exname = $$opts{exons}=~/\.gz$/i ? 
"gunzip -c $$opts{exons} |" : "<$$opts{exons}"; open(my $exfh, $exname) or error("$exname: $!"); my %regs; while (my $line=<$exfh>) { my ($chr,$from,$to) = split(/\t/,$line); chomp($to); push @{$regs{$chr}}, [$from,$to]; } close($exfh); for my $chr (keys %regs) { $regs{$chr} = [ sort { if ($$a[0]==$$b[0]) {return $$a[1]<=>$$b[1]} else {return $$a[0]<=>$$b[0]} } @{$regs{$chr}} ]; } $$opts{regs} = \%regs; $$opts{iregs} = {}; } sub do_stats { my ($opts) = @_; init_regions($opts); my $vcf = Vcf->new(fh=>\*STDIN); $vcf->parse_header; $$opts{in_frame} = $$opts{out_frame} = 0; my ($prev_chr,$prev_pos); my $ntot=0; while (my $line=$vcf->next_line) { if ( substr($line,0,1) eq '#' ) { next; } my $i=0; my $j; $j=index($line,"\t",$i); my $chr=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); my $pos=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); $i=$j+1; $j=index($line,"\t",$i); my $ref=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); my $alt=substr($line,$i,$j-$i); $i=$j+1; if ( defined $prev_chr && $prev_chr eq $chr && $prev_pos>$pos ) { error("The VCF file must be sorted"); } $prev_chr = $chr; $prev_pos = $pos; if ( $alt eq '.' ) { next; } #print "[$chr] [$pos] [$ref] [$alt]\n"; my $is_indel; $i=0; while (($j=index($alt,',',$i))!=-1) { my ($type,$len,$ht) = $vcf->event_type($ref,substr($alt,$i,$j-$i)); if ( $type eq 'i' or $type eq 'o' ) { check_csq($opts,$chr,$pos,$len); } $i = $j+1; } my ($type,$len,$ht) = $vcf->event_type($ref,substr($alt,$i)); if ( $type eq 'i' or $type eq 'o' ) { $ntot++; check_csq($opts,$chr,$pos,$len); } } printf "total\t%d\n", $ntot; printf "in-frame\t%d\n", $$opts{in_frame}; printf "frameshift\t%d\n", $$opts{out_frame}; printf "ratio\t%f\n", ($$opts{out_frame}+$$opts{in_frame})?$$opts{out_frame}/($$opts{out_frame}+$$opts{in_frame}) : 0; } sub check_csq { my ($opts,$chr,$pos,$len) = @_; my $opos = $pos; if ( !exists($$opts{regs}{$chr}) ) { return; } my $regs = $$opts{regs}{$chr}; my $ir = exists($$opts{iregs}{$chr}) ? 
$$opts{iregs}{$chr} : 0; while ( $ir<@$regs && $$regs[$ir][1] <= $pos ) { $ir++; } $$opts{iregs}{$chr} = $ir; if ( $ir>=@$regs ) { return; } my $reg_to = $$regs[$ir][1]; if ( $reg_to<=$pos ) { return; } my $reg_from = $$regs[$ir][0]; my $to = $len<0 ? $pos-$len : $pos+1; if ( $to<$reg_from ) { return; } $pos++; if ( $pos<$reg_from ) { $len += $reg_from-$pos; $pos = $reg_from; } if ( $reg_to<$to && $len<0 ) { $len += $to-$reg_to; } #print "\tinside $$regs[$ir][0] - $$regs[$ir][1] ($pos,$to,$len)\n"; #if ( $len%3 || ($pos-$reg_from)%3 ) { $$opts{out_frame}++; } if ( $len%3 ) { $$opts{out_frame}++; } else { $$opts{in_frame}++; } if ( $$opts{verbose} ) { print "$chr\t$opos\t$$regs[$ir][0]\t$$regs[$ir][1]\t", ($len%3 ? 'frameshift':'inframe') ,"\n"; } } vcftools-0.1.15/src/perl/vcf-isec000077500000000000000000000512651307140004000166110ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); vcf_isec($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Create intersections, unions, complements on bgzipped and tabix indexed VCF or tab-delimited files.\n", " Note that lines from all files can be intermixed together on the output, which can yield\n", " unexpected results.\n", "Usage: vcf-isec [OPTIONS] file1.vcf file2.vcf ...\n", "Options:\n", " -a, --apply-filters Ignore lines where FILTER column is anything else than PASS or '.'\n", " -c, --complement Output positions present in the first file but missing from the other files.\n", " -d, --debug Debugging information\n", " -f, --force Continue even if the script complains about differing columns, VCF versions, etc.\n", " -o, --one-file-only Print only entries from the left-most file. 
Without -o, all unique positions will be printed.\n", " -n, --nfiles [+-=] Output positions present in this many (=), this many or more (+), or this many or fewer (-) files.\n", " -p, --prefix If present, multiple files will be created with all possible isec combinations. (Suitable for Venn Diagram analysis.)\n", " -r, --regions Do only the given regions (comma-separated list or one region per line in a file).\n", " -t, --tab Tab-delimited file with indexes of chromosome and position columns. (1-based indexes)\n", " -w, --win In repetitive sequences, the same indel can be called at different positions. Consider\n", " records this far apart as matching (be it a SNP or an indel).\n", " -h, -?, --help This help message.\n", "Examples:\n", " bgzip file.vcf; tabix -p vcf file.vcf.gz\n", " bgzip file.tab; tabix -s 1 -b 2 -e 2 file.tab.gz\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { positions=>0, args=>[$0, @ARGV], force=>0, split=>0, report_from_all=>1, apply_filters=>0 }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-p' || $arg eq '--prefix' ) { my $prefix = shift(@ARGV); $$opts{prefix} = init_outdir($opts,$prefix); $$opts{split} = 1; next; } if ( $arg eq '-f' || $arg eq '--force' ) { $$opts{force}=1; next; } if ( $arg eq '-a' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{chromosomes}=shift(@ARGV); next; } if ( $arg eq '-o' || $arg eq '--one-file-only' ) { $$opts{report_from_all}=0; next; } if ( $arg eq '-c' || $arg eq '--complement' ) { $$opts{complement}=1; next; } if ( $arg eq '-n' || $arg eq '--nfiles' ) { my $nfiles = shift(@ARGV); if ( !($nfiles=~/^([\-+=])(\d+)$/) ) { error("Could not parse: [$nfiles]\n"); } $$opts{isec_op} = $1; $$opts{isec_nfiles} = $2; next; } if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}=1; next; } if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--tab' ) { 
my $tab = shift(@ARGV); my ($chr,$pos,$file) = split(/:/,$tab); push @{$$opts{files}}, Reader->new(file=>$file,chr=>$chr-1,pos=>$pos-1); next; } if ( -e $arg ) { push @{$$opts{files}}, $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{files}) ) { error("What files should be intersected?\n") } if ( !$$opts{force} ) { $SIG{__WARN__} = sub { error(@_); } } return $opts; } sub init_outdir { my ($opts,$prefix) = @_; if ( $prefix=~m{/} ) { # A directory should be created. This will populate dir and prefix, for example # prefix -> dir prefix # ---------------------------- # out out.dump # out/ out/ out/out.dump # out/xxx out/ out/xxx.dump # my $dir = ''; if ( $prefix=~m{/[^/]+$} ) { $dir=$`; } elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; } elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; } if ( $dir ) { `mkdir -p $dir`; } } return $prefix; } sub read_chrom_list { my ($fname) = @_; my @chroms; if ( -e $fname ) { open(my $chrms,'<',$fname) or error("$fname: $!"); while (my $line=<$chrms>) { chomp($line); push @chroms, $line; } close($chrms); } else { @chroms = split(/,/,$fname); } return (@chroms); } sub check_columns { my ($opts,$vcfs) = @_; # Do the check for VCF files only for (my $ivcf=0; $ivcf<@$vcfs; $ivcf++) { if ( !exists($$vcfs[$ivcf]{has_column}) ) { next; } for (my $jvcf=0; $jvcf<$ivcf; $jvcf++) { if ( !exists($$vcfs[$jvcf]{has_column}) ) { next; } if ( scalar @{$$vcfs[$ivcf]{columns}} != scalar @{$$vcfs[$jvcf]{columns}} ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The number of sample columns is different:\n", (@icols>9 ? 
scalar @icols - 9 : 0), ": ", join(',',@icols[9..$#icols]),"\n", scalar @jcols - 9, ": ", join(',',@jcols[9..$#jcols]),"\n", ); return; } for my $cname (keys %{$$vcfs[$ivcf]{has_column}}) { if ( !exists($$vcfs[$jvcf]{has_column}{$cname}) or $$vcfs[$ivcf]{has_column}{$cname}!=$$vcfs[$jvcf]{has_column}{$cname} ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The column names do not match (e.g. $cname):\n", join(',',@icols[9..$#icols]),"\n", join(',',@jcols[9..$#jcols]),"\n", ); return; } } for my $cname (keys %{$$vcfs[$jvcf]{has_column}}) { if ( !exists($$vcfs[$ivcf]{has_column}{$cname}) ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The column names do not match (e.g. $cname):\n", join(',',@icols[9..$#icols]),"\n", join(',',@jcols[9..$#jcols]),"\n", ); return; } } } } } sub vcf_isec { my ($opts) = @_; $$opts{match} = {}; # Open the VCF files and initialize the list of chromosomes my @vcfs; my (@chroms,%has_chrom); if ( exists($$opts{chromosomes}) ) { @chroms = read_chrom_list($$opts{chromosomes}); } my $source; my $vcf_version; my $vcf_version_warned; for (my $ifile=0; $ifile<@{$$opts{files}}; $ifile++) { my $file = $$opts{files}[$ifile]; my ($vcf,$file_name); if ( ref($file) eq '' ) { $vcf = Vcf->new(file=>$file); $file_name = $file; } else { $vcf = $file; $file_name = $$file{file}; } $vcf->parse_header(); $vcf->close(); $$vcf{nread} = 0; push @vcfs, $vcf; # Check if the VCF versions are identical if ( ref($file) eq '' ) { if ( !defined $vcf_version ) { $vcf_version = $$vcf{version} } if ( $vcf_version ne $$vcf{version} && !$vcf_version_warned ) { warn("Warning: Mixed VCF format versions, use vcf-convert to unify.\n"); $vcf_version_warned = 1; } } # Update the list of known chromosomes if ( !exists($$opts{chromosomes}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @chroms, $chr; } } 
if ( $ifile ) { # To get the missig fields filled by the default values if ( !$vcfs[0]{delim} ) { for my $hline (@{$$vcf{header_lines}}) { $vcfs[0]->add_header_line($hline,silent=>1); } } $source .= ','; } $source .= "$ifile:$file_name"; $$vcf{vcf_isec_ID} = $ifile; } check_columns($opts,\@vcfs); $$opts{vcfs} = \@vcfs; if ( !$vcfs[0]{delim} && !$$opts{split} ) { $vcfs[0]->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcfs[0]->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp'); $vcfs[0]->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'},silent=>1); print $vcfs[0]->format_header(); } # Go through all the files simultaneously and get the stats. for my $chr (@chroms) { # Open files for my $vcf (@vcfs) { delete($$vcf{last_line}); $vcf->open(region=>$chr); delete($$vcf{eof}); } do_chrm_isec($opts,\@vcfs); } for my $vcf (@vcfs) { if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); } } } sub do_chrm_isec { my ($opts,$vcfs) = @_; my $debug = $$opts{debug} ? 1 : 0; my $win = $$opts{win} ? $$opts{win} : 0; my $complement = $$opts{complement} ? 1 : 0; my $report_from_all = $$opts{report_from_all} ? 
1 : 0; my $nfiles = scalar @{$$opts{files}}; my $isec_nfiles = $nfiles; my $isec_op = '='; if ( exists($$opts{isec_nfiles}) ) { $isec_nfiles = $$opts{isec_nfiles}; $isec_op = $$opts{isec_op}; } my $split = $$opts{split}; while (1) { my $grp = read_next_group($opts,$vcfs,$win); if ( !$grp || !scalar @$grp ) { last } if ( $debug ) { print "Group:\n"; for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; } print "\n"; } my %files; my %srcs; for my $rec (@$grp) { my $vcf = $$rec{vcf}; my $src = $$vcf{vcf_isec_ID}; push @{$files{$src}}, $rec; if ( !$$vcf{delim} ) { # This is a VCF, check filters my $fltr = $$rec{line}[6]; if ( !$split && $fltr ne $$vcf{filter_passed} && $fltr ne $$vcf{defaults}{default} ) { $src .= 'f'; } } $srcs{$$rec{pos}}{$src} = $rec; } if ( $split ) { write_line($opts,$grp,\%srcs); next; } my $nmatches = scalar keys %files; if ( $complement ) { my $src = $$vcfs[0]{vcf_isec_ID}; if ( !exists($files{$src}) ) { next; } if ( $nmatches!=1 ) { next; } } elsif ( $isec_op eq '=' && $isec_nfiles!=$nmatches ) { next; } elsif ( $isec_op eq '+' && $isec_nfiles>$nmatches ) { next; } elsif ( $isec_op eq '-' && $isec_nfiles<$nmatches ) { next; } # The hits are sorted by position in @$grp my ($prev_chr,$prev_pos,$prev_id); for my $rec (@$grp) { if ( !$report_from_all && $$rec{vcf}{vcf_isec_ID}!=0 ) { next; } elsif ( defined $prev_chr && $prev_chr eq $$rec{chr} && $prev_pos eq $$rec{pos} && $prev_id ne $$rec{vcf}{vcf_isec_ID} ) { next; } if ( !$$rec{vcf}{delim} ) { # This is a VCF file, add annotation my @tags = split(/;/,$$rec{line}[7]); my $i; for ($i=0; $i<@tags; $i++) { if ( $tags[$i] eq '.' or $tags[$i]=~/^SF=/ ) { last; } } my $src = join(',',sort keys %{$srcs{$$rec{pos}}}); $tags[$i] = 'SF='.$src; $$rec{line}[7] = join(';',@tags); print join("\t",@{$$rec{line}}) . 
"\n"; } else { print $$rec{line}; } $prev_chr = $$rec{chr}; $prev_pos = $$rec{pos}; $prev_id = $$rec{vcf}{vcf_isec_ID}; } } } sub write_line { my ($opts,$grp,$srcs) = @_; for my $hash (values %$srcs) { my $src = join('_',sort keys %$hash); if ( !exists($$opts{out_files}{$src}) ) { my $id = (sort keys %$hash)[0]; my $vcf = $$opts{vcfs}[$id]; $$opts{out_vcfs}{$src} = $vcf; $$opts{out_recs}{$src} = $id; open($$opts{out_files}{$src},"| bgzip -c > $$opts{prefix}$src.vcf.gz") or error("| bgzip -c > $$opts{prefix}$src.vcf.gz: $!"); if ( !exists($$opts{readme_fh}) ) { open($$opts{readme_fh},'>',"$$opts{prefix}_README") or error("$$opts{prefix}_README: $!"); print {$$opts{readme_fh}} "# This file was produced by vcf-isec. The command line was:\n#\t",join(' ',@{$$opts{args}}),"\n#\n"; } print {$$opts{readme_fh}} "Using file '$$opts{prefix}$src.vcf.gz' for records present in:\n"; for my $rec (sort values %$hash) { print {$$opts{readme_fh}} "\t$$rec{vcf}{file}\n"; } if ( !$$vcf{delim} ) { my $fnames = join(',',sort values %$hash); $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf->add_header_line({key=>'sourceFiles',value=>$fnames},append=>'timestamp'); print {$$opts{out_files}{$src}} $vcf->format_header(); } } } #use Data::Dumper; print Dumper($srcs); for my $pos (keys %$srcs) { my $src = join('_',sort keys %{$$srcs{$pos}}); my $fh = $$opts{out_files}{$src}; my $irec = $$opts{out_recs}{$src}; my $vcf = $$opts{out_vcfs}{$src}; my $rec = $$srcs{$pos}{$irec}; if ( !$$vcf{delim} ) { print $fh join("\t",@{$$rec{line}}) . "\n"; } else { print $fh $$rec{line}; } } } sub read_next_group { my ($opts,$vcfs,$win) = @_; my @grp; my $prev_vcf; my $start; while (1) { my $min_vcf = get_min_position($opts,$vcfs); # No more lines in the buffer? if ( !$min_vcf ) { last; } # Nothing new has been added? if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; } $prev_vcf = $$min_vcf{buf}[0]; # Read everything what falls in the window. 
The window moves to encompass complete clusters. if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} ) { my $rec = shift(@{$$min_vcf{buf}}); push @grp,$rec; $start = $$rec{pos}; next; } } return \@grp; } # Return the minimum position across all opened files. If there is no line in the file's buffer, # advance to the next line. sub get_min_position { my ($opts,$vcfs) = @_; my ($min_pos,$min_vcf); for my $vcf (@$vcfs) { # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($opts,$vcf); } if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; } my $line = $$vcf{buf}[0]; # Designate this position as the minimum of all the files if: # .. is this the first file? if ( !$min_pos ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } # .. has this file lower position? if ( $min_pos>$$line{pos} ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } } return $min_vcf; } # Read one line from a VCF or Reader, split it and save it to a buffer. sub read_line { my ($opts,$vcf) = @_; if ( $$vcf{eof} ) { return; } my $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } $$vcf{nread}++; my ($chr,$pos,$ref,$alt); if ( $$vcf{delim} ) { my @items = split($$vcf{delim},$line); # Reader object $chr = $items[$$vcf{chr}]; $pos = $items[$$vcf{pos}]; $ref = ''; $alt = ''; } else { # We are reading VCF, not a tab-delimited file. Apply filters when requested. my @items = split(/\t/,$line); while ( $$opts{apply_filters} && $items[6] ne 'PASS' && $items[6] ne '.' 
) { $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } @items = split(/\t/,$line); } chomp($items[-1]); $chr = $items[0]; $pos = $items[1]; $ref = $items[3]; $alt = $items[4]; $line = \@items; } if ( $$vcf{buf} && @{$$vcf{buf}} ) { my $prev = $$vcf{buf}[-1]; if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); } } push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf }; return; } #--------------------------------- package Reader; use strict; use warnings; use Carp; sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( $$self{cmd} ) { $$self{file} = ''; open($$self{fh},$$self{cmd}) or $self->throw("$$self{cmd}: $!"); } if ( !$$self{file} && !$$self{fh} ) { $self->throw("Expected the file or fh option.\n"); } if ( !$$self{delim} ) { $$self{delim} = qr/\t/; } if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0) if ( !$$self{pos} ) { $$self{pos} = 1; } # the index of the position column return $self; } sub throw { my ($self,@msg) = @_; confess @msg; } sub open { my ($self,%args) = @_; if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); } $self->close(); if ( $$self{file}=~/\.gz$/i ) { if ( exists($args{region}) && defined($args{region}) ) { open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!"); } else { open($$self{fh},"gunzip -c $$self{file} |") or $self->throw("gunzip -c $$self{file} |: $!"); } } else { open($$self{fh},'<',$$self{file}) or $self->throw("$$self{file}: $!"); } } sub close { my ($self) = @_; if ( !$$self{fh} ) { return; } close($$self{fh}); delete($$self{fh}); delete($$self{buffer}); } sub _unread_line { my ($self,$line) = @_; unshift @{$$self{buffer}}, $line; return; } sub next_line { my ($self) = @_; my $line; if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); } return 
readline($$self{fh}); } sub parse_header { my ($self) = @_; $self->open(); while (1) { my $line = $self->next_line(); if ( !$line ) { last; } if ( $line=~/^#/ ) { push @{$$self{header}},$line; next; } $self->_unread_line($line); last; } } sub format_header { my ($self) = @_; if ( $$self{header} ) { return join('',@{$$self{header}}); } return ''; } sub get_chromosomes { my ($self) = @_; if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); } my (@out) = `tabix -l $$self{file}`; if ( $? ) { $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]); } for (my $i=0; $i<@out; $i++) { chomp($out[$i]); } return \@out; } vcftools-0.1.15/src/perl/vcf-merge000077500000000000000000000547141307140004000167670ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); merge_vcf_files($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak join('',@msg); } die "About: Merges VCF files by position, creating multi-sample VCFs from fewer-sample VCFs.\n", " The tool requires bgzipped and tabix indexed VCF files on input. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n", " If you need to concatenate VCFs (e.g. files split by chromosome), look at vcf-concat instead.\n", "Usage: vcf-merge [OPTIONS] file1.vcf file2.vcf.gz ... > out.vcf\n", "Options:\n", " -c, --collapse treat as identical sites with differing alleles [any]\n", " -d, --remove-duplicates If there should be two consecutive rows with the same chr:pos, print only the first one.\n", " -H, --vcf-header Use the provided VCF header\n", " -h, -?, --help This help message.\n", " -r, --regions Do only the given regions (comma-separated list or one region per line in a file).\n", " -R, --ref-for-missing Use the REF allele instead of the default missing genotype. 
Because it is not obvious\n", " what ploidy should be used, a user-defined string is used instead (e.g. 0/0).\n", " -s, --silent Try to be a bit more silent, no warnings about duplicate lines.\n", " -t, --trim-ALTs If set, redundant ALTs will be removed\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], joiner => { 'I16' => \&joiner_dp4, 'DP4' => \&joiner_dp4, 'MQ0' => \&joiner_sum, 'DP' => \&joiner_sum, }, trim_redundant_ALTs => 0, collapse_any => 1, }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--remove-duplicates' ) { $$opts{rm_dups}=1; next; } if ( $arg eq '-R' || $arg eq '--ref-for-missing' ) { $$opts{ref_for_missing}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--trim-ALTs' ) { $$opts{trim_redundant_ALTs}=1; next; } if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{silent_dups}=1; next; } if ( $arg eq '-H' || $arg eq '--vcf-header' ) { $$opts{vcf_header}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--collapse' ) { $$opts{collapse_any} = 0; my $c = shift(@ARGV); if ( $c eq 'snps' ) { $$opts{collapse_snps}=1; } elsif ( $c eq 'indels' ) { $$opts{collapse_indels}=1; } elsif ( $c eq 'both' ) { $$opts{collapse_snps}=1; $$opts{collapse_indels}=1; } elsif ( $c eq 'any' ) { $$opts{collapse_any}=1; } elsif ( $c eq 'none' ) { $$opts{collapse_any}=0; $$opts{collapse_snps}=0; $$opts{collapse_indels}=0; } else { error("Expected one of with -c"); } next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{files}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -? for help.\n"); } if ( !exists($$opts{files}) ) { error() } return $opts; } # Returns the common prefix of the files. 
sub common_prefix { my ($files) = @_; my @paths; my $len = -1; for my $file (@$files) { my @path = split(m{/+},$file); if ( $len<0 || $len>scalar @path ) { $len=scalar @path; } push @paths, \@path; } my @common; for (my $i=0; $i<$len; $i++) { my $identical=1; for (my $ifile=1; $ifile[$i] ne $paths[0]->[$i] ) { $identical=0; last; } } if ( !$identical ) { last; } push @common, $paths[0]->[$i]; } return join('/+',@common); } sub read_region_list { my ($opts) = @_; my @regions = (); if ( exists($$opts{regions_list}) ) { if ( -e $$opts{regions_list} ) { open(my $rgs,'<',$$opts{regions_list}) or error("$$opts{regions_list}: $!"); while (my $line=<$rgs>) { chomp($line); push @regions, $line; } close($rgs); } else { @regions = split(/,/,$$opts{regions_list}); } } return (@regions); } sub check_AGtags_definition { my ($vcf) = @_; if ( $$vcf{version} >= 4.1 ) { return; } # Whatever is the value set to, the user takes the responsibility for the merging strategy used if ( exists($ENV{DONT_FIX_VCF40_AG_TAGS}) ) { return; } my @tags; if ( exists($$vcf{header}{INFO}{PL}) && $$vcf{header}{INFO}{PL}{Number} != -1 ) { push @tags, 'PL'; } if ( exists($$vcf{header}{INFO}{GL}) && $$vcf{header}{INFO}{GL}{Number} != -1 ) { push @tags, 'GL'; } if ( exists($$vcf{header}{INFO}{AC}) && $$vcf{header}{INFO}{AC}{Number} != -1 ) { push @tags, 'AC'; } if ( exists($$vcf{header}{INFO}{AF}) && $$vcf{header}{INFO}{AF}{Number} != -1 ) { push @tags, 'AF'; } if ( !@tags ) { return; } $ENV{DONT_FIX_VCF40_AG_TAGS} = 1; my $tags = join(',',@tags); print STDERR "Warning: The $tags tag(s) will not be merged correctly for multiallelic sites.\n", " To be handled correctly, please redefine with Number=. 
or set the environment\n", " variable DONT_FIX_VCF40_AG_TAGS=0.\n"; } sub init_cols { my ($opts,$vcf_out) = @_; my $prefix; my @regions = read_region_list($opts); my @vcfs; my @cols; my %has_chrom; my %col_names; my $icol = 9; my $ncols_total = 0; if ( !$$opts{has_col_names} ) { $prefix = common_prefix($$opts{files}); } # Go through all files and read header, obtain list of chromosomes. The file names will be used for columns, unless # they were read from the header. for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $$vcf{line_buffer} = []; $vcf->parse_header(); check_AGtags_definition($vcf); $vcf->close(); push @vcfs, $vcf; # Precompute the weighting factor for the QUAL column my $ncols = scalar @{$$vcf{columns}} - 9; if ( $ncols<=0 ) { $ncols = 1; } $$vcf{qual_weight} = 1.0*$ncols; $ncols_total += $ncols; # Update the list of known chromosomes if ( !exists($$opts{regions_list}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @regions, $chr; } } my $col_prefix = ''; if ( !$$opts{has_col_names} ) { # Make the column names nice - strip common prefix and the suffix .vcf.gz $col_prefix = $file; $col_prefix =~ s{^/*$prefix/*}{}; $col_prefix =~ s/\.gz$//i; $col_prefix =~ s/\.vcf$//i; $col_prefix .= '_'; } if ( !exists($$vcf{columns}) ) { error("No header present? 
$file\n"); } # Create good names for the columns in the merged vcf file my @vcf_cols = @{$$vcf{columns}}; $$vcf{__col_names} = []; for my $col (@vcf_cols[9..$#vcf_cols]) { my $col_name = $col; if ( $$opts{has_col_names} ) { if ( $icol >= @{$$vcf_out{columns}} ) { error("Fewer columns in the header than in the VCF files total.\n"); } $col_name = $$vcf_out{columns}[$icol]; $icol++; if ( exists($col_names{$col_name}) ) { error("The column names not unique in the header: $col_name\n"); } } else { if ( exists($col_names{$col_name}) ) { $col_name = $col_prefix.$col; } if ( exists($col_names{$col_name}) ) { warn("FIXME: the column name [$col_name] not unique.\n"); } } warn("Using column name '$col_name' for $file:$col\n"); $col_names{$col_name} = 1; push @cols, $col_name; push @{$$vcf{__col_names}}, $col_name; } } if ( $$opts{has_col_names} && $icol!=@{$$vcf_out{columns}} ) { error("More columns in the header than in the VCF files total.\n"); } # QUAL weighting for my $vcf (@vcfs) { $$vcf{qual_weight} /= $ncols_total; } $$opts{vcfs} = \@vcfs; $$opts{cols} = \@cols; $$opts{regions} = \@regions; } sub merge_vcf_files { my ($opts) = @_; # Create output VCF my $vcf_out; if ( $$opts{vcf_header} ) { $vcf_out = Vcf->new(file=>$$opts{vcf_header}); $vcf_out->parse_header(); if ( $$vcf_out{columns} && @{$$vcf_out{columns}} ) { $$opts{has_col_names}=1; } } else { $vcf_out = Vcf->new(); } $$vcf_out{trim_redundant_ALTs} = $$opts{trim_redundant_ALTs}; init_cols($opts,$vcf_out); my @regions = @{$$opts{regions}}; my @cols = @{$$opts{cols}}; my @vcfs = @{$$opts{vcfs}}; # Get the header of the output VCF ready $vcf_out->add_columns(@cols); if ( !$$vcf_out{has_header} ) { for my $vcf (@vcfs) { # To get the missig fields filled by the default values for my $hline (@{$$vcf{header_lines}}) { if ( $$hline{key} eq 'fileformat' ) { next; } $vcf_out->add_header_line($hline,silent=>1); } } } # List source files my $source; for (my $i=0; $i<@vcfs; $i++) { if ( $i ) { $source .= ','; } $source .= 
"$i:$vcfs[$i]{file}"; } $vcf_out->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf_out->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp'); $vcf_out->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'}); my $have_samples = @{$$vcf_out{columns}}>9 ? 1 : 0; $vcf_out->recalc_ac_an($have_samples ? 2 : 0); $vcf_out->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf_out->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); print $vcf_out->format_header(); # Go through all VCF files simultaneously and output each line, one region at a time. for my $region (@regions) { # Open files for my $vcf (@vcfs) { delete($$vcf{done}); $vcf->open(region=>$region); } while ( my $pos=advance_position($opts,\@vcfs) ) { my %out; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{FORMAT} = []; my %format; my %info; my @src_files; my %filters; my (@quals,@qual_weights,$qual_weights_sum,%ac,$an); my %ref_alt_map = (); # Find out the REFs and ALTs: in VCFv4.0, the REFs can differ and ALTs must be converted for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } if ( !exists($out{CHROM}) ) { $out{CHROM} = $$line{CHROM}; } my $ref = $$line{REF}; for my $alt (@{$$line{ALT}}) { $ref_alt_map{$ref}{$alt}=$alt; } } # Do the REF,ALT conversion only when necessary my $new_ref; if ( scalar keys %ref_alt_map > 1 ) { $new_ref = $vcf_out->fill_ref_alt_mapping(\%ref_alt_map); if ( !defined $new_ref ) { error("Failed on line $out{CHROM}:$out{POS}\n"); } } if ( !$have_samples or !$$opts{trim_redundant_ALTs} ) { # Do not loose information from the ALT column when samples are not present my %alts; for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } my $ref = $$line{REF}; for my $alt 
(@{$$line{ALT}}) { $alts{$ref_alt_map{$ref}{$alt}}=1; } delete($alts{'.'}); $out{ALT} = [ keys %alts ]; } } for (my $ivcf=0; $ivcf<@vcfs; $ivcf++) { my $vcf = $vcfs[$ivcf]; my $line = $$vcf{last_line}; # If this file does not have a record for this position, then for all its columns output undef gtype if ( !$line ) { for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $name = $$vcf{__col_names}->[$i]; $out{gtypes}{$name}{GT} = exists($$opts{ref_for_missing}) ? $$opts{ref_for_missing} : $$vcf_out{defaults}{GT}; } next; } # Check if the site has been filtered if ( scalar @{$$line{FILTER}}>1 or ($$line{FILTER}[0] ne $$vcf{filter_passed} && $$line{FILTER}[0] ne $$vcf{defaults}{default}) ) { push @src_files,$ivcf.'f'; } else { push @src_files,$ivcf; } # Collect information for the FILTER field for my $flt (@{$$line{FILTER}}) { if ( $flt eq $$vcf{filter_passed} ) { $filters{$$vcf_out{filter_passed}} = 1; } elsif ( $flt ne $$vcf{defaults}{default} ) { $filters{$flt} = 1; } } # Collect information for the QUAL field if ( $$line{QUAL} ne $$vcf{defaults}{QUAL} && $$line{QUAL} ne $$vcf{defaults}{default} && $$line{QUAL}>0 ) { push @quals,$$line{QUAL}; push @qual_weights,$$vcf{qual_weight}; $qual_weights_sum += $$vcf{qual_weight}; } if ( $$line{ID} ne '.' && $out{ID} eq '.' 
) { $out{ID}=$$line{ID}; } # Remember the FORMAT fields for my $field (@{$$line{FORMAT}}) { $format{$field} = 1; } # VCF without genotypes: calculate AC,AN if present if ( !$have_samples ) { if ( exists($$line{INFO}{AN}) ) { $an += $$line{INFO}{AN}; } if ( exists($$line{INFO}{AC}) ) { my (@acs) = split(/,/,$$line{INFO}{AC}); for (my $i=0; $i<@acs; $i++) { my $alt = $ref_alt_map{$$line{REF}}{$$line{ALT}[$i]}; $ac{$alt} += $acs[$i]; } } } # Join the INFO field for my $inf (keys %{$$line{INFO}}) { # When conflicting INFO fields are present, use the first one, unless a joining method exists if ( exists($info{$inf}) ) { if ( exists($$opts{joiner}{$inf}) ) { &{$$opts{joiner}{$inf}}(\$info{$inf},$$line{INFO}{$inf}); } next; } $info{$inf} = $$line{INFO}{$inf}; } my $ref = $$line{REF}; # The ALT column may change after the merge, take care of ALT dependent tags such as GL. if ( $have_samples ) { if ( defined $new_ref ) { $vcf->parse_AGtags($line,\%ref_alt_map,$$line{REF}); } else { $vcf->parse_AGtags($line); } } # Now fill in the genotype information for each column for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $ori_name = $$vcf{columns}->[$i+9]; my $out_name = $$vcf{__col_names}->[$i]; $out{gtypes}{$out_name} = $$line{gtypes}{$ori_name}; # This is to convert 0/1 to G/C my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($line,$ori_name); if ( defined $new_ref ) { my @als; for my $al (@$alleles) { push @als, exists($ref_alt_map{$ref}{$al}) ? $ref_alt_map{$ref}{$al} : '.'; } $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype(\@als,$seps); } else { $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype($alleles,$seps); } } $out{REF} = defined $new_ref ? $new_ref : $ref; } $out{INFO} = { %info }; $out{INFO}{SF} = join(',',@src_files); # Output the QUAL information my $qual; for (my $i=0; $i<@quals; $i++) { $qual += $quals[$i] * $qual_weights[$i] * (1.0 / $qual_weights_sum); } $out{QUAL} = defined $qual ? 
sprintf("%.2f",$qual) : $$vcf_out{defaults}{QUAL}; # Output the FILTER information: remove PASS or missing value if some other information # is present. delete($filters{$$vcf_out{defaults}{default}}); if ( exists($filters{$$vcf_out{filter_passed}}) && scalar keys %filters > 1 ) { delete($filters{$$vcf_out{filter_passed}}); } $out{FILTER} = [ keys %filters ]; if ( !@{$out{FILTER}} ) { push @{$out{FILTER}},$$vcf_out{defaults}{default}; } # The GT field must come as first delete($format{GT}); $out{FORMAT} = ['GT']; for my $key (keys %format) { push @{$out{FORMAT}},$key; } if ( $have_samples ) { $vcf_out->format_genotype_strings(\%out); } else { if ( defined $an ) { $out{INFO}{AN}=$an; } if ( scalar keys %ac ) { my @acs; for my $alt (@{$out{ALT}}) { # Some of the files may not have AC, the AC count can be undefined in such a case. push @acs, exists($ac{$alt}) ? $ac{$alt} : 0; } $out{INFO}{AC} = join(',',@acs); } } print $vcf_out->format_line(\%out); } } for my $vcf (@vcfs) { $vcf->close() or error("close failed: $$vcf{file}\n"); } } sub advance_position { my ($opts,$vcfs) = @_; my $min_pos; for my $vcf (@$vcfs) { fill_buffer($opts,$vcf) unless $$vcf{done}; if ( @{$$vcf{line_buffer}} && (!defined $min_pos or $min_pos>$$vcf{line_buffer}[0]{POS}) ) { $min_pos = $$vcf{line_buffer}[0]{POS}; } } if ( !defined $min_pos ) { return undef; } my ($first,$has_snp,$has_indel); for my $vcf (@$vcfs) { delete($$vcf{last_line}); if ( @{$$vcf{line_buffer}} && $min_pos ne $$vcf{line_buffer}[0]{POS} ) { next; } if ( !defined $first ) { $$vcf{last_line} = shift @{$$vcf{line_buffer}}; $first = $$vcf{last_line}; next; } my $irec; for (my $i=0; $i<@{$$vcf{line_buffer}}; $i++) { my $line = $$vcf{line_buffer}[$i]; if ( $$line{POS} ne $$first{POS} ) { last; } if ( $$opts{collapse_any} ) { $irec=$i; last; } # checking position only if ( $$opts{collapse_snps} && $$first{variant_type}&1 && $$line{variant_type}&1 ) { $irec=$i; last; } if ( $$opts{collapse_indels} && $$first{variant_type}&2 && 
$$line{variant_type}&2 ) { $irec=$i; last; } if ( $$vcf{line_buffer}[$i]{REF} ne $$first{REF} ) { next; } # refs do not match for my $al1 (@{$$line{ALT}}) { for my $al2 (@{$$first{ALT}}) { if ( $al1 eq $al2 ) { $irec=$i; last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { $$vcf{last_line} = splice(@{$$vcf{line_buffer}},$irec,1); } } return $min_pos; } sub fill_buffer { my ($opts,$vcf) = @_; if ( @{$$vcf{line_buffer}} && $$vcf{line_buffer}[0]{POS}!=$$vcf{line_buffer}[-1]{POS} ) { return; } while ( 1 ) { my $line = $vcf->next_data_hash(); if ( !$line ) { $$vcf{done} = 1; return; } if ( !$$opts{collapse_any} ) { for my $al (@{$$line{ALT}}) { my ($type,$len,$ht) = $vcf->event_type($$line{REF},$al); if ( $type eq 's' or $type eq 'r' ) { $$line{variant_type} |= 1; } if ( $type eq 'i' or $type eq 'o' ) { $$line{variant_type} |= 2; } } } push @{$$vcf{line_buffer}}, $line; if ( $$vcf{line_buffer}[0]{POS} != $$vcf{line_buffer}[-1]{POS} ) { return; } } } # Field joiner methods sub joiner_sum { my ($ori,$new) = @_; $$ori += $new; } sub joiner_dp4 { my ($ori,$new) = @_; my @vals1 = split(/,/,$$ori); my @vals2 = split(/,/,$new); if ( @vals1 != @vals2 ) { error("Cannot join: $$ori vs $new\n"); } for (my $i=0; $i<@vals1; $i++) { $vals1[$i] += $vals2[$i]; } $$ori = join(',',@vals1); } vcftools-0.1.15/src/perl/vcf-phased-join000077500000000000000000000553021307140004000200630ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); if ( $$opts{split_size} ) { split_vcf($opts); } else { join_vcfs($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: The script takes multiple overlapping pre-phased chunks and concatenates them into one VCF\n", " using heterozygous calls from the overlaps to determine correct phase.\n", "Usage: vcf-phased-join [OPTIONS] A.vcf B.vcf C.vcf\n", "Options:\n", " -j, 
--min-join-quality Quality threshold for gluing the pre-phased blocks together [10]\n", " -l, --list List of VCFs to join.\n", " -o, --output Output file name. When \"-\" is supplied, STDOUT and STDERR will be used\n", " -q, --min-PQ Break pre-phased segments if PQ value is lower in input VCFs [0.6]\n", " -h, -?, --help This help message\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], min_join_quality => 10, min_PQ => 0.6, min_BP => 1, }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-o' || $arg eq '--output' ) { $$opts{output}=shift(@ARGV); next; } if ( $arg eq '-j' || $arg eq '--min-join-quality' ) { $$opts{min_join_quality}=shift(@ARGV); next; } if ( $arg eq '-q' || $arg eq '--min-PQ' ) { $$opts{min_PQ}=shift(@ARGV); next; } if ( $arg eq '-l' || $arg eq '--list' ) { $$opts{list}=shift(@ARGV); next; } if ( $arg eq '--min-BP' ) { $$opts{min_BP}=shift(@ARGV); next; } if ( $arg eq '--split-size' ) { $$opts{split_size}=shift(@ARGV); next; } if ( $arg eq '--split-noise' ) { $$opts{split_noise}=shift(@ARGV); next; } if ( $arg eq '--split-overlap' ) { $$opts{split_overlap}=shift(@ARGV); next; } if ( $arg eq '--split-prefix' ) { $$opts{split_prefix}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{vcfs}}, $arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( exists($$opts{list}) ) { open(my $fh,'<',$$opts{list}) or error("$$opts{list}: $!"); while (my $line=<$fh>) { if ($line=~/^\s*$/) { next; } $line =~ s/^\s*//; $line =~ s/\s*$//; if ( ! 
-e $line ) { error("Some of the files in $$opts{list} do not exist\n"); } push @{$$opts{vcfs}},$line; } close($fh); } if ( !exists($$opts{vcfs}) ) { error("No VCF file given"); } if ( !exists($$opts{split_size}) ) { if ( @{$$opts{vcfs}}<1 ) { error("No input VCF given?\n"); } if ( @{$$opts{vcfs}}<2 ) { warn("Only one input VCF given, running in --min-PQ splitting mode.\n"); if ( $$opts{min_PQ}<=0.5 ) { warn("You better know what you're doing: --min-PQ set too low, will hardly find any split!"); } } if ( !exists($$opts{output}) ) { error("No output VCF file name given"); } } return $opts; } sub split_vcf { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcfs}[0]); $vcf->parse_header(); $$opts{vcf} = $vcf; my ($fh_next,$swap_next) = open_next_file($opts); my ($fh,$prev_boundary,$start_pos,@buffer,$prev_chr,$prev_pos,$swap); while (my $rec=$vcf->next_data_array) { my $rec_next = []; for my $col (@$rec) { push @{$rec_next}, "$col"; } my $chr = $$rec[0]; my $pos = $$rec[1]; if ( defined $prev_chr && $prev_chr ne $chr ) { last; } if ( defined $prev_pos && $pos<=$prev_pos ) { error("Not sorted or duplicate position: $chr:$prev_pos vs $chr:$pos"); } $prev_pos = $pos; $prev_chr = $chr; if ( !defined $start_pos ) { $start_pos = $pos; } my $bnd = $start_pos + int(($pos-$start_pos)/$$opts{split_size})*$$opts{split_size}; if ( $start_pos!=$bnd && abs($pos-$bnd)*2 <= $$opts{split_overlap} ) { # Known boundary if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); } if ( defined $fh ) { print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); } next; } $bnd += $$opts{split_size}; if ( abs($pos-$bnd)*2 >= $$opts{split_overlap} ) { print $fh_next swap_gts($vcf,$swap_next,$rec); next; } # New boundary if ( !defined $prev_boundary || $prev_boundary ne $bnd ) { close($fh) unless !defined $fh; $fh = $fh_next; $swap = $swap_next; $prev_boundary = $bnd; $fh_next = undef; } if ( !defined $fh_next ) { 
($fh_next,$swap_next) = open_next_file($opts); $prev_boundary = $bnd; } if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); } if ( defined $fh ) { print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); } } if ( defined $fh ) { close($fh); } if ( defined $fh_next ) { close($fh_next); } } sub open_next_file { my ($opts) = @_; $$opts{split_ifile}++; my $fname = sprintf "%s%02d.vcf", $$opts{split_prefix},$$opts{split_ifile}; open(my $fh,'>',$fname) or error("$fname: $!"); print $fh $$opts{vcf}->format_header; my @swap; for (my $i=9; $i<@{$$opts{vcf}{columns}}; $i++) { if ( $$opts{split_ifile}==1 ) { $swap[$i-9] = -1; } else { $swap[$i-9] = int(rand(2)) ? 1 : -1; } if ( $swap[$i-9]==1 ) { printf "%s\t%s\tswapped\n",$fname,$$opts{vcf}{columns}[$i]; } } return ($fh,\@swap); } sub randomize { my ($opts,$swap,$vcf,$rec,$dist) = @_; if ( $dist>$$opts{split_overlap} ) { $dist = $$opts{split_overlap}; } my $noise = $dist/$$opts{split_overlap}; if ( exists($$opts{split_noise}) ) { $noise = $$opts{split_noise}; } my $na = 2 * (scalar @$rec - 9); my $nchanged = int($na*$noise); if ( !$nchanged ) { return swap_gts($vcf,$swap,$rec); } use List::Util 'shuffle'; my @errors = (1) x $nchanged; if ( $nchanged<$na ) { @errors = (@errors, (0) x ($na-$nchanged)); } @errors = shuffle(@errors); print "$$rec[1] .. dist=$dist, changed=$nchanged total=$na ($noise)\n"; my $itag = $vcf->get_tag_index($$rec[8],'GT',':'); my $i = -2; for (my $isample=9; $isample<@$rec; $isample++) { $i += 2; if ( !$errors[$i] && $errors[$i+1] ) { next; } my $gt = $vcf->get_field($$rec[$isample],$itag); my ($a1,$a2) = $vcf->split_gt($gt); if ( $errors[$i] ) { $a1 = $a1 ? 0 : 1; } if ( $errors[$i+1] ) { $a2 = $a2 ? 
0 : 1; } $$rec[$isample] = $vcf->replace_field($$rec[$isample],"$a1|$a2",$itag,':'); } return swap_gts($vcf,$swap,$rec); } sub swap_gts { my ($vcf,$swap,$rec) = @_; my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $gts = $vcf->get_sample_field($rec,$igt); for (my $i=0; $i<@$gts; $i++) { if ( $$swap[$i]==-1 ) { next; } my ($a1,$a2) = $vcf->split_gt($$gts[$i]); $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':'); } return $vcf->format_line($rec); } sub check_columns { my ($opts) = @_; my @columns; for my $file (@{$$opts{vcfs}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); if ( @columns ) { if ( @columns != @{$$vcf{columns}} ) { warn("Different number of columns in [$file].\n"); } for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { warn("The column names do not agree in [$file].\n"); last; } } } else { @columns = @{$$vcf{columns}}; } $vcf->close(); } $$opts{nsamples} = @columns-9; } sub log_msg { my ($opts,@msg) = @_; print {$$opts{log_fh}} @msg; } sub next_vcf_file { my ($opts) = @_; if ( !exists($$opts{ifile}) ) { $$opts{ifile}=-1; } my $chr = $$opts{current_chr}; my @vcfs = @{$$opts{chroms}{$chr}}; while (1) { $$opts{ifile}++; if ( $$opts{ifile} >= @vcfs ) { return (undef,undef); } $$opts{ivcf_fname} = $vcfs[$$opts{ifile}]; my $vcf = Vcf->new(file=>$$opts{ivcf_fname}, region=>$chr, print_header=>1); $vcf->parse_header(); my $rec = $vcf->next_data_array(); if ( !defined $rec ) { next; } return ($vcf,$rec); } } sub join_vcfs { my ($opts) = @_; # Determine the chromosomes for my $vcf (@{$$opts{vcfs}}) { my @chroms = `tabix -l $vcf`; if ( $? ) { error(qq[The command "tabix -l $vcf" exited with an error. 
Is the file tabix indexed?\n]); } if ( !@chroms ) { warn(qq[Warning: Is the VCF file $vcf empty?\n]); } for my $chr (@chroms) { chomp($chr); push @{$$opts{chroms}{$chr}},$vcf; } } check_columns($opts); $$opts{phased_blocks} = [ (0) x $$opts{nsamples} ]; $$opts{broken_blocks} = [ (0) x $$opts{nsamples} ]; for my $chr (sort keys %{$$opts{chroms}}) { $$opts{current_chr} = $chr; join_vcfs_chr($opts); } report_stats($opts); } sub join_vcfs_chr { my ($opts) = @_; delete($$opts{ifile}); $$opts{swapped} = [ (0) x $$opts{nsamples} ]; $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; my ($vcf1,$rec1) = next_vcf_file($opts); if ( !defined $rec1 ) { error("Broken/Empty VCFs?"); } if ( $$opts{output} ne '-' ) { my $logfile = $$opts{output}; if ( $$opts{output}=~/\.[^\.]+$/ ) { $logfile = $`; } $logfile .= '.plog'; open($$opts{log_fh},'>',$logfile) or error("$logfile: $!"); open($$opts{out_fh},'>',$$opts{output}) or error("$$opts{output}: $!"); } else { $$opts{log_fh} = \*STDERR; $$opts{out_fh} = \*STDOUT; } $$opts{vcf} = $vcf1; if ( !$$opts{header_printed} ) { $$opts{header_printed} = 1; $$opts{vcf}->add_header_line({key=>'FORMAT',ID=>'PS',Number=>1,Type=>'Integer',Description=>'Phase set'}); $$opts{vcf}->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print {$$opts{out_fh}} $$opts{vcf}->format_header(); log_msg($opts, "# This file was generated by vcf-phased-join.\n"); log_msg($opts, "# The command line was: ", join(' ',@{$$opts{args}}), "\n"); log_msg($opts, "#\n"); log_msg($opts, "#PS 'Phasing Summary'. Use `grep ^PS | cut -f 2-` to extract this part.\n"); log_msg($opts, "#PS The columns are:\n"); log_msg($opts, "#PS 1,2 .. the pair of files being joined\n"); log_msg($opts, "#PS 3 .. the overlapping region used for determining the phase\n"); log_msg($opts, "#PS 4 .. sample name\n"); log_msg($opts, "#PS 5 .. did a swap occur?\n"); log_msg($opts, "#PS 6 .. quality of phase assignment\n"); log_msg($opts, "#PS 7 .. 
number of het genotypes used for phasing\n"); log_msg($opts, "#PS 8,9 .. log10 likelihood of phase match/mismatch\n"); } $$opts{file1} = $$opts{ivcf_fname}; my ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { # Only one non-empty VCF file present, running in --min-PQ splitting mode while ( defined($rec1 = $vcf1->next_data_array()) ) { output_line($opts,$rec1,$$opts{swapped}); } return; } else { $$opts{file2} = $$opts{ivcf_fname}; if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } my @buffer; while (1) { # is vcf1 ahead of vcf2? while ( $$rec1[1] < $$rec2[1] ) { output_line($opts,$rec1,$$opts{swapped}); $rec1 = $vcf1->next_data_array(); if ( !defined $rec1 ) { last; } if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } if ( defined $rec1 ) { while ( $$rec1[1] eq $$rec2[1] ) { push @buffer, [$rec1,$rec2]; $rec1 = $vcf1->next_data_array(); $rec2 = $vcf2->next_data_array(); if ( !defined $rec1 ) { last; } if ( !defined $rec2 ) { error("The file $$opts{file1} ended before $$opts{file2}."); } if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } if ( defined $rec1 && $$rec1[1] ne $$rec2[1] ) { error("ERROR\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n"); } } # is vcf1 done? 
if ( !defined $rec1 ) { flush_buffer($opts,$vcf1,\@buffer); $vcf1->close(); if ( !defined $rec2 ) { # Yes, this can happen when file1 ends exactly where file2 does $vcf2->close(); ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { last; } } $vcf1 = $vcf2; $rec1 = $rec2; $$opts{file1} = $$opts{ivcf_fname}; ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { last; } $$opts{file2} = $$opts{ivcf_fname}; if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } next; } } if ( @buffer ) { flush_buffer($opts,$vcf1,\@buffer); } do { output_line($opts,$rec1,$$opts{swapped}) unless !defined $rec1; } while ( exists($$vcf1{fh}) && defined($rec1 = $vcf1->next_data_array()) ); } sub wrong_order { my ($opts,$rec1,$rec2) = @_; if ( $$rec1[0] ne $$rec2[0] ) { error("Encountered different chromosomes in $$opts{file1} and $$opts{file2}: \"$$rec1[0]:$$rec1[1]\" vs \"$$rec2[0]:$$rec2[1]\"\n"); } if ( $$rec1[1] > $$rec2[1] ) { log_msg($opts,"WARNING\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n"); return 1; } return 0; } sub output_line { my ($opts,$rec,$swap) = @_; my $vcf = $$opts{vcf}; my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $ips = $vcf->get_tag_index($$rec[8],'PS',':'); if ( $ips==-1 ) { $$rec[8] .= ':PS'; } my $ipq = exists($$opts{min_PQ}) ? $vcf->get_tag_index($$rec[8],'PQ',':') : -1; my $breakpoints = 0; for (my $i=0; $i<@$swap; $i++) { if ( $$swap[$i]==1 ) { my $gt = $vcf->get_field($$rec[$i+9],$igt); my ($a1,$a2) = $vcf->split_gt($gt); if ( defined $a2 ) { $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':'); } } if ( $ipq!=-1 ) { my $pq = $vcf->get_field($$rec[$i+9],$ipq); if ( $pq ne '.' 
&& $pq<$$opts{min_PQ} ) { $$opts{phasing_set}[$i]=0; $$opts{broken_blocks}[$i]++; } } if ( !$$opts{phasing_set}[$i] ) { $$opts{phasing_set}[$i] = $$rec[1]; $$opts{phased_blocks}[$i]++; $breakpoints++; } if ( $ips==-1 ) { $$rec[$i+9] .= ':'.$$opts{phasing_set}[$i]; } else { $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],$$opts{phasing_set}[$i],$ips,':'); } } print {$$opts{out_fh}} $vcf->format_line($rec); $breakpoints *= 100./$$opts{nsamples}; if ( $breakpoints>$$opts{min_BP} ) { push @{$$opts{breakpoints}}, sprintf("BP\t%s\t%d\t%.1f\n", $$rec[0],$$rec[1],$breakpoints); } } sub flush_buffer { my ($opts,$vcf,$buffer) = @_; if ( !@$buffer ) { $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; return; } my $chr = $$buffer[0][0][0]; my $from = $$buffer[0][0][1]; my $to = $$buffer[-1][0][1]; # Determine likelihoods of genotypes being swapped my @lks_match = (); my @lks_mism = (); my @nsites = (0) x $$opts{nsamples}; for my $site (@$buffer) { my $rec1 = $$site[0]; my $rec2 = $$site[1]; my $igt1 = $vcf->get_tag_index($$rec1[8],'GT',':'); my $igt2 = $vcf->get_tag_index($$rec2[8],'GT',':'); my $gts1 = $vcf->get_sample_field($rec1,$igt1); my $gts2 = $vcf->get_sample_field($rec2,$igt2); my $ngts = $$opts{nsamples}; my $nerrors = 0; my (@als1,@als2,@phased); for (my $i=0; $i<@$gts1; $i++) { if ( index($$gts1[$i],'|')==-1 or index($$gts2[$i],'|')==-1 ) { push @phased, 0; } else { push @phased, 1; } my ($a1,$a2) = $vcf->split_gt($$gts1[$i]); my ($b1,$b2) = $vcf->split_gt($$gts2[$i]); if ( !defined $a2 ) { $a2 = $a1; } # haploid genotypes if ( !defined $b2 ) { $b2 = $b1; } if ( !(($a1 eq $b1 && $a2 eq $b2) or ($a1 eq $b2 && $a2 eq $b1)) ) { $nerrors++ } push @als1, $a1,$a2; push @als2, $b1,$b2; } my $dist = $to-$$site[0][1] < $$site[0][1]-$from ? 
$to-$$site[0][1] : $$site[0][1]-$from; $$opts{dist_errors}{$nerrors}++; my $p = $nerrors/$ngts; if ( $p==0 ) { $p=1./$ngts; } elsif ( $p==1 ) { $p=1 - 1./$ngts; } for (my $i=0; $i<@$gts1; $i++) { if ( !$phased[$i] ) { next; } my $a1 = $als1[2*$i]; my $a2 = $als1[2*$i+1]; my $b1 = $als2[2*$i]; my $b2 = $als2[2*$i+1]; if ( $a1 eq $a2 or $b1 eq $b2 ) { next; } # homozygous GT if ( $a1 eq $b1 && $a2 eq $b2 ) { #print STDERR "$i .. counting match $a1/$a2 $b1/$b2\n"; $lks_match[$i] += log($p*$p + (1-$p)*(1-$p)); $lks_mism[$i] += log($p*(1-$p) + (1-$p)*$p); } elsif ( $a1 eq $b2 && $a2 eq $b1 ) { #print STDERR "$i .. counting mismatch $a1/$a2 $b1/$b2\n"; $lks_match[$i] += log($p*(1-$p) + (1-$p)*$p); $lks_mism[$i] += log($p*$p + (1-$p)*(1-$p)); } else { next; } # different alleles might have been selected at multiallelic sites $nsites[$i]++; } } my $file1 = $$opts{file1}; my $file2 = $$opts{file2}; my @swapped = ( (0) x $$opts{nsamples} ); my @quals = (); my $log10 = log(10); for (my $i=0; $i<$$opts{nsamples}; $i++) { if ( !defined $lks_match[$i] ) { $lks_match[$i] = $lks_mism[$i] = log(0.5); } $swapped[$i] = $lks_match[$i]>$lks_mism[$i] ? $$opts{swapped}[$i] : -1*$$opts{swapped}[$i]; $quals[$i] = abs($lks_match[$i]-$lks_mism[$i])/$log10; log_msg($opts, sprintf "PS\t%s\t%s\t$chr:$from-$to\t%s\t%d\t%.1f\t%d\t%f\t%f\n", $file1,$file2, $$vcf{columns}[$i+9], $swapped[$i]==-1?0:1, $quals[$i], $nsites[$i], $lks_match[$i]/$log10, $lks_mism[$i]/$log10); } # Do not allow segment breaking while processing the buffer: this may help sometimes, but may also make things worse. my $min_PQ = $$opts{min_PQ}; delete($$opts{min_PQ}); # In case there is no overlap, reset the phasing set if ( !@quals ) { $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; } # Output the VCF line and quality for my $site (@$buffer) { # Which of the two overlapping VCF lines to output? Take the one farther from the end. 
my ($rec,$swap); if ( $to-$$site[0][1] > $$site[0][1]-$from ) { $rec = $$site[0]; $swap = $$opts{swapped}; } else { # Update the phasing set ID if ( @quals ) { for (my $i=0; $i<@quals; $i++) { if ( $quals[$i] < $$opts{min_join_quality} ) { $$opts{phasing_set}[$i]=0; } } @quals = (); } $rec = $$site[1]; $swap = \@swapped; } output_line($opts,$rec,$swap); } $$opts{min_PQ} = $min_PQ; @$buffer = (); $$opts{swapped} = \@swapped; } sub report_stats { my ($opts) = @_; log_msg($opts, "#NS Number of phased segments. Use `grep ^NS | cut -f 2-` to extract this part.\n"); log_msg($opts, "#NS The columns are:\n"); log_msg($opts, "#NS 1 .. sample\n"); log_msg($opts, "#NS 2 .. number of phased blocks\n"); log_msg($opts, "#NS 3 .. number of blocks created because of low PQ\n"); log_msg($opts, "#NS 4 .. number of blocks created because of low joining quality\n"); for my $i (sort { $$opts{phased_blocks}[$b] <=> $$opts{phased_blocks}[$a] } (0..($$opts{nsamples}-1))) { log_msg($opts,sprintf "NS\t%s\t%d\t%d\t%d\n", $$opts{vcf}{columns}[9+$i],$$opts{phased_blocks}[$i],$$opts{broken_blocks}[$i],$$opts{phased_blocks}[$i]-$$opts{broken_blocks}[$i]); } log_msg($opts, "#BP Break Points. Use `grep ^BP | cut -f 2-` to extract this part.\n"); log_msg($opts, "#BP The columns are:\n"); log_msg($opts, "#BP 1 .. chromosome\n"); log_msg($opts, "#BP 2 .. position\n"); log_msg($opts, "#BP 3 .. percent of samples with breakpoint at that position\n"); for my $break (@{$$opts{breakpoints}}) { log_msg($opts,$break); } log_msg($opts, "#ED Error Distribution. Use `grep ^ED | cut -f 2-` to extract this part.\n"); log_msg($opts, "#ED The columns are:\n"); log_msg($opts, "#ED 1 .. number of GT mismatches per site not attributable to phasing\n"); log_msg($opts, "#ED 2 .. 
frequency \n"); for my $nerrors (sort {$a<=>$b} keys %{$$opts{dist_errors}}) { log_msg($opts, "ED\t$nerrors\t$$opts{dist_errors}{$nerrors}\n"); } } vcftools-0.1.15/src/perl/vcf-query000077500000000000000000000367301307140004000170330ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); read_data($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: vcf-query [OPTIONS] file.vcf.gz\n", "Options:\n", " -c, --columns List of comma-separated column names or one column name per line in a file.\n", " -f, --format The default is '%CHROM:%POS\\t%REF[\\t%SAMPLE=%GT]\\n'\n", " -l, --list-columns List columns.\n", " -r, --region chr:from-to Retrieve the region. (Runs tabix.)\n", " --use-old-method Use old version of API, which is slow but more robust.\n", " -h, -?, --help This help message.\n", "Expressions:\n", " %CHROM The CHROM column (similarly also other columns)\n", " %GT Translated genotype (e.g. C/A)\n", " %GTR Raw genotype (e.g. 
0/1)\n", " %INFO/TAG Any tag in the INFO column\n", " %LINE Prints the whole line\n", " %SAMPLE Sample name\n", " [] The brackets loop over all samples\n", " %* All format fields printed as KEYVALUE\n", "Examples:\n", " vcf-query file.vcf.gz 1:1000-2000 -c NA001,NA002,NA003\n", " vcf-query file.vcf.gz -r 1:1000-2000 -f '%CHROM:%POS\\t%REF\\t%ALT[\\t%SAMPLE:%*=,]\\n'\n", " vcf-query file.vcf.gz -f '[%GT\\t]%LINE\\n'\n", " vcf-query file.vcf.gz -f '[%GT\\ ]%LINE\\n'\n", " vcf-query file.vcf.gz -f '%CHROM\\_%POS\\t%INFO/DP\\t%FILTER\\n'\n", "Notes:\n", " Please use `bcftools query` instead, this script will not be supported in future.\n", "\n"; } sub parse_params { my $opts = { columns=>'', format_string=>"%CHROM:%POS\t%REF[\t%SAMPLE=%GT]\n" }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '--use-old-method' ) { $$opts{use_old_method}=1; next } if ( $arg eq '-f' || $arg eq '--format' ) { $$opts{format_string}=shift(@ARGV); next } if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{columns}=shift(@ARGV); next } if ( $arg eq '-l' || $arg eq '--list-columns' ) { $$opts{list_columns}=1; next } if ( $arg eq '-r' || $arg eq '--region' ) { $$opts{region}=shift(@ARGV); next } if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( !exists($$opts{region}) && exists($$opts{file}) && ($arg=~/^[^:]+:[0-9,]+-[0-9,]+$/ or $arg=~/^[^\:]+$/) ) { $$opts{region}=$arg; next; } error("Unknown parameter or non-existent file \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{file}) && exists($$opts{region}) ) { error("The region cannot be used when streaming the file.\n"); } if ( exists($$opts{columns}) && -e $$opts{columns} ) { my @cols; open(my $fh,'<',$$opts{columns}) or error("$$opts{columns}: $!"); while (my $line=<$fh>) { if ( $line=~/^\s*$/ ) { next; } $line =~ s/^\s*//; $line =~ s/\s*$//; push @cols, $line; } close($fh); $$opts{columns} = join(',', @cols); } return $opts; } sub parse_format_string { my ($str,$hash) = @_; my (@arr,%idx,$join1,$join2); $str =~ s/\\n/\n/g; $str =~ s/\\t/\t/g; while ($str) { if ( !($str=~/%/) ) { push @arr,$str; last; } my $before = $`; $str = $'; my $match; if ( $str=~/^[*](.)(.)/ ) { $match = '*'; $join1=$1; $join2=$2; } elsif ( $str=~m{([A-Za-z0-9/_]+)} ) { $match = $1; } else { error("FIXME: $str"); } if ( defined $before && $before ne '' ) { push @arr,$before; } push @arr,'.'; # If the tag is not present in the VCF, a missing value ('.') will be printed instead. if ( exists($idx{$match}) ) { warn("The tag \"$match\" given multiple times, only the last occurance will be used\n"); } $idx{$match} = $#arr; $str = $'; } for (my $i=0; $i<@arr; $i++) { $arr[$i] =~ s/\\{1}//g; } $$hash{format} = \@arr; $$hash{idx} = \%idx; $$hash{join1} = $join1; $$hash{join2} = $join2; } sub parse_format { my ($opts,$cols) = @_; $$opts{before} = {}; $$opts{repeat} = {}; $$opts{after} = {}; my ($before,$repeat,$after); my $str = $$opts{format_string}; $before = $str; if ( $str=~/\[([^\]]+)\]/ ) { $before = $`; $repeat = $1; $after = $'; } if ( $before ) { parse_format_string($before,$$opts{before}); } if ( $repeat ) { parse_format_string($repeat,$$opts{repeat}); } if ( $after ) { parse_format_string($after,$$opts{after}); } } sub copy_array { my ($arr) = @_; my @out; for my $item (@$arr) { push @out,$item; } return @out; } sub get_columns { my ($vcf) = @_; my @cols = (); my $ncols = @{$$vcf{columns}}; for (my $i=9; $i<$ncols; $i++) { push @cols, $$vcf{columns}[$i]; } return 
\@cols; } sub get_sample_idxs { my ($vcf,@samples) = @_; my @idxs; for my $sample (@samples) { if ( !exists($$vcf{has_column}{$sample}) ) { error("No such sample: [$sample]\n"); } push @idxs, $$vcf{has_column}{$sample} - 1; } return @idxs; } sub list_columns { my ($opts) = @_; my $cols = get_columns($$opts{vcf}); for my $col (@$cols) { print "$col\n"; } } sub read_data { my ($opts) = @_; if ( exists($$opts{use_old_method}) ) { read_data_slow_hash($opts); return; } my %args = ( print_header=>1 ); if ( $$opts{region} ) { $args{region} = $$opts{region}; } if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } else { $args{fh} = \*STDIN; } my $vcf = Vcf->new(%args); $$opts{vcf} = $vcf; $vcf->parse_header(); if ( $$opts{list_columns} ) { list_columns($opts); exit; } my @cols = split(/,/,$$opts{columns}); if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; } my @sample_idxs = get_sample_idxs($$opts{vcf},@cols); # The hash opts will be filled with the keys 'before','repeat','after' with formatting information parse_format($opts); while (my $line=$vcf->next_line()) { my $x = $vcf->next_data_array($line); # Fill everything what comes before the repeat [] if ( $$opts{before} ) { my (@out) = copy_array($$opts{before}{format}); while (my ($fieldname,$idx) = each %{$$opts{before}{idx}}) { if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } elsif ( substr($fieldname,0,5) eq 'INFO/' ) { $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } # Fill the repeaty stuff (the sample columns) if ( $$opts{repeat} ) { my @repeats; for my $sample_idx (@sample_idxs) { push @repeats, [ copy_array($$opts{repeat}{format}) ]; } my @alt; if ( exists($$opts{repeat}{idx}{GT}) ) { @alt = split(/,/,$$x[4]); } while (my ($fieldname,$idx) = each %{$$opts{repeat}{idx}}) { 
if ( $fieldname eq '*' ) { my $sep1 = $$opts{repeat}{join1}; my $sep2 = $$opts{repeat}{join2}; my @fmt = split(/:/,$$x[8]); for (my $i=0; $i<@sample_idxs; $i++) { my $sample_idx = $sample_idxs[$i]; my @tmp; my $j = 0; for my $value (split(/:/,$$x[$sample_idx])) { push @tmp, $fmt[$j++].$sep1.$value; } $repeats[$i][$idx] = join($sep2,@tmp); } next; } my $fmt_idx = $vcf->get_tag_index($$x[8],$fieldname eq 'GTR' ? 'GT' : $fieldname,':'); for (my $i=0; $i<@sample_idxs; $i++) { my $sample_idx = $sample_idxs[$i]; if ( $fmt_idx!=-1 ) { my $value = $vcf->get_field($$x[$sample_idx],$fmt_idx); if ( $fieldname eq 'GT' ) { $value = $vcf->decode_genotype($$x[3],\@alt,$value); } $repeats[$i][$idx] = $value; } } } if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { my $idx = $$opts{repeat}{idx}{SAMPLE}; for (my $i=0; $i<@cols; $i++) { $repeats[$i][$idx] = $cols[$i] } } for my $repeat (@repeats) { for (my $i=0; $i<@$repeat; $i++) { if (!defined($$repeat[$i])) { $$repeat[$i]='.'; } } print join('',@$repeat); } } # Fill everything what comes after the repeat ([]) if ( $$opts{after} ) { my (@out) = copy_array($$opts{after}{format}); while (my ($fieldname,$idx) = each %{$$opts{after}{idx}}) { if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } elsif ( substr($fieldname,0,5) eq 'INFO/' ) { $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } } sub read_data_slow_hash { my ($opts) = @_; my %args = ( print_header=>1 ); if ( $$opts{region} ) { $args{region} = $$opts{region}; } if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } else { $args{fh} = \*STDIN; } my $vcf = Vcf->new(%args); $$opts{vcf} = $vcf; $vcf->parse_header(); if ( $$opts{list_columns} ) { list_columns($opts); exit; } my @cols = split(/,/,$$opts{columns}); if ( !@cols ) { @cols = 
@{get_columns($$opts{vcf})}; } # The hash opts will be filled with the keys 'before','repeat','after' with formatting information parse_format($opts); while (my $line=$vcf->next_line()) { my $x=$vcf->next_data_hash($line); # Fill everything what comes before the repeat [] # Code repetition and not very nice, should be changed at some point... if ( $$opts{before} ) { my (@out) = copy_array($$opts{before}{format}); while (my ($colname,$idx) = each %{$$opts{before}{idx}}) { if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } if ( $colname=~m{INFO/(.+)} ) { if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) { # It is a flag $out[$idx] = 'True'; } else { $out[$idx] = $$x{INFO}{$1}; } next; } if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } # Fill the repeaty stuff (the sample columns) if ( $$opts{repeat} ) { for my $col (@cols) { my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); my (@out) = copy_array($$opts{repeat}{format}); while (my ($colname,$idx) = each %{$$opts{repeat}{idx}}) { if ( exists($$x{gtypes}{$col}{$colname}) ) { $out[$idx] = $$x{gtypes}{$col}{$colname}; } elsif ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { $out[$$opts{repeat}{idx}{SAMPLE}] = $col; } if ( exists($$opts{repeat}{idx}{GTR}) ) { $out[$$opts{repeat}{idx}{GTR}] = $$x{gtypes}{$col}{GT}; } if ( exists($$opts{repeat}{idx}{GT}) ) { my $tmp = $$alleles[0]; for (my $i=0; $i<@$seps; $i++) { $tmp .= $$seps[$i].$$alleles[$i+1]; } $out[$$opts{repeat}{idx}{GT}] = $tmp; } if ( exists($$opts{repeat}{idx}{'*'}) ) { my $sep1 = $$opts{repeat}{join1}; my $sep2 = $$opts{repeat}{join2}; my @tmp; while (my ($key,$value)=each(%{$$x{gtypes}{$col}})) { if ( 
$key eq 'GT' ) { $value = $$alleles[0]; for (my $i=0; $i<@$seps; $i++) { $value .= $$seps[$i].$$alleles[$i+1]; } } push @tmp, $key.$sep1.$value; } my $idx = $$opts{repeat}{idx}{'*'}; $out[$idx] = join($sep2,@tmp); } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } # Fill everything what comes after the repeat ([]) if ( $$opts{after} ) { my (@out) = copy_array($$opts{after}{format}); while (my ($colname,$idx) = each %{$$opts{after}{idx}}) { if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } if ( $colname=~m{INFO/(.+)} ) { if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) { # It is a flag $out[$idx] = 'True'; } else { $out[$idx] = $$x{INFO}{$1}; } next; } if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } } vcftools-0.1.15/src/perl/vcf-shuffle-cols000077500000000000000000000043751307140004000202600ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); concat($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Reorder columns to match the order in the template VCF.\n", "Usage: vcf-shuffle-cols [OPTIONS] -t template.vcf.gz file.vcf.gz > out.vcf\n", "Options:\n", " -t, --template The file with the correct order of the columns.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-t' || $arg eq '--template' ) { $$opts{template}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file}=$arg; next } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{template}) ) { error("Missing the -t option.\n"); } return $opts; } sub concat { my ($opts) = @_; my $tmpl = Vcf->new(file=>$$opts{template}); $tmpl->parse_header(); $tmpl->close(); my $vcf = $$opts{file} ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); # Check if one-to-one correspondence can be found and create a mapping my @new_to_old = (); for my $tcol (@{$$tmpl{columns}}) { if ( !exists($$vcf{has_column}{$tcol}) ) { error("TODO: the column names do not match\n"); } } for my $vcol (@{$$vcf{columns}}) { if ( !exists($$tmpl{has_column}{$vcol}) ) { error("TODO: the column names do not match\n"); } my $new = $$tmpl{has_column}{$vcol} - 1; my $old = $$vcf{has_column}{$vcol} - 1; $new_to_old[$new] = $old; } # Output the header with modified column order my $ncols = @{$$tmpl{columns}} - 1; my @cols = @{$$tmpl{columns}}[9..$ncols]; print $vcf->format_header(\@cols); while (my $x=$vcf->next_data_array()) { print $$x[0]; for (my $i=1; $i<=$ncols; $i++) { my $idx = $new_to_old[$i]; print "\t".$$x[$idx]; } print "\n"; } } vcftools-0.1.15/src/perl/vcf-sort000077500000000000000000000063561307140004000166560ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; my $opts = parse_params(); sort_vcf($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-sort > out.vcf\n", " cat file.vcf | vcf-sort > out.vcf\n", "Options:\n", " -c, --chromosomal-order Use natural ordering (1,2,10,MT,X) rather then the default (1,10,2,MT,X). 
This requires\n", " new version of the unix \"sort\" command which supports the --version-sort option.\n", " -p, --parallel Change the number of sorts run concurrently to \n", " -t, --temporary-directory Use a directory other than /tmp as the temporary directory for sorting.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-p' || $arg eq '--parallel-sort' ) { $$opts{parallel_sort}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--chromosomal-order' ) { $$opts{chromosomal_order}=1; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-t' || $arg eq '--temporary-directory' ) { $$opts{temp_dir}=shift(@ARGV); next; } if ( -e $arg ) { $$opts{file}=$arg; next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub sort_vcf { my ($opts) = @_; my $fh; if ( exists($$opts{file}) ) { if ( $$opts{file}=~/\.gz$/i ) { open($fh,"gunzip -c $$opts{file} |") or error("$$opts{file}: $!"); } else { open($fh,'<',$$opts{file}) or error("$$opts{file}: $!"); } } else { $fh = *STDIN; } my $sort_opts = check_sort_options($opts); my $cmd; if ( exists($$opts{temp_dir}) ) { $cmd = "sort $sort_opts -T $$opts{temp_dir} -k2,2n"; } else { $cmd = "sort $sort_opts -k2,2n"; } print STDERR "$cmd\n"; open(my $sort_fh,"| $cmd") or error("$cmd: $!"); my $unflushed = select(STDOUT); $| = 1; while (my $line=<$fh>) { if ( $line=~/^#/ ) { print $line; next; } print $sort_fh $line; last; } select($unflushed); while (my $line=<$fh>) { print $sort_fh $line; } } sub check_sort_options { my ($opts) = @_; my $sort_opts = join('',`sort --help`); my $has_version_sort = ( $sort_opts=~/\s+--version-sort\s+/ ) ? 1 : 0; my $has_parallel_sort = ( $sort_opts=~/\s+--parallel=/ ) ? 
1 : 0; if ( $$opts{chromosomal_order} && !$has_version_sort ) { error("Old version of sort command installed, please run without the -c option.\n"); } if ( $$opts{parallel_sort} && !$has_version_sort ) { error("Old version of sort command installed, please run without the -p option.\n"); } $sort_opts = ( $$opts{chromosomal_order} && $has_version_sort ) ? '-k1,1V' : '-k1,1d'; if ( $$opts{parallel_sort} && $has_parallel_sort ) { $sort_opts .= " --parallel $$opts{parallel_sort}"; } return $sort_opts; } vcftools-0.1.15/src/perl/vcf-stats000077500000000000000000000114121307140004000170120ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use VcfStats; my $opts = parse_params(); vcf_stats($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-stats [OPTIONS] file.vcf.gz\n", "Options:\n", " -d, --dump Take an existing dump file and recreate the files (works with -p)\n", " -f, --filters List of filters such as column/field (any value), column/field=bin:max (cluster in bins),column/field=value (exact value)\n", " -p, --prefix Prefix of output files. If slashes are present, directories will be created.\n", " -s, --samples Process only the listed samples, - for none. 
Excluding unwanted samples may increase performance considerably.\n", " -h, -?, --help This help message.\n", "\n", "Examples:\n", " # Calculate stats separately for the filter field, quality and non-indels\n", " vcf-stats file.vcf.gz -f FILTER,QUAL=10:200,INFO/INDEL=False -p out/\n", "\n", " # Calculate stats for all samples\n", " vcf-stats file.vcf.gz -f FORMAT/DP=10:200 -p out/\n", "\n", " # Calculate stats only for the sample NA00001\n", " vcf-stats file.vcf.gz -f SAMPLE/NA00001/DP=1:200 -p out/\n", "\n", " vcf-stats file.vcf.gz > perl.dump\n", "\n"; } sub parse_params { my $opts = { filters=>{}, filter_param=>'' }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--dump' ) { $$opts{dump}=shift(@ARGV); next; } if ( $arg eq '-f' || $arg eq '--filters' ) { $$opts{filter_param}=shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } if ( $arg eq '-s' || $arg eq '--samples' ) { my $samples = shift(@ARGV); $$opts{samples} = [ split(/,/,$samples) ]; next; } if ( -e $arg ) { $$opts{file} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or nonexistent file: \"$arg\". Run -h for help.\n"); } if ( exists($$opts{dump}) && !exists($$opts{prefix}) ) { error("Expected -p option with -d.\n"); } return $opts; } sub init_filters { my ($opts,$vcf) = @_; for my $filter (split(/,/,$$opts{filter_param})) { my ($key,$value) = split(/=/,$filter); my $rec = { value=>$value, exact=>0, any=>0, bin=>0, is_flag=>0 }; if ( $key=~m{^INFO/} ) { my $tag = $'; $$rec{tag} = $tag; if ( exists($$vcf{header}{'INFO'}) && exists($$vcf{header}{'INFO'}{$tag}) && $$vcf{header}{'INFO'}{$tag}{Type} eq 'Flag' ) { $$rec{is_flag} = 1; $$rec{value} = $value eq 'False' ? 0 : 1; $key = "INFO/$tag=". ($$rec{value} ? 
'True':'False'); } } elsif ( $key eq 'INFO' ) { # All INFO flags should be counted for my $tag (keys %{$$vcf{header}{'INFO'}}) { if ( $$vcf{header}{'INFO'}{$tag}{Type} ne 'Flag' ) { next; } $$opts{filters}{"INFO/$tag=True"} = { %$rec, is_flag=>1, value=>1, tag=>$tag }; } next; } if ( ! defined $value ) { $$rec{any} = 1; } elsif ( $value=~/^(.+):(.+)$/ ) { $$rec{bin} = 1; $$rec{bin_size} = $1; $$rec{max} = $2; } else { $$rec{exact} = 1; } $$opts{filters}{$key} = $rec; } } sub vcf_stats { my ($opts) = @_; if ( exists($$opts{dump}) ) { # Use existing dump to recreate the files my $vcf = VcfStats->new(file=>'/dev/null'); $$vcf{stats} = do $$opts{dump}; $vcf->save_stats($$opts{prefix}); return; } # Open the VCF file my $vcf = $$opts{file} ? VcfStats->new(file=>$$opts{file}) : VcfStats->new(fh=>\*STDIN); $vcf->parse_header(); init_filters($opts,$vcf); # Include only requested samples if ( exists $$opts{samples} ) { my @include = (); if ( scalar @{$$opts{samples}}>1 or $$opts{samples}[0] ne '-' ) { for my $sample (@{$$opts{samples}}) { push @include,$sample; } } $vcf->set_samples(include=>\@include); } while (my $rec=$vcf->next_data_hash()) { $vcf->collect_stats($rec,$$opts{filters}); } $vcf->save_stats($$opts{prefix}); } vcftools-0.1.15/src/perl/vcf-subset000077500000000000000000000170321307140004000171650ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); vcf_subset($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-subset [OPTIONS] in.vcf.gz > out.vcf\n", "Options:\n", " -a, --trim-alt-alleles Remove alternate alleles if not found in the subset\n", " -c, --columns File or comma-separated list of columns to keep in the vcf file. 
If file, one column per row\n", " -e, --exclude-ref Exclude rows not containing variants.\n", " -f, --force Proceed anyway even if VCF does not contain some of the samples.\n", " -p, --private Print only rows where only the subset columns carry an alternate allele.\n", " -r, --replace-with-ref Replace the excluded types with reference allele instead of dot.\n", " -t, --type Comma-separated list of variant types to include: ref,SNPs,indels,MNPs,other.\n", " -u, --keep-uncalled Do not exclude rows without calls.\n", " -h, -?, --help This help message.\n", "Examples:\n", " cat in.vcf | vcf-subset -r -t indels -e -c SAMPLE1 > out.vcf\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { exclude_ref=>0, keep_uncalled=>0, replace_with_ref=>0, private=>0, args=>[$0, @ARGV] }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-t' || $arg eq '--type' ) { my %known = ( ref=>'r', SNPs=>'s', indels=>'i', MNPs=>'m', other=>'o' ); my $types = shift(@ARGV); for my $t (split(/,/,$types)) { if ( !(exists($known{$t})) ) { error("Unknown type [$t] with -t [$types]\n"); } $$opts{types}{$known{$t}} = 1; } next; } if ( $arg eq '-a' || $arg eq '--trim-alt-alleles' ) { $$opts{'trim_alts'} = 1; next } if ( $arg eq '-e' || $arg eq '--exclude-ref' ) { $$opts{'exclude_ref'} = 1; next } if ( $arg eq '-f' || $arg eq '--force' ) { $$opts{'force'} = 1; next } if ( $arg eq '-p' || $arg eq '--private' ) { $$opts{'private'} = 1; next } if ( $arg eq '-r' || $arg eq '--replace-with-ref' ) { $$opts{'replace_with_ref'} = 1; next } if ( $arg eq '-u' || $arg eq '--keep-uncalled' ) { $$opts{'keep_uncalled'} = 1; next } if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{'columns_file'} = shift(@ARGV); next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file} = $arg; next } if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !$$opts{exclude_ref} && !$$opts{'columns_file'} && !exists($$opts{'types'}) && !exists($$opts{trim_alts}) ) { error("Missing the -c or -t or -r option.\n") } return $opts; } sub read_columns { my ($fname) = @_; my @columns; if ( !-e $fname ) { @columns = split(/,/,$fname); return \@columns; } open(my $fh,'<',$fname) or error("$fname: $!"); while (my $line=<$fh>) { chomp($line); $line=~s/\s+//g; push @columns, $line; } close($fh); return \@columns; } sub check_columns { my ($opts,$vcf,$columns) = @_; my @out; for my $col (@$columns) { if ( exists($$vcf{has_column}{$col}) ) { push @out, $col; next; } my $msg = qq[No such column in the VCF file: "$col"\n]; if ( $$opts{force} ) { warn($msg); } else { error($msg); } } return \@out; } sub vcf_subset { my ($opts) = @_; my $vcf = $$opts{file} ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); my $AGtags; if ( $$opts{trim_alts} ) { $$vcf{trim_redundant_ALTs} = 1; $AGtags = $vcf->has_AGtags(); } # Init requested column info. If not present, include all columns. my $columns = exists($$opts{columns_file}) ? 
read_columns($$opts{columns_file}) : []; $columns = check_columns($opts,$vcf,$columns); if ( !@$columns && (my $ncols=@{$$vcf{columns}})>9 ) { push @$columns, @{$$vcf{columns}}[9..($ncols-1)]; } my $columns_to_keep = { map { $_ => 1 } @$columns }; my %has_col = map { $_ => 1 } @$columns; $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf->set_samples(include=>$columns) unless $$opts{private}; print $vcf->format_header($columns); my $check_private = $$opts{private}; while (my $x=$vcf->next_data_hash()) { my $site_has_call = 0; my $site_has_nonref = 0; my $site_is_private = 1; my $ref = $$x{REF}; for my $col (keys %{$$x{gtypes}}) { if ( !$has_col{$col} && ($site_is_private==0 || !$check_private) ) { # This column is not to be printed delete($$x{gtypes}{$col}); next; } my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); my $sample_has_call = 0; my $sample_has_nonref = 0; my @out_alleles; for (my $i=0; $i<@$alleles; $i++) { my ($type,$len,$ht) = $vcf->event_type($ref,$$alleles[$i]); $out_alleles[$i] = $$alleles[$i]; # Exclude unwanted variant types if requested if ( exists($$opts{types}) ) { if ( $type eq 's' && $len>1 ) { $type = 'm'; } elsif ( $type eq 'b' or $type eq 'u' ) { $type = 'o'; } if ( !exists($$opts{types}{$type}) ) { $out_alleles[$i] = $$opts{replace_with_ref} ? 
$ref : '.'; next; } $sample_has_call = 1; } elsif ( !$is_empty ) { $sample_has_call = 1; } if ( $type ne 'r' ) { $site_has_nonref = 1; $sample_has_nonref = 1; } } if ( $check_private && !$has_col{$col} ) { if ( $sample_has_nonref ) { $site_is_private=0; } delete($$x{gtypes}{$col}); next; } if ( !$sample_has_call ) { if ( $$opts{replace_with_ref} ) { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = $ref; } } else { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = '.'; } } } else { $site_has_call = 1; } $$x{gtypes}{$col}{GT} = $vcf->format_haplotype(\@out_alleles,$seps); } if ( !$site_has_call && !$$opts{keep_uncalled} ) { next; } if ( !$site_has_nonref && $$opts{exclude_ref} ) { next; } if ( $check_private && (!$site_is_private || !$site_has_nonref) ) { next; } if ( $$opts{trim_alts} && defined $AGtags ) { $vcf->remove_columns($x, keep=>$columns_to_keep); $vcf->parse_AGtags($x); } $vcf->format_genotype_strings($x); print $vcf->format_line($x,$columns); } } vcftools-0.1.15/src/perl/vcf-to-tab000077500000000000000000000047571307140004000170600ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); convert_to_tab($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-to-tab [OPTIONS] < in.vcf > out.tab\n", "Options:\n", " -h, -?, --help This help message.\n", " -i, --iupac Use one-letter IUPAC codes\n", "Notes:\n", " Please use `bcftools query` instead, this script will not be supported in future.\n", "\n"; } sub parse_params { my $opts = { iupac=>0 }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-i' || $arg eq '--iupac' ) { $$opts{iupac}=1; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( $$opts{iupac} ) { $$opts{iupac} = { 'GG' => 'G', 'CC' => 'C', 'TT' => 'T', 'AA' => 'A', 'GT' => 'K', 'TG' => 'K', 'AC' => 'M', 'CA' => 'M', 'CG' => 'S', 'GC' => 'S', 'AG' => 'R', 'GA' => 'R', 'AT' => 'W', 'TA' => 'W', 'CT' => 'Y', 'TC' => 'Y', '..' => '.', }; } return $opts; } sub convert_to_tab { my ($opts) = @_; my $iupac; if ( $$opts{iupac} ) { $iupac=$$opts{iupac}; } my $vcf = Vcf->new(fh=>\*STDIN); $vcf->parse_header(); my $header_printed=0; while (my $x=$vcf->next_data_hash()) { if ( !$header_printed ) { print "#CHROM\tPOS\tREF"; for my $col (sort keys %{$$x{gtypes}}) { print "\t$col"; } print "\n"; $header_printed = 1; } print "$$x{CHROM}\t$$x{POS}\t$$x{REF}"; for my $col (sort keys %{$$x{gtypes}}) { my ($al1,$sep,$al2) = exists($$x{gtypes}{$col}{GT}) ? $vcf->parse_alleles($x,$col) : ('.','/','.'); my $gt = $al1.'/'.$al2; if ( $iupac ) { $gt = $al1.$al2; if ( !exists($$iupac{$gt}) ) { error(qq[Unknown IUPAC code for "$al1$sep$al2" .. $$x{CHROM}:$$x{POS} $col\n]); } $gt = $$iupac{$gt}; } print "\t".$gt; } print "\n"; } } vcftools-0.1.15/src/perl/vcf-tstv000077500000000000000000000034341307140004000166610ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; my $opts = parse_params(); calc_tstv(); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "Usage: cat file.vcf | vcf-tstv\n", "Options:\n", " -h, -?, --help This help message.\n", "\n"; exit -1; } sub parse_params { my $opts = {}; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } return $opts; } sub calc_tstv { my $stats; my $n=0; my $multiallelic=0; while (my $line=) { if ( substr($line,0,1) eq '#' ) { next; } $n++; my $i=-1; for (1..3) { $i=index($line,"\t",$i+1); } my $j = index($line,"\t",$i+1); my $ref = substr($line,$i+1,$j-$i-1); if ( length($ref)>1 ) { next; } $i = index($line,"\t",$j+1); my $alt = substr($line,$j+1,$i-$j-1); if ( $alt eq '.' ) { next; } $i = index($alt,','); if ( $i!=-1 ) { $alt = substr($alt,0,$i); } # only first ALT is counted if ( length($alt)>1 ) { next; } if ( $i!=-1 ) { $multiallelic++ } $$stats{$ref.$alt}++; } my $ts = 0; for my $mut (qw(AG GA CT TC)) { if ( exists($$stats{$mut}) ) { $ts += $$stats{$mut}; } } my $tv = 0; for my $mut (qw(AC CA GT TG AT TA CG GC)) { if ( exists($$stats{$mut}) ) { $tv += $$stats{$mut}; } } my $ratio = $tv ? $ts/$tv : 0; printf "%.2f\t%d\t(ts=%d tv=%d total=%d skipped=%d multiallelic=%d)\n", $ratio,$ts+$tv, $ts,$tv,$n,$n-$ts-$tv,$multiallelic; } vcftools-0.1.15/src/perl/vcf-validator000077500000000000000000000064311307140004000176460ustar00rootroot00000000000000#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use IPC::Open3 'open3'; use IO::Select; my $opts = parse_params(); do_validation($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-validator [OPTIONS] file.vcf.gz\n", "Options:\n", " -d, --duplicates Warn about duplicate positions.\n", " -u, --unique-messages Output all messages only once.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { unique=>0, duplicates=>0 }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--duplicates' ) { $$opts{duplicates}=1; next; } if ( $arg eq '-u' || $arg eq '--unique-messages' ) { $$opts{unique}=1; next; } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( (-e $arg or $arg=~m{^(?:ftp|http)://}) && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } error("Unknown parameter or non-existent file: \"$arg\". Run -h for help.\n"); } return $opts; } sub do_validation { my ($opts) = @_; my %opts = $$opts{file} ? (file=>$$opts{file}) : (fh=>\*STDIN); if ( !$$opts{unique} ) { my $vcf = Vcf->new(%opts, warn_duplicates=>$$opts{duplicates}); $vcf->run_validation(); return; } my ($kid_in,$kid_out,$kid_err); my $pid = open3($kid_in,$kid_out,$kid_err,'-'); if ( !defined $pid ) { error("Cannot fork: $!"); } if ($pid) { $$opts{known_lines} = []; my $sel = new IO::Select; $sel->add($kid_out,$kid_err); while(my @ready = $sel->can_read) { foreach my $fh (@ready) { my $line = <$fh>; if (not defined $line) { $sel->remove($fh); next; } print_or_discard_line($opts,$line); } } print_summary($opts); } else { my $vcf = Vcf->new(%opts, warn_duplicates=>$$opts{duplicates}); $vcf->run_validation(); return; } } sub print_or_discard_line { my ($opts,$line) = @_; my @items = split(/\s+/,$line); my $nitems = scalar @items; for my $known (@{$$opts{known_lines}}) { if ( @items != @{$$known{line}} ) { next; } my $nmatches = 0; for (my $i=0; $i<$nitems; $i++) { if ( $items[$i] eq $$known{line}[$i] ) { $nmatches++ } } if ( $nitems-$nmatches<3 ) { $$known{n}++; return; } } push @{$$opts{known_lines}}, { line=>\@items, n=>1 }; print $line; } sub print_summary { my ($opts) = @_; my $n = 0; for my $error (@{$$opts{known_lines}}) { $n += $$error{n}; } print "\n\n------------------------\n"; print "Summary:\n"; printf "\t%d errors total \n\n", $n; $n = 0; for my $error (sort {$$b{n}<=>$$a{n}} @{$$opts{known_lines}}) { if ( $n++ > 50 ) { print "\n\nand more...\n"; last; } printf "\t%d\t..\t%s\n", $$error{n},join(' ',@{$$error{line}}); } }