vcftools_0.1.11/0000755000000000000000000000000012163074506012215 5ustar rootrootvcftools_0.1.11/perl/0000755000000000000000000000000012163074506013157 5ustar rootrootvcftools_0.1.11/perl/vcf-contrast0000755000000000000000000002113512156354770015526 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); query_vcf($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "About: Finds differences amongst samples adding NOVELGT, NOVELAL and NOVELTY annotations to INFO field.\n", " Note that haploid genotypes are internally treated as homozygous diploid genotypes, therefore\n", " \"0/1\" and \"1\" are considered different genotypes.\n", "Usage: vcf-contrast + - [OPTIONS] file.vcf.gz\n", "Options:\n", " + List of samples where unique variant is expected\n", " - List of background samples\n", " -d, --min-DP Minimum depth across all - samples\n", " -f, --apply-filters Skip sites with FILTER column different from PASS or \".\"\n", " -n, --novel-sites Print only records with novel genotypes\n", " -h, -?, --help This help message.\n", "Example:\n", " # Test if any of the samples A,B is different from all C,D,E\n", " vcf-contrast +A,B -C,D,E -m file.vcf.gz\n", "\n", " # Same as above but printing only sites with novel variants and table output\n", " vcf-contrast -n +A,B -C,D,E -m file.vcf.gz | vcf-query -f '\%CHROM \%POS\\t\%INFO/NOVELTY\\t\%INFO/NOVELAL\\t\%INFO/NOVELGT[\\t\%SAMPLE \%GTR \%PL]\\n'\n", "\n", " # Similar to above but require minimum mapping quality of 20\n", " vcf-annotate -f MinMQ=20 file.vcf.gz | vcf-contrast +A,B,C -D,E,F -f\n", "\n"; exit -1; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args=>[$0, @ARGV], }; while (defined(my $arg=shift(@ARGV))) { if ( -e $arg ) { $$opts{vcf}=$arg; next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-d' || $arg eq '--min-DP' ) { $$opts{min_dp}=shift(@ARGV); next; } if ( $arg eq '-n' || $arg eq '--novel-sites' ) { $$opts{novel_only}=1; next; } if ( $arg eq '-f' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; } if ( $arg=~/^\+/ && !exists($$opts{var_samples}) ) { @{$$opts{var_samples}}=split(/,/,$'); next } if ( $arg=~/^-/ && !exists($$opts{bg_samples}) ) { @{$$opts{bg_samples}}=split(/,/,$'); next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{var_samples}) ) { error("Missing the list of variant samples (+).\n") } if ( !exists($$opts{bg_samples}) ) { error("Missing the list of background samples (-).\n") } return $opts; } sub init_columns { my ($vcf,@samples) = @_; my @out; for my $sample (@samples) { push @out, $vcf->get_column_index($sample); } return \@out; } sub query_vcf { my ($opts) = @_; my $vcf = exists($$opts{vcf}) ? Vcf->new(file=>$$opts{vcf}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header; $vcf->add_header_line({key=>'INFO',ID=>'NOVELAL',Number=>'.',Type=>'String',Description=>'List of samples with novel alleles'}); $vcf->add_header_line({key=>'INFO',ID=>'NOVELGT',Number=>'.',Type=>'String',Description=>'List of samples with novel genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'NOVELTY',Number=>'1',Type=>'Integer',Description=>'vcf-contrast novelty score'}); $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print $vcf->format_header(); $$opts{var_cols} = init_columns($vcf,@{$$opts{var_samples}}); $$opts{bg_cols} = init_columns($vcf,@{$$opts{bg_samples}}); while (my $rec=$vcf->next_data_array) { if ( $$opts{apply_filters} && $$rec[6] ne '.' && $$rec[6] ne 'PASS' ) { next; } if ( $$rec[4] eq '.' 
) { next; } my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my ($novel,$novelal,$novelgt) = contrast($opts,$vcf,$rec); if ( $novel ) { my %info = ( NOVELTY=>$novel ); if ( scalar keys %$novelal ) { my @tmp; for my $col (keys %$novelal) { push @tmp, $$vcf{columns}[$col]; } $info{NOVELAL} = join(',',@tmp); } elsif ( scalar keys %$novelgt ) { my @tmp; for my $col (keys %$novelgt) { push @tmp, $$vcf{columns}[$col]; } $info{NOVELGT} = join(',',@tmp); } $$rec[7]=$vcf->add_info_field($$rec[7],%info); } elsif ( $$opts{novel_only} ) { next; } print $vcf->format_line($rec); } } sub contrast { my ($opts,$vcf,$rec) = @_; my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my $has_PL = $ipl<0 ? 0 : 1; my $igt; if ( !$has_PL ) { $igt = $vcf->get_tag_index($$rec[8],'GT',':'); if ( $igt<0 ) { error("GT not available: $$rec[0]:$$rec[1]\n"); } } my $idp; if ( exists($$opts{min_dp}) ) { $idp = $vcf->get_tag_index($$rec[8],'DP',':'); if ( $idp<0 ) { error("todo: DP not available"); } } my @x = split(/,/, $$rec[4]); my $n_als = 1 + scalar @x; my (@bg_pls, @bg_als, @bg_gts, @var_pls,@var_gts, $min_dp); for my $bg_col (@{$$opts{bg_cols}}) { if ( defined $idp ) { my $dp = $vcf->get_field($$rec[$bg_col],$idp); if ( !defined $min_dp or $min_dp>$dp ) { $min_dp=$dp; } } my @gt; if ( $has_PL ) { my $pl = $vcf->get_field($$rec[$bg_col],$ipl); ($pl, @gt) = likely_gt($pl, $n_als); push @bg_pls, $pl; } else { my $gt = $vcf->get_field($$rec[$bg_col],$igt); @gt = $vcf->split_gt($gt); } push @bg_als, \@gt; push @bg_gts, join('/',sort(@gt)); } if ( defined $min_dp && $min_dp<$$opts{min_dp} ) { return undef; } my %novel_gt; my %novel_al; my $min_score; for my $var_col (@{$$opts{var_cols}}) { my (@var_als,$var_pl); if ( $has_PL ) { $var_pl = $vcf->get_field($$rec[$var_col],$ipl); ($var_pl,@var_als) = likely_gt($var_pl, $n_als); @var_als = sort @var_als; push @var_pls, $var_pl; } else { my $gt = $vcf->get_field($$rec[$var_col],$igt); @var_als = sort($vcf->split_gt($gt)); } my $var_gt = 
join('/',sort(@var_als)); push @var_gts, $var_gt; my $bg_score; my %als; for (my $i=0; $i<@{$$opts{bg_cols}}; $i++) { my $score; if ( $has_PL ) { if ( $var_pls[0] eq '.' or substr($bg_pls[$i],0,1) eq '.' ) { next; } $score = same_pls($var_pl, $bg_pls[$i]); } else { if ( $var_als[0] eq '.' or $bg_als[$i][0] eq '.' ) { next; } $score = same_gts(\@var_als, $bg_als[$i]); } if ( !defined $bg_score or $score<$bg_score ) { $bg_score = $score; } for my $al (@{$bg_als[$i]}) { $als{$al} = 1; } if ( $var_gt ne $bg_gts[$i] ) { $novel_gt{$var_col} = 1; } } if ( !$bg_score ) { next; } if ( !defined $min_score or $min_score>$bg_score ) { $min_score = $bg_score; } for my $al (@var_als) { if ( !exists($als{$al}) ) { $novel_al{$var_col} = 1; } } } if ( !$min_score ) { return undef; } if ( !scalar keys %novel_gt && !scalar keys %novel_al ) { return undef; } return ($min_score,\%novel_al,\%novel_gt); } sub likely_gt { my ($pl, $nals) = @_; my @pls = split(/,/,$pl); my ($min,$imin,$jmin); if ( $nals==@pls ) { # haploid: treat as fake diploid my @out_pls; $min = $pls[0]; $imin = 0; for (my $i=1; $i<@pls; $i++) { if ( $min>$pls[$i] ) { $min = $pls[$i]; $imin = $i; } } for (my $i=0; $i<$nals; $i++) { for (my $j=0; $j<$i; $j++) { push @out_pls,255; } push @out_pls, $pls[$i]; } return (join(',',@out_pls), $imin,$imin); } # diploid my $idx=0; my $i = 0; while ($idx<@pls) { if ( $pls[$idx] eq '.' 
) { return '.'; } for (my $j=0; $j<=$i; $j++) { if ( $idx>=@pls ) { error("Unexpected number of PL values with n_als=$nals: $pl\n"); } if ( !defined $min or $min>$pls[$idx] ) { $min=$pls[$idx]; $imin=$i; $jmin=$j; } $idx++; } $i++; } return ($pl,$jmin,$imin); } sub same_pls { my ($pla,$plb) = @_; my @pla = split(/,/,$pla); my @plb = split(/,/,$plb); my $min; my $imin; for (my $i=0; $i<@pla; $i++) { if ( !defined $min or $pla[$i]+$plb[$i]<$min ) { $min=$pla[$i]+$plb[$i]; $imin=$i; } } return $min; } sub same_gts { my ($gta,$gtb) = @_; if ( @$gta != @$gtb ) { return 255; } for (my $i=0; $i<@$gta; $i++) { if ( $$gta[$i] ne $$gtb[$i] ) { return 255; } } return 0; } vcftools_0.1.11/perl/tab-to-vcf0000755000000000000000000000442212156354770015057 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); tab_to_vcf(); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: tab-to-vcf [OPTIONS]\n", "Options:\n", " -i, --id The column ID.\n", " -r, --ref The reference sequence (optional).\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-i' || $arg eq '--id' ) { $$opts{id} = shift(@ARGV); next } if ( $arg eq '-r' || $arg eq '--ref' ) { $$opts{refseq} = shift(@ARGV); next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{id}) ) { error("Missing the -i option.\n") } return $opts; } sub tab_to_vcf { my ($data,$prefix) = @_; my $refseq = $$opts{refseq} ? 
FaSlice->new(file=>$$opts{refseq},size=>1_000_000) : undef; my $id = $$opts{id}; my $vcf_out = Vcf->new(); $vcf_out->add_columns($id); $vcf_out->add_header_line({key=>'FORMAT',ID=>'GT',Number=>'1',Type=>'String',Description=>"Genotype"}); print $vcf_out->format_header(); while (my $line=) { if ( $line=~/^#/ ) { next; } # 11 86881024 CT my @items = split(/\t/,$line); if ( $items[2] eq '*' ) { next; } my $chr = $items[0]; my $pos = $items[1]; my $snp = $items[2]; if ( !($pos=~/^\d+$/) ) { error("Could not parse the line: $line"); } if ( !($snp=~/^([ACGT])([ACGT])$/) ) { error("Could not parse the line: $line"); } $snp = "$1/$2"; my %out; $out{CHROM} = $chr; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{REF} = $refseq->get_base($chr,$pos); $out{QUAL} = '.'; $out{FILTER} = ['.']; $out{FORMAT} = ['GT']; $out{gtypes}{$id}{GT} = $snp; $vcf_out->format_genotype_strings(\%out); print $vcf_out->format_line(\%out); } } vcftools_0.1.11/perl/vcf-fix-newlines0000755000000000000000000000437212156354770016305 0ustar rootroot#!/usr/bin/env perl # # Authors: Adam Auton, Petr Danecek # (C) 2011 use strict; use warnings; use Carp; my $opts = parse_params(); fix_file($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Reads in a VCF file with any (commonly used) newline representation and outputs with the\n", " current system's newline representation.\n", "Usage: vcf-fix-newlines [OPTIONS]\n", "Options:\n", " -i, --info Report if the file is consistent with the current platform based.\n", " -h, -?, --help This help message.\n", "Example:\n", " vcf-fix-newlines -i file.vcf\n", " vcf-fix-newlines file.vcf.gz > out.vcf\n", " cat file.vcf | vcf-fix-newlines > out.vcf\n", "\n"; } sub parse_params { my $opts = {}; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-i' || $arg eq '--info' ) { $$opts{info}=1; next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( !exists($$opts{vcf}) && -e $arg ) { $$opts{vcf}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub fix_file { my ($opts) = @_; my $fh = \*STDIN; if ( $$opts{vcf} ) { if ( $$opts{vcf}=~/\.gz/i ) { open($fh,"gunzip -c $$opts{vcf} |") or error("gunzip -c $$opts{vcf}: $!\n"); } else { open($fh,'<',$$opts{vcf}) or error("$$opts{vcf}: $!\n"); } } # Read a small 1kb sample binmode $fh or error("binmode: $!"); local $/ = \1024; my $buf = <$fh>; if ( !defined $buf ) { error("No data read.\n"); } # Check the origin my ($in,$nl); if ( $buf=~/\015\012/ ) { $in = 'Windows'; $nl=$&; } elsif ( $buf=~/\015/ && !($buf=~/\012/) ) { $in = 'Old Mac'; $nl=$&; } elsif ( $buf=~/\012/ && !($buf=~/\015/) ) { $in = 'UNIX'; $nl=$&; } else { error("FIXME: Unable to determine the system which produced the file.\n"); } if ( defined $in ) { warn("The file was generated on $in compatible system.\n"); } if ( $$opts{info} ) { close($fh); return; } if ( $nl eq "\n" ) { warn("No conversion needed.\n"); return; } # Read the file and do the conversion local $/ = $nl; $buf .= <$fh>; $buf =~ s/$nl/\n/g; print $buf; while($buf = <$fh>) { $buf =~ s/$nl/\n/g; print $buf; } close($fh); } vcftools_0.1.11/perl/vcf-concat0000755000000000000000000002531012156354770015137 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); if ( $$opts{check_columns} ) { check_columns($opts); } elsif ( !exists($$opts{sort}) ) { concat($opts); } else { concat_merge($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Convenience tool for concatenating VCF files (e.g. VCFs split by chromosome).\n", " In the basic mode it does not do anything fancy except for a sanity check that all\n", " files have the same columns. 
When run with the -s option, it will perform a partial\n", " merge sort, looking at limited number of open files simultaneously.\n", "Usage: vcf-concat [OPTIONS] A.vcf.gz B.vcf.gz C.vcf.gz > out.vcf\n", "Options:\n", " -c, --check-columns Do not concatenate, only check if the columns agree.\n", " -f, --files Read the list of files from a file.\n", " -p, --pad-missing Write '.' in place of missing columns. Useful for joining chrY with the rest.\n", " -s, --merge-sort Allow small overlaps in N consecutive files.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { files=>[] }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-p' || $arg eq '--pad-missing' ) { $$opts{pad_missing}=1; next; } if ( $arg eq '-s' || $arg eq '--merge-sort' ) { $$opts{sort}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--check-columns' ) { $$opts{check_columns}=1; next; } if ( $arg eq '-f' || $arg eq '--files' ) { my $files = shift(@ARGV); open(my $fh,'<',$files) or error("$files: $!"); while (my $line=<$fh>) { chomp($line); push @{$$opts{files}},$line; } close($fh); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{files}},$arg; next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( ! @{$$opts{files}} ) { error("No files to concat?\n") } return $opts; } sub can_be_padded { my ($opts,$cols1,$cols2) = @_; if ( @$cols1<@$cols2 ) { error(sprintf "Not ready for this, sorry, expected fewer columns (%d!<%d)", @$cols1,@$cols2); } my $has1 = {}; my $has2 = {}; for (my $i=0; $i<@$cols1; $i++) { $$has1{$$cols1[$i]} = $i; } for (my $i=0; $i<@$cols2; $i++) { if ( !exists($$has1{$$cols2[$i]}) ) { error("The column [$$cols2[$i]] not seen previously."); } $$has2{$$cols2[$i]} = $i; } my @map; for (my $i=0; $i<@$cols1; $i++) { my $cname = $$cols1[$i]; push @map, exists($$has2{$cname}) ? 
$$has2{$cname} : -1; } return \@map; } sub check_columns { my ($opts) = @_; my @columns; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); if ( @columns ) { my $different_order; my $different_columns; if ( @columns != @{$$vcf{columns}} ) { warn("Different number of columns in [$file].\n"); } if ( $$opts{pad_missing} && can_be_padded($opts,\@columns,$$vcf{columns}) ) { next; } for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { if ( !exists($$vcf{has_column}{$columns[$i]}) ) { warn("The column names do not match; the column \"$columns[$i]\" no present in [$file].\n"); $different_columns = $columns[$i]; } elsif ( !defined $different_order ) { $different_order = $columns[$i]; } } } if ( defined $different_order && !defined $different_columns ) { warn("The columns ordered differently in [$file]. Use vcf-shuffle-cols to reorder.\n"); } } else { @columns = @{$$vcf{columns}}; } $vcf->close(); } } sub concat { my ($opts) = @_; my @columns; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); my $map; if ( @columns ) { if ( @columns != @{$$vcf{columns}} ) { if ( !$$opts{pad_missing} ) { error(sprintf "Different number of columns in [%s], expected %d, found %d\n", $file,scalar @columns,scalar @{$$vcf{columns}}); } $map = can_be_padded($opts,\@columns,$$vcf{columns}); } else { my $different_order; for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { if ( !exists($$vcf{has_column}{$columns[$i]}) ) { error("The column names do not match; the column \"$columns[$i]\" no present in [$file].\n"); } elsif ( !defined $different_order ) { $different_order = $columns[$i]; } } } if ( defined $different_order ) { error("The columns ordered differently in [$file]. 
Use vcf-shuffle-cols to reorder.\n"); } } } else { @columns = @{$$vcf{columns}}; print $vcf->format_header(); } while (my $line=$vcf->next_line()) { if ( defined $map ) { my @line = split(/\t/,$line); chomp($line[-1]); my @out; for my $idx (@$map) { if ( $idx==-1 ) { push @out,'.'; } else { push @out,$line[$$map[$idx]] } } print join("\t",@out),"\n"; } else { print $line; } } } } sub get_chromosomes { my ($files) = @_; my @out; my %has_chrm; for my $file (@$files) { my $vcf = Vcf->new(file=>$file); my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrm{$chr}) ) { next; } $has_chrm{$chr} = 1; push @out,$chr; } } return \@out; } sub concat_merge { my ($opts) = @_; my $header_printed = 0; my $chroms = get_chromosomes($$opts{files}); for my $chr (@$chroms) { my $reader = Reader->new(files=>$$opts{files},nsort=>$$opts{sort},seq=>$chr,header_printed=>$header_printed); $header_printed = 1; $reader->open_next(); while (1) { my $line = $reader->next_line(); if ( !defined $line ) { if ( !$reader->open_next() ) { last; } next; } print $line; } } if ( !$header_printed ) { my $vcf = Vcf->new(file=>$$opts{files}[0]); $vcf->parse_header(); print $vcf->format_header(); } } #--------------------------------- package Reader; use strict; use warnings; use Carp; use Vcf; sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( !$$self{files} ) { $self->throw("Expected the files option.\n"); } if ( !$$self{nsort} ) { $$self{nsort} = 2; } if ( $$self{nsort}>@{$$self{files}} ) { $$self{nsort} = scalar @{$$self{files}}; } $$self{idxs} = undef; $$self{vcfs} = undef; return $self; } sub throw { my ($self,@msg) = @_; confess @msg; } sub print_header { my ($self,$vcf) = @_; if ( $$self{header_printed} ) { return; } print $vcf->format_header(); $$self{header_printed} = 1; } # Open VCF, parse header, check column names and when callled for the first time, output the VCF header. 
sub open_vcf { my ($self,$file) = @_; my $vcf = Vcf->new(file=>$file,region=>$$self{seq},print_header=>1); $vcf->parse_header(); if ( !exists($$self{columns}) ) { $$self{columns} = [ @{$$vcf{columns}} ]; } else { if ( @{$$self{columns}} != @{$$vcf{columns}} ) { $self->throw("Different number of columns in [$file].\n"); } for (my $i=0; $i<@{$$self{columns}}; $i++) { if ( $$vcf{columns}[$i] ne $$self{columns}[$i] ) { $self->throw("The column names do not agree in [$file].\n"); } } } $self->print_header($vcf); return $vcf; } sub open_next { my ($self) = @_; if ( !defined $$self{idxs} ) { for (my $i=0; $i<$$self{nsort}; $i++) { $$self{idxs}[$i] = $i; } } else { my $prev = $$self{idxs}[-1]; shift(@{$$self{idxs}}); shift(@{$$self{vcfs}}); if ( $prev+1 < @{$$self{files}} ) { # New file to be opened push @{$$self{idxs}}, $prev+1; } } for (my $i=0; $i<@{$$self{idxs}}; $i++) { if ( exists($$self{vcfs}[$i]) ) { next; } my $idx = $$self{idxs}[$i]; $$self{vcfs}[$i] = $self->open_vcf($$self{files}[$idx]); } if ( !@{$$self{idxs}} ) { return 0; } return 1; } sub next_line { my ($self) = @_; my $min = $$self{vcfs}[0]->next_line(); if ( !defined $min ) { return undef; } if ( !($min=~/^(\S+)\t(\d+)/) ) { $self->throw("Could not parse the line: $min\n"); } my $min_chr = $1; my $min_pos = $2; my $min_vcf = $$self{vcfs}[0]; for (my $i=1; $i<@{$$self{vcfs}}; $i++) { if ( !exists($$self{vcfs}[$i]) ) { next; } my $line = $$self{vcfs}[$i]->next_line(); if ( !defined $line ) { next; } if ( !($line=~/^(\S+)\t(\d+)/) ) { $self->throw("Could not parse the line: $line\n"); } my $chr = $1; my $pos = $2; if ( $chr ne $min_chr ) { $self->throw("FIXME: When run with the -s option, only one chromosome can be present.\n"); } if ( $min_pos > $pos ) { $min_pos = $pos; $min_vcf->_unread_line($min); $min_vcf = $$self{vcfs}[$i]; $min = $line; } else { $$self{vcfs}[$i]->_unread_line($line); } } return $min; } vcftools_0.1.11/perl/FaSlice.pm0000644000000000000000000001400412156354770015030 0ustar rootroot# 
Author: petr.danecek@sanger # =head1 NAME FaSlice.pm. Module for cached access to fasta sequences, employs samtools faidx. =head1 SYNOPSIS use FaSlice; my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345); $fa->get_slice(1,12345,54321); =cut package FaSlice; use strict; use warnings; use Carp; =head2 new About : Creates new FaSlice object. Usage : my $fa = FaSlice->new(file=>'ref.fa'); Args : file .. the fasta file oob .. out-of-bounds requests: one of 'throw' (throws), 'N' (fills the missing bases by Ns), or '' (returns empty string, default) size .. size of the cached chunk read by samtools faidx (1_000_000) =cut sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( !$$self{file} ) { $self->throw("Missing the parameter file\n"); } $$self{chr} = undef; $$self{from} = undef; $$self{to} = undef; if ( !$$self{size} ) { $$self{size}=1_000_000; } $$self{ncache_missed} = 0; $$self{nqueries} = 0; if ( !exists($$self{oob}) ) { $$self{oob}=''; } if ( $$self{oob} ne '' && $$self{oob} ne 'throw' && $$self{oob} ne 'N' ) { $self->throw("The value of oob not recognised: [$$self{oob}]"); } $self->chromosome_naming($$self{file}); return $self; } sub throw { my ($self,@msg) = @_; confess(@msg); } sub cmd { my ($self,$cmd) = @_; my @out = `$cmd`; if ( $? ) { my @msg = (); push @msg, qq[The command "$cmd" returned non-zero status $?]; if ( $! ) { push @msg, ": $!\n"; } else { push @msg, ".\n"; } if ( scalar @out ) { push @msg, @out; } $self->throw(@msg); } return (@out); } # Read the first file of the fasta file and make a guess: Are all chromosomes # names as 'chr1','chr2',etc or just '1','2',...? # Future TODO: more robust chromosome name mapping? sub chromosome_naming { my ($self,$fa_file) = @_; open(my $fh,'<',"$fa_file.fai") or $self->throw("$fa_file.fai: $!"); my $line=<$fh>; if ( !($line=~/^(chr)?\S+\t/) ) { chomp($line); $self->throw("FIXME: the sequence names not in '>(chr)?\\S+' format [$line] ... 
$fa_file.fai\n"); } close($fh); $$self{chr_naming} = defined $1 ? $1 : ''; } sub read_chunk { my ($self,$chr,$pos) = @_; $$self{chr} = $chr; $chr =~ s/^chr//; $chr = $$self{chr_naming}.$chr; my $to = $pos + $$self{size}; my $cmd = "samtools faidx $$self{file} $chr:$pos-$to"; my @out = $self->cmd($cmd) or $self->throw("$cmd: $!"); my $line = shift(@out); if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); } $$self{from} = $1; my $chunk = ''; while ($line=shift(@out)) { chomp($line); $chunk .= $line; } $$self{to} = $$self{from} + length($chunk) - 1; $$self{chunk} = $chunk; $$self{ncache_missed}++; return; } =head2 get_base About : Retrieves base at the given chromosome and position Usage : my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345); Args : chromosome 1-based coordinate =cut sub get_base { my ($self,$chr,$pos) = @_; if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} ) { $self->read_chunk($chr,$pos); } $$self{nqueries}++; my $idx = $pos - $$self{from}; if ( $$self{from}>$$self{to} ) { if ( $$self{oob} eq '' ) { return ''; } elsif ( $$self{oob} eq 'N' ) { return 'N'; } $self->throw("No such site $chr:$pos in $$self{file}\n"); } return substr($$self{chunk},$idx,1); } =head2 get_slice About : Retrieves region Usage : my $fa = FaSlice->new(file=>'ref.fa'); $fa->get_base(1,12345,54321); Args : chromosome 1-based coordinate =cut sub get_slice { my ($self,$chr,$from,$to) = @_; if ( $to-$from >= $$self{size} ) { $self->throw("Too big region requested, $from-$to >= $$self{size}\n"); } if ( $from>$to ) { $self->throw("Expected $from>$to\n"); } if ( !$$self{chr} || $chr ne $$self{chr} || $from<$$self{from} || $to>$$self{to} ) { $self->read_chunk($chr,$from); } $$self{nqueries}++; if ( $$self{from}>$$self{to} || $$self{from}>$from || $$self{to}<$to ) { if ( $$self{oob} eq 'throw' ) { $self->throw("The region out of bounds $chr:$from-$to in $$self{file}\n"); } elsif ( $$self{oob} eq '' ) { return ''; } if 
( $$self{from}>$$self{to} ) { return 'N' x ($to-$from+1); } if ( $$self{from}>$to ) { $self->throw("FIXME: this shouldn't happen $chr:$from-$to .. $$self{from},$$self{to} .. $$self{file}"); } my $lfill = ''; my $rfill = ''; if ( $$self{from}>$from ) { $lfill = 'N' x ($$self{from}-$from); $from=$$self{from}; } if ( $$self{to}<$to ) { $rfill = 'N' x ($to-$$self{to}); $to=$$self{to}; } return $lfill . substr($$self{chunk},$from-$$self{from},$to-$from+1) . $rfill; } return substr($$self{chunk},$from-$$self{from},$to-$from+1); } # http://www.illumina.com/documents/products/technotes/technote_topbot.pdf sub illumina_alleles_TOP_to_ref { my ($self,$a1,$a2,$chr,$pos,$ref) = @_; my %map = (A=>'T', C=>'G', G=>'C', T=>'A'); my %top = ( A=>{A=>-2,C=> 1,G=> 1,T=>-1}, C=>{A=> 1,C=>-2,G=>-1,T=> 0}, G=>{A=> 1,C=>-1,G=>-2,T=> 0}, T=>{A=>-1,C=> 0,G=> 0,T=>-2} ); my $stat = $top{$a1}{$a2}; if ( $stat==-2 ) { $self->throw("Expected two different bases, got $a1 and $a2.\n"); } if ( $stat==-1 ) { # Now we should do the sequence walking to see if the reference is TOP or BOT, # but we do not this in ill-to-vcf: C/G would become G/C and A/T would become T/A. 
return ($a1,$a2); } if ( $stat==0 ) { $self->throw("Expected Illumina TOP, got $a1 and $a2.\n"); } if ( $ref eq $a1 or $ref eq $a2 ) { return ($a1,$a2); } return ($map{$a1},$map{$a2}); } 1; vcftools_0.1.11/perl/Makefile0000644000000000000000000000111212156354770014620 0ustar rootroot BIN = fill-aa fill-an-ac fill-fs fill-ref-md5 vcf-annotate vcf-compare vcf-concat \ vcf-consensus vcf-contrast vcf-convert vcf-fix-ploidy vcf-indel-stats vcf-isec vcf-merge vcf-phased-join \ vcf-query vcf-shuffle-cols vcf-sort vcf-stats vcf-subset vcf-to-tab vcf-tstv vcf-validator MOD = FaSlice.pm Vcf.pm VcfStats.pm install: @for i in $(BIN); do cp $(CURDIR)/$$i $(BINDIR)/$$i; done; \ for i in $(MOD); do cp $(CURDIR)/$$i $(MODDIR)/$$i; done; clean: @for i in $(BIN); do rm -f $(BINDIR)/$$i; done; \ for i in $(MOD); do rm -f $(MODDIR)/$$i; done; vcftools_0.1.11/perl/fill-aa0000755000000000000000000001362312156354770014425 0ustar rootroot#!/usr/bin/env perl # # Notes: # * The AA files can be downloaded from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments # * The program runs samtools, therefore the AA files must be gzipped (not b2zipped). # # support: pd3@sanger use strict; use warnings; use Carp; use Vcf; use FindBin; use lib "$FindBin::Bin"; use FaSlice; my $opts = parse_params(); fill_aa($opts,$$opts{aa_file}); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: This script fills ancestral alleles into INFO column of VCF files. It depends on samtools,\n", " therefore the fasta sequence must be gzipped (not bgzipped!) and indexed by samtools faidx.\n", " The AA files can be downloaded from\n", " ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments\n", " and processed as shown in the example below. 
This is because the sequences in the original files\n", " are named as 'ANCESTOR_for_chromosome:NCBI36:1:1:247249719', but the underlying FaSplice.pm\n", " requires names as 'chr1' or '1'.\n", "Usage: fill-aa [OPTIONS] < in.vcf >out.vcf\n", "Options:\n", " -a, --ancestral-allele Prefix to ancestral allele chromosome files.\n", " -t, --type Variant types to process: all,indel,ref,snp. [all]\n", " -h, -?, --help This help message.\n", "Example:\n", " # Get the files ready: compress by gzip and index by samtools faidx. Either repeat the\n", " # following command for each file manually\n", " bzcat human_ancestor_1.fa.bz2 | sed 's,^>.*,>1,' | gzip -c > human_ancestor_1.fa.gz\n", " samtools faidx human_ancestor_1.fa.gz\n", " \n", " # .. or use this loop (tested in bash shell)\n", " ls human_ancestor_*.fa.bz2 | while read IN; do\n", " OUT=`echo \$IN | sed 's,bz2\$,gz,'`\n", " CHR=`echo \$IN | sed 's,human_ancestor_,, ; s,.fa.bz2,,'`\n", " bzcat \$IN | sed \"s,^>.*,>\$CHR,\" | gzip -c > \$OUT\n", " samtools faidx \$OUT\n", " done\n", " \n", " # After this has been done, the following command should return 'TACGTGGcTGCTCTCACACAT'\n", " samtools faidx human_ancestor_1.fa.gz 1:1000000-1000020\n", " \n", " # Now the files are ready to use with fill-aa. 
Note that the VCF file\n", " # should be sorted (see vcf-sort), otherwise the performance would be seriously\n", " # affected.\n", " cat file.vcf | fill-aa -a human_ancestor_ 2>test.err | gzip -c >out.vcf.gz \n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-a' || $arg eq '--ancestral-allele' ) { $$opts{aa_file} = shift(@ARGV); next } if ( $arg eq '-t' || $arg eq '--type' ) { my %known = ( snp=>'s', indel=>'i', all=>'a', ref=>'r' ); my $types = shift(@ARGV); for my $t (split(/,/,$types)) { if ( !(exists($known{$t})) ) { error("Unknown type [$t] with -t [$types]\n"); } $$opts{types}{$known{$t}} = 1; } if ( exists($$opts{types}{a}) ) { $$opts{types}{s} = 1; $$opts{types}{i} = 1; $$opts{types}{r} = 1; } next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{aa_file}) ) { error("Missing the -a option.\n") } return $opts; } sub fill_aa { my ($opts,$aa_fname) = @_; my $n_unknown = 0; my $n_filled_sites = 0; my $n_filled_bases = 0; my $vcf = Vcf->new(fh=>\*STDIN, assume_uppercase=>1); $vcf->parse_header(); $vcf->add_header_line({key=>'INFO',ID=>'AA',Number=>1,Type=>'String', Description=>'Ancestral Allele, ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/technical/reference/ancestral_alignments/README'}); print $vcf->format_header(); my $fa; my $nskipped = 0; while (my $line = $vcf->next_line() ) { my $rec = $vcf->next_data_array($line); my $chr = $$rec[0]; my $pos = $$rec[1]; my $ref = $$rec[3]; my $fname = $aa_fname; if ( ! 
-e $fname ) { if ( -e "$fname$chr.fa.gz" ) { $fname = "$fname$chr.fa.gz"; } else { error(qq[Neither "$fname" nor "$fname$chr.fa.gz" exists.\n]); } } if ( !$fa or $$fa{file} ne $fname ) { $fa = FaSlice->new(file=>$fname, size=>100_000); } my $ref_len = length($ref); if ( exists($$opts{types}) && !exists($$opts{types}{a}) ) { my $ok = 0; for my $alt (split(/,/,$$rec[4])) { my ($type,$len,$ht) = $vcf->event_type($ref,$alt); if ( exists($$opts{types}{$type}) ) { $ok=1; last; } } if ( !$ok ) { print $line; $nskipped++; next; } } my $aa = $ref_len==1 ? $fa->get_base($chr,$pos) : $fa->get_slice($chr,$pos,$pos+$ref_len-1); if ( $aa ) { $$rec[7] = $vcf->add_info_field($$rec[7],'AA'=>$aa); $n_filled_sites++; $n_filled_bases+=$ref_len; } else { $$rec[7] = $vcf->add_info_field($$rec[7],'AA'=>'.'); $n_unknown++; } print join("\t",@$rec),"\n"; } print STDERR "AA sites filled .. $n_filled_sites\n", "AA bases filled .. $n_filled_bases\n", "No AAs .. $n_unknown\n", "Lines skipped .. $nskipped\n"; } vcftools_0.1.11/perl/vcf-compare0000755000000000000000000015772212156354770015333 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); if ( exists($$opts{plot}) ) { plot_stats($opts); } else { compare_vcfs($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Compare bgzipped and tabix indexed VCF files. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n", "Usage: vcf-compare [OPTIONS] file1.vcf file2.vcf ...\n", " vcf-compare -p plots chr1.cmp chr2.cmp ...\n", "Options:\n", " -a, --apply-filters Ignore lines where FILTER column is anything else than PASS or '.'\n", " -c, --chromosomes Same as -r, left for backward compatibility. Please do not use as it will be dropped in the future.\n", " -d, --debug Debugging information. 
Giving the option multiple times increases verbosity\n", " -g, --cmp-genotypes Compare genotypes, not only positions\n", " --ignore-indels Exclude sites containing indels from genotype comparison\n", " -m, --name-mapping Use with -g when comparing files with differing column names. The argument to this options is a\n", " comma-separated list or one mapping per line in a file. The names are colon separated and must\n", " appear in the same order as the files on the command line.\n", " --INFO [] Calculate genotype errors by INFO. Use zero based indecies if field has more than one value. Can be\n", " given multiple times.\n", " -p, --plot Create plots. Multiple files (e.g. per-chromosome outputs from vcf-compare) can be given.\n", " -R, --refseq Compare the actual sequence, not just positions. Use with -w to compare indels.\n", " -r, --regions Process the given regions (comma-separated list or one region per line in a file).\n", " -s, --samples Process only the listed samples. Excluding unwanted samples may increase performance considerably.\n", " -t, --title Title for graphs (see also -p)\n", " -w, --win In repetitive sequences, the same indel can be called at different positions. 
Consider\n", " records this far apart as matching (be it a SNP or an indel).\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], positions => 0, INFOgroup => [ ], INFOgroupIdx => { }, }; while (my $arg=shift(@ARGV)) { if ( $arg eq '--all-samples-af' ) { $$opts{all_samples_af}=1; next; } if ( $arg eq '--INFO/AF1-af' ) { $$opts{INFO_AF1_af}=1; next; } if ( $arg eq '--ignore-indels' ) { $$opts{ignore_indels}=1; next; } if ( $arg eq '--high-conf-gls' ) { $$opts{high_confidence_gls}=shift(@ARGV); next; } if ( $arg eq '--INFO' ) { # --INFO IMP2 1 (calculate errors by second value of INFO/IMP2 my $infoTag = shift(@ARGV); unshift @{$$opts{INFOgroup}}, $infoTag; if ($ARGV[0] =~ /^\d+$/ ) { $$opts{INFOgroupIdx}{$infoTag} = shift(@ARGV); } next; } if ( $arg eq '--error-by-gl' ) { $$opts{err_by_gl}=1; next; } if ( $arg eq '-a' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; } if ( $arg eq '-m' || $arg eq '--name-mapping' ) { $$opts{mappings_list}=shift(@ARGV); next; } if ( $arg eq '-R' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--chromosomes' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-g' || $arg eq '--cmp-genotypes' ) { $$opts{cmp_genotypes}=1; next; } if ( $arg eq '-s' || $arg eq '--samples' ) { my $samples = shift(@ARGV); my @samples = ( -e $samples ) ? read_list($samples) : split(/,/,$samples); $$opts{samples} = \@samples; next; } if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}++; next; } if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--plot' ) { $$opts{plot}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--title' ) { $$opts{title}=shift(@ARGV); next; } if ( -e $arg ) { push @{$$opts{files}}, $arg; next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{files}) ) { error("What files should be compared?\n") } return $opts; } sub read_list { my ($fname) = @_; my @regions; if ( -e $fname ) { open(my $rgs,'<',$fname) or error("$fname: $!"); while (my $line=<$rgs>) { chomp($line); push @regions, $line; } close($rgs); } else { @regions = split(/,/,$fname); } return (@regions); } sub read_mappings_list { my ($fname,$files) = @_; my @maps = read_list($fname); my %mapping; for my $map (@maps) { my @items = split(/:/,$map); if ( scalar @items != scalar @$files ) { error(sprintf "Expected %d column names, found [$map].\n", scalar @$files); } for (my $i=1; $i<@$files; $i++) { $mapping{$$files[$i]}{$items[$i]} = $items[0]; warn("Using column name '$items[0]' for $$files[$i]:$items[$i]\n"); } } return \%mapping; } sub compare_vcfs { my ($opts) = @_; $$opts{match} = {}; $$opts{hapls} = {}; # Open the VCF files and initialize the list of chromosomes my @vcfs; my (@regions,%has_chrom,$mappings); if ( exists($$opts{regions_list}) ) { @regions = read_list($$opts{regions_list}); } if ( exists($$opts{mappings_list}) ) { $mappings = read_mappings_list($$opts{mappings_list},$$opts{files}); } print "# This file was generated by vcf-compare.\n"; print "# The command line was: ", join(' ',@{$$opts{args}}), "\n"; print "#\n"; if ( $$opts{debug} ) { print "#SD Site discordance. Use `grep ^SD | cut -f 2-` to extract this part.\n", "#SD The columns are: \n", "#SD 1 .. chromosome\n", "#SD 2 .. position\n", "#SD 3 .. number of Hom_RR matches\n", "#SD 4 .. number of Het_RA matches\n", "#SD 5 .. number of Hom_AA matches\n", "#SD 6 .. number of Hom_RR mismatches\n", "#SD 7 .. number of Het_RA mismatches\n", "#SD 8 .. number of Hom_AA mismatches\n", "#SD 9 .. site's non-reference discordance rate\n"; print "#AM ALT mismatches. The columns are:\n", "#AM 1 .. chromosome\n", "#AM 2 .. 
position\n", "#AM 3 .. ALT in the first file\n", "#AM 4 .. differing ALT\n"; print "#RM REF mismatches. The columns are:\n", "#RM 1 .. chromosome\n", "#RM 2 .. position\n", "#RM 3 .. REF in the first file\n", "#RM 4 .. differing REF\n"; } my $ifile = 0; for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $$vcf{vcf_compare_ID} = $ifile++; $vcf->parse_header(); $vcf->close(); $$vcf{nread} = 0; push @vcfs, $vcf; # Update the list of known chromosomes if ( !exists($$opts{regions_list}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @regions, $chr; } } # Check if column names need to be renamed if ( defined $mappings && exists($$mappings{$$vcf{file}}) ) { $$vcf{_col_mapping} = $$mappings{$$vcf{file}}; for my $name (keys %{$$vcf{_col_mapping}}) { if ( !exists($$vcf{has_column}{$name}) ) { error("No such column [$name] in the file $$vcf{file}\n"); } my $new_name = $$vcf{_col_mapping}{$name}; $$vcf{_col_mapping_rev}{$new_name} = $name; } } } # Include only matching samples in haplotype comparison if ( $$opts{cmp_genotypes} ) { my %all_samples; for my $vcf (@vcfs) { if ( exists $$opts{samples} ) { for my $sample (@{$$opts{samples}}) { if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; } if ( exists($$vcf{has_column}{$sample}) ) { $all_samples{$sample}++ } } } else { my @samples = $vcf->get_samples(); for my $sample (@samples) { if ( exists($$vcf{_col_mapping}) && exists($$vcf{_col_mapping}{$sample}) ) { $sample = $$vcf{_col_mapping}{$sample}; } $all_samples{$sample}++ } } } my @include_samples; while (my ($sample,$count)=each %all_samples) { if ( $count != scalar @vcfs ) { next; } push @include_samples, $sample; } if ( !@include_samples ) { error("Error: There is no overlap between any of the samples, yet haplotype comparison was requested.\n"); } $$opts{gt_samples_compared} = scalar @include_samples; for my 
$vcf (@vcfs) { my @include; if ( !exists($$vcf{_col_mapping}) ) { @include=@include_samples; } else { for my $sample (@include_samples) { push @include, exists($$vcf{_col_mapping_rev}{$sample}) ? $$vcf{_col_mapping_rev}{$sample} : $sample } } $vcf->set_samples(include=>\@include); } } # Go through all the files simultaneously and get the stats. for my $region (@regions) { # Open files for my $vcf (@vcfs) { delete($$vcf{last_line}); $vcf->open(region=>$region); delete($$vcf{eof}); } do_region_stats($opts,\@vcfs); } report_stats($opts,\@vcfs); for my $vcf (@vcfs) { if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); } } } sub report_stats { my ($opts,$vcfs) = @_; # if ( $$opts{debug} ) # { # use Data::Dumper; print Dumper($opts); # } my (@counts,%totals); while (my ($key,$num) = each %{$$opts{match}}) { my @files = split(/'/,$key); for my $file (@files) { $totals{$file} += $num; } push @counts, {count=>$num, files=>[@files]}; } print "#VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part.\n", "#VN The columns are: \n", "#VN 1 .. number of sites unique to this particular combination of files\n", "#VN 2- .. combination of files and space-separated number, a fraction of sites in the file\n"; for my $rec (sort {$$a{count}<=>$$b{count}} @counts) { my $num = $$rec{count}; my $files = $$rec{files}; print "VN\t$num"; for my $file (@$files) { printf "\t$file (%.1f%%)", $num*100.0/$totals{$file}; } print "\n"; } if ( $$opts{refseq} && $$opts{indels} ) { print "#IN Indel Numbers. Use `grep ^IN | cut -f 2-` to extract this part.\n", "#IN .. todo\n", "#IN Number of matching indel haplotypes shared across:\n"; while (my ($file,$stat) = each %{$$opts{indels}}) { print "IN\t$file\n"; my $match = $$stat{match} ? $$stat{match} : 0; my $mismatch = $$stat{mismatch} ? 
$$stat{mismatch} : 0; printf "\t\tNumber of matches: %d\n", $match; printf "\t\t mismatches: %d\n", $mismatch; printf "\t\t error rate: %.1f%%\n", 100*$mismatch/($match+$mismatch); } } print "#SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"; printf "SN\tNumber of REF matches:\t%d\n", exists($$opts{ref_match}) ? $$opts{ref_match} : 0; printf "SN\tNumber of ALT matches:\t%d\n", exists($$opts{alt_match}) ? $$opts{alt_match} : 0; printf "SN\tNumber of REF mismatches:\t%d\n", exists($$opts{ref_mismatch}) ? $$opts{ref_mismatch} : 0; printf "SN\tNumber of ALT mismatches:\t%d\n", exists($$opts{alt_mismatch}) ? $$opts{alt_mismatch} : 0; printf "SN\tNumber of samples in GT comparison:\t%d\n", $$opts{gt_samples_compared} ? $$opts{gt_samples_compared} : 0; my $out; for my $vcf (@$vcfs) { if ( !exists($totals{$$vcf{file}}) ) { $totals{$$vcf{file}}=0; } if ( $totals{$$vcf{file}} == $$vcf{nread} ) { next; } my $diff = $$vcf{nread}-$totals{$$vcf{file}}; my $reported = $totals{$$vcf{file}}; my $total = $$vcf{nread}; $out .= sprintf "SN\tNumber of lost sites:\t%d\t%.1f%%\t%d\t%d\t%s\n", $diff,$diff*100.0/$total,$total,$reported,$$vcf{file}; } if ( $out ) { print "# Number of sites lost due to grouping (e.g. 
duplicate sites): lost, %lost, read, reported, file\n"; print $out; } if ( !$$opts{cmp_genotypes} ) { return; } my %summary; for my $id (keys %{$$opts{hapls}}) { for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { if ( !exists($$opts{hapls}{$id}{$key.'gtype_mismatch'}) ) { $$opts{hapls}{$id}{$key.'gtype_mismatch'}=0; } $$opts{hapls}{$id}{total_gtype_mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_match'}) ) { $$opts{hapls}{$id}{$key.'gtype_match'}=0; } $$opts{hapls}{$id}{total_gtype_match} += $$opts{hapls}{$id}{$key.'gtype_match'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_lost'}) ) { $$opts{hapls}{$id}{$key.'gtype_lost'}=0; } $$opts{hapls}{$id}{total_gtype_lost} += $$opts{hapls}{$id}{$key.'gtype_lost'}; if ( !exists($$opts{hapls}{$id}{$key.'gtype_gained'}) ) { $$opts{hapls}{$id}{$key.'gtype_gained'}=0; } $$opts{hapls}{$id}{total_gtype_gained} += $$opts{hapls}{$id}{$key.'gtype_gained'}; $summary{$key}{match} += $$opts{hapls}{$id}{$key.'gtype_match'}; $summary{$key}{mismatch} += $$opts{hapls}{$id}{$key.'gtype_mismatch'}; } for my $key (qw(het_RA_ het_AA_)) { if ( !exists($$opts{hapls}{$id}{$key.'phase_match'}) ) { $$opts{hapls}{$id}{$key.'phase_match'}=0; } $$opts{hapls}{$id}{total_phase_match} += $$opts{hapls}{$id}{$key.'phase_match'}; if ( !exists($$opts{hapls}{$id}{$key.'phase_mismatch'}) ) { $$opts{hapls}{$id}{$key.'phase_mismatch'}=0; } $$opts{hapls}{$id}{total_phase_mismatch} += $$opts{hapls}{$id}{$key.'phase_mismatch'}; if ( !exists($$opts{hapls}{$id}{$key.'phase_lost'}) ) { $$opts{hapls}{$id}{$key.'phase_lost'}=0; } $$opts{hapls}{$id}{total_phase_lost} += $$opts{hapls}{$id}{$key.'phase_lost'}; } } print "#GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part.\n", "#GS The columns are:\n", "#GS 1 .. variant type\n", "#GS 2 .. number of mismatches\n", "#GS 3 .. number of matches\n", "#GS 4 .. discordance\n"; print_gs($opts,\%summary); print "\n", "#GC Genotype Comparison. 
Use `grep ^GC | cut -f 2-` to extract this part.\n", "#GC The columns are:\n", "#GC 1 .. Sample\n", "#GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA \n", "#GC 7-9 .. Gtype lost: total het_RA het_AA \n", "#GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA \n", "#GC 15-17 .. Phase lost: total het_RA het_AA \n", "#GC 18 .. Phase gained\n", "#GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA \n", "#GC 24 .. Phased matches: het_RA \n", "#GC 25 .. Misphased matches: het_RA \n"; for my $id (keys %{$$opts{hapls}}) { print "GC\t$id"; for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_mismatch'}; } for my $key (qw(total_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_lost'}; } for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_gained'}; } for my $key (qw(total_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_lost'}; } if ( !exists($$opts{hapls}{$id}{phase_gained}) ) { $$opts{hapls}{$id}{phase_gained}=0; } print "\t",$$opts{hapls}{$id}{phase_gained}; for my $key (qw(total_ hom_RR_ hom_AA_ het_RA_ het_AA_)) { print "\t",$$opts{hapls}{$id}{$key.'gtype_match'}; } for my $key (qw(het_RA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_match'}; } for my $key (qw(het_RA_)) { print "\t",$$opts{hapls}{$id}{$key.'phase_mismatch'}; } print "\n"; } print "#AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part.\n", "#AF The columns are:\n", "#AF 1 .. Non-ref allele count\n", "#AF 2 .. Hom(RR) matches\n", "#AF 3 .. Het(RA) matches\n", "#AF 4 .. Hom(AA) matches\n", "#AF 5 .. Het(AA) matches\n", "#AF 6 .. Hom(RR) mismatches\n", "#AF 7 .. Het(RA) mismatches\n", "#AF 8 .. Hom(AA) mismatches\n", "#AF 9 .. 
Het(AA) mismatches\n"; for my $ac (sort {$a<=>$b} keys %{$$opts{counts_by_af}}) { print "AF\t$ac"; for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_af}{$ac}{$key}{matches} ? $$opts{counts_by_af}{$ac}{$key}{matches} : 0; } for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_af}{$ac}{$key}{mismatches} ? $$opts{counts_by_af}{$ac}{$key}{mismatches} : 0; } print "\n"; } for my $infoTag ( @{$$opts{INFOgroup}} ) { print "#INFO/".$infoTag." Number of matching and mismatching genotypes vs INFO/". $infoTag. (exists($$opts{INFOgroupIdx}{$infoTag}) ? "[".$$opts{INFOgroupIdx}{$infoTag}."]" : ""). ". Use `^INFO/".$infoTag." | cut -f 2-` to extract this part.\n", "#INFO/".$infoTag." The columns are:\n", "#INFO/".$infoTag." 1 .. INFO/". $infoTag. (exists($$opts{INFOgroupIdx}{$infoTag}) ? "[".$$opts{INFOgroupIdx}{$infoTag}."]\n" : "\n"), "#INFO/".$infoTag." 2 .. Hom(RR) matches\n", "#INFO/".$infoTag." 3 .. Het(RA) matches\n", "#INFO/".$infoTag." 4 .. Hom(AA) matches\n", "#INFO/".$infoTag." 5 .. Het(AA) matches\n", "#INFO/".$infoTag." 6 .. Hom(RR) mismatches\n", "#INFO/".$infoTag." 7 .. Het(RA) mismatches\n", "#INFO/".$infoTag." 8 .. Hom(AA) mismatches\n", "#INFO/".$infoTag." 9 .. Het(AA) mismatches\n", "#INFO/".$infoTag." 10 .. Non-reference Discordance Rate\n"; for my $info (sort {$a<=>$b} keys %{$$opts{counts_by_INFO}{$infoTag}}) { print "INFO/".$infoTag."\t$info"; my $nonRefMatches=-$$opts{counts_by_INFO}{$infoTag}{$info}{"hom_RR_"}{matches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{"hom_RR_"}{matches} : 0; my $mismatches=0; for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} : 0; $nonRefMatches += $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} ? 
$$opts{counts_by_INFO}{$infoTag}{$info}{$key}{matches} : 0; } for my $key (qw(hom_RR_ het_RA_ hom_AA_ het_AA_)) { print "\t", $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} : 0; $mismatches += $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} ? $$opts{counts_by_INFO}{$infoTag}{$info}{$key}{mismatches} : 0; } printf "\t%.2f\n",$mismatches*100.0/($mismatches+$nonRefMatches); } } print "#DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part.\n"; print "#DP The columns are:\n"; print "#DP 1 .. depth\n"; print "#DP 2 .. RR matches\n"; print "#DP 3 .. RA matches\n"; print "#DP 4 .. AA matches\n"; print "#DP 5 .. RR -> RA mismatches\n"; print "#DP 6 .. RR -> AA mismatches\n"; print "#DP 7 .. RA -> RR mismatches\n"; print "#DP 8 .. RA -> AA mismatches\n"; print "#DP 9 .. AA -> RR mismatches\n"; print "#DP 10 .. AA -> RA mismatches\n"; for my $dp (sort {$a<=>$b} keys %{$$opts{counts_by_dp}}) { print "DP\t$dp"; for my $type (qw(hom_RR_-hom_RR_ het_RA_-het_RA_ hom_AA_-hom_AA_ hom_RR_-het_RA_ hom_RR_-hom_AA_ het_RA_-hom_RR_ het_RA_-hom_AA_ hom_AA_-hom_RR_ hom_AA_-het_RA_)) { printf "\t%d", exists($$opts{counts_by_dp}{$dp}{$type}) ? $$opts{counts_by_dp}{$dp}{$type} : 0; } print "\n"; } if ( exists($$opts{counts_by_gl}) ) { print "#EQ Errors by quality. Use `grep ^EQ | cut -f 2-` to extract this part.\n"; print "#EQ The columns are:\n"; print "#EQ 1 .. GL\n"; print "#EQ 2 .. number of matches\n"; print "#EQ 3 .. 
number of mismatches\n"; for my $qual (sort {$a<=>$b} keys %{$$opts{counts_by_gl}}) { printf "EQ\t%s\t%d\t%d\n", $qual, $$opts{counts_by_gl}{$qual}{match}?$$opts{counts_by_gl}{$qual}{match}:0, $$opts{counts_by_gl}{$qual}{mismatch}?$$opts{counts_by_gl}{$qual}{mismatch}:0; } } if ( $$opts{debug} ) { print "#MT Mismatch Types\n"; for my $t1 (keys %{$$opts{mismatch_types}}) { for my $t2 (keys %{$$opts{mismatch_types}{$t1}}) { print "MT\t$t1\t$t2\t$$opts{mismatch_types}{$t1}{$t2}\n"; } } } } sub print_gs { my ($opts,$stats) = @_; my ($ndr_ms,$ndr_m,@summary); for my $key (qw(hom_RR het_RA hom_AA het_AA)) { my $m = $$stats{"${key}_"}{match}; my $ms = $$stats{"${key}_"}{mismatch}; if ( !$m ) { $m=0; } if ( !$ms ) { $ms=0; } my $err = $m?$ms*100.0/($m+$ms):0; printf "GS\t$key\t%d\t%d\t%.2f%%\n", $ms,$m,$err; $ndr_ms += $ms; $ndr_m += $key eq 'hom_RR' ? 0 : $m; if ( $key eq 'het_AA' ) { next; } if ( $key=~/_(.+)$/ ) { push @summary, sprintf "%s %.2f", $1,$err; } } my $err = $ndr_m+$ndr_ms ? $ndr_ms*100.0/($ndr_m+$ndr_ms) : 0; unshift @summary, sprintf "NDR %.2f", $err; printf "SN\tNon-reference Discordance Rate (NDR):\t%.2f\n", $err; print "SN\tSummary:\t", join(', ', @summary), "\n"; } sub read_stats { my ($stats,$file) = @_; open(my $fh,'<',$file) or error("$file: $!"); while (my $line=<$fh>) { if ( $line=~/^#/ ) { next; } my @items = split(/\t/,$line); chomp($items[-1]); if ( $items[0] eq 'DP' ) { my $dp = $items[1]; $$stats{dp}{ndist}{$dp} += $items[2] + $items[3] + $items[4] + $items[5] + $items[6] + $items[7] + $items[8] + $items[9] + $items[10]; $$stats{dp}{RR}{RR}{$dp} += $items[2]; $$stats{dp}{RA}{RA}{$dp} += $items[3]; $$stats{dp}{AA}{AA}{$dp} += $items[4]; $$stats{dp}{RR}{RA}{$dp} += $items[5]; $$stats{dp}{n}{RR}{RA} += $items[5]; $$stats{dp}{RR}{AA}{$dp} += $items[6]; $$stats{dp}{n}{RR}{AA} += $items[6]; $$stats{dp}{RA}{RR}{$dp} += $items[7]; $$stats{dp}{n}{RA}{RR} += $items[7]; $$stats{dp}{RA}{AA}{$dp} += $items[8]; $$stats{dp}{n}{RA}{AA} += $items[8]; 
$$stats{dp}{AA}{RR}{$dp} += $items[9]; $$stats{dp}{n}{AA}{RR} += $items[9]; $$stats{dp}{AA}{RA}{$dp} += $items[10]; $$stats{dp}{n}{AA}{RA} += $items[10]; } elsif ( $items[0] eq 'AF' ) { my $af = $items[1]; $$stats{af}{RR}{$af}{matches} += $items[2]; $$stats{af}{RA}{$af}{matches} += $items[3]; $$stats{af}{AA}{$af}{matches} += $items[4]; $$stats{af}{RR}{$af}{mismatches} += $items[6]; $$stats{af}{RA}{$af}{mismatches} += $items[7]; $$stats{af}{AA}{$af}{mismatches} += $items[8]; } elsif ( $items[0] eq 'GS' ) { my $type = $items[1]; $$stats{gs}{$type.'_'}{mismatch} += $items[2]; $$stats{gs}{$type.'_'}{match} += $items[3]; } elsif ( $items[0] eq 'EQ' ) { my $gl = $items[1]; $$stats{counts_by_gl}{$gl}{mismatch} += $items[2]; $$stats{counts_by_gl}{$gl}{match} += $items[3]; } } close($fh); } sub make_dir { my ($prefix) = @_; if ( $prefix=~m{/} ) { # A directory should be created. This will populate dir and prefix, for example # prefix -> dir prefix # ---------------------------- # out out.dump # out/ out/ out/out.dump # out/xxx out/ out/xxx.dump # my $dir = ''; if ( $prefix=~m{/[^/]+$} ) { $dir=$`; } elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; } elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; } if ( $dir ) { `mkdir -p $dir`; } } return $prefix; } sub plot_stats { my ($opts) = @_; my $stats = {}; for my $file (@{$$opts{files}}) { read_stats($stats,$file); plot_site_ndr($opts,$file); } make_dir($$opts{plot}); plot_dp($opts,$$stats{dp}); plot_af($opts,$$stats{af}); plot_ndr($opts,$$stats{af}); plot_dp_ndr($opts,$$stats{dp}); plot_eq($opts,$$stats{counts_by_gl}); print_gs($opts,$$stats{gs}); } sub plot { my ($file) = @_; system("GDFONTPATH=/usr/share/fonts/truetype/ttf-dejavu/ gnuplot $file"); } sub plot_site_ndr { my ($opts,$file) = @_; my ($fname,$gp,$start_chr,@counts); my $start_pos = -1; my $count = 0; my $numerator = 0; my $denominator = 0; open(my $fh,'<',$file) or error("$file: $!"); while (my $line=<$fh>) { if ( 
!($line=~/^SD/) ) { next; } my @items = split(/\t/,$line); my $chr = $items[1]; my $pos = $items[2]; if ( !defined $gp ) { $fname = "$$opts{plot}-ndr-$chr-$pos.gp"; open($gp,'>',$fname) or error("$fname: $!"); print $gp q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-ndr-$chr-$pos.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Alternate allele frequency" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set y2tics set xtic rotate by -45 plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; } if ( $start_pos==-1 ) { $start_pos = $pos; $start_chr = $chr; } $numerator += $items[6] + $items[7] + $items[8]; $denominator += $items[6] + $items[7] + $items[8] + $items[4] + $items[5]; $count += $denominator; if ( $start_pos+50_000 > $pos && $start_chr eq $chr ) { next; } printf $gp "$start_pos\t%.2f\n", $denominator ? $numerator*100.0/$denominator : 0; push @counts, "$start_pos\t$count\n"; $numerator = 0; $denominator = 0; $count = 0; $start_pos = $pos; $start_chr = $chr; } close($fh); # Was the ^SD section found? if ( !@counts ) { return; } push @counts, "$start_pos\t$count\n"; printf $gp "$start_pos\t%.2f\n", $denominator ? 
$numerator*100.0/$denominator : 0; print $gp "end\n"; print $gp join('',@counts), "end\n"; close($gp); plot("$fname"); } sub plot_dp_ndr { my ($opts,$stats) = @_; my ($numerator,$denominator); for my $agt (keys %$stats) { if ( $agt eq 'n' or $agt eq 'ndist' ) { next; } for my $bgt (keys %{$$stats{$agt}}) { if ( $bgt eq 'n' ) { next; } if ( $agt eq 'RR' && $bgt eq 'RR' ) { next; } for my $dp (keys %{$$stats{$agt}{$bgt}}) { if ( $agt ne $bgt ) { $$numerator{$dp} += $$stats{$agt}{$bgt}{$dp}; $$denominator{$dp} += $$stats{$agt}{$bgt}{$dp}; } else { $$denominator{$dp} += $$stats{$agt}{$bgt}{$dp}; } } } } open(my $fh,'>',"$$opts{plot}-dp-ndr.gp") or error("$$opts{plot}-dp-ndr.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 600,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-dp-ndr.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Depth" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set y2tics plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; for my $dp (sort {$a<=>$b} keys %$denominator) { printf $fh "%d\t%.2f\n", $dp,$$denominator{$dp} ? $$numerator{$dp}*100.0/$$denominator{$dp} : 0; } print $fh "end\n"; for my $dp (sort {$a<=>$b} keys %$denominator) { printf $fh "%d\t%d\n", $dp,$$denominator{$dp}; } print $fh "end\n"; close($fh); plot("$$opts{plot}-dp-ndr.gp"); } sub plot_dp { my ($opts,$stats) = @_; my $out; my @plots; for my $agt (sort keys %$stats) { if ( $agt eq 'n' or $agt eq 'ndist' ) { next; } for my $bgt (sort keys %{$$stats{$agt}}) { if ( $bgt eq 'n' ) { next; } if ( $agt eq $bgt ) { next; } for my $dp (sort {$a<=>$b} keys %{$$stats{$agt}{$bgt}}) { $out .= $dp . "\t" . ($$stats{n}{$agt}{$bgt} ? $$stats{$agt}{$bgt}{$dp}*100.0/$$stats{n}{$agt}{$bgt} : 0) . 
"\n"; } $out .= "end\n"; push @plots, qq["-" using 1:2 with linespoints pt 12 title "$agt -> $bgt"]; } } open(my $fh,'>',"$$opts{plot}-dp.gp") or error("$$opts{plot}-dp.gp: $!"); print $fh q[ set terminal png size 600,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-dp.png" . q[" set ylabel 'Fraction of GTs [%]' set y2label 'Number of GTs total' set y2tics set ytics nomirror set xlabel 'Depth' set xrange [:20] ]; if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh "plot ", join(',',@plots), qq[, '-' using 1:2 axes x1y2 with lines lt 0 title "GTs total"\n]; print $fh $out; for my $dp (sort {$a<=>$b} keys %{$$stats{ndist}}) { print $fh "$dp\t$$stats{ndist}{$dp}\n"; } print $fh "end\n"; close($fh); plot("$$opts{plot}-dp.gp"); } sub plot_af { my ($opts,$stats) = @_; open(my $fh,'>',"$$opts{plot}-af.gp") or error("$$opts{plot}-af.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-af.png" . q[" set grid back lc rgb "#dddddd" set xlabel "Non-reference allele frequency" set ylabel "Concordance" set y2label "Number of genotypes" set yrange [0.0:1.0] set y2tics set key center plot '-' axes x1y2 with lines lw 1 lc rgb "red" notitle, \ '-' axes x1y2 with lines lw 1 lc rgb "green" notitle, \ '-' axes x1y2 with lines lw 1 lc rgb "blue" notitle, \ '-' with points pt 20 lc rgb "red" title "HomRef", \ '-' with points pt 20 lc rgb "green" title "Het", \ '-' with points pt 20 lc rgb "blue" title "HomAlt" ]; for my $type (qw(RR RA AA)) { for my $af (sort {$a<=>$b} keys %{$$stats{$type}}) { print $fh "$af\t" . ($$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}) . "\n"; } print $fh "end\n"; } for my $type (qw(RR RA AA)) { for my $af (sort {$a<=>$b} keys %{$$stats{$type}}) { my $n = $$stats{$type}{$af}{matches}+$$stats{$type}{$af}{mismatches}; print $fh "$af\t" . ($n ? 
1-$$stats{$type}{$af}{mismatches}/$n : -1) . "\n"; } print $fh "end\n"; } close($fh); plot("$$opts{plot}-af.gp"); } sub plot_ndr { my ($opts,$stats) = @_; open(my $fh,'>',"$$opts{plot}-ndr.gp") or error("$$opts{plot}-ndr.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . "$$opts{plot}-ndr.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "Alternate allele frequency" set ylabel "Non-reference Discordance Rate" set y2label "Number of genotypes" set xrange [0.0:1.0] set y2tics plot '-' with lines lw 1 title "NDR", \ '-' axes x1y2 with lines lw 1 title "GTs" ]; my $afs; for my $type (qw(RA AA)) { for my $af (keys %{$$stats{$type}}) { $$afs{$af}{m} += $$stats{$type}{$af}{matches}; $$afs{$af}{mi} += $$stats{$type}{$af}{mismatches}; } } for my $type (qw(RR)) { for my $af (keys %{$$stats{$type}}) { $$afs{$af}{mi} += $$stats{$type}{$af}{mismatches}; } } my @afs = sort { $a<=>$b } keys %$afs; my $iafs = 0; my $bin_size = 0.02; my @dp; for (my $i=0; $i<=1/$bin_size; $i++) { my $from = $i*$bin_size; my $to = ($i+1)*$bin_size; my ($m,$mi,$af) = (0,0,0); while ( $iafs<@afs && $afs[$iafs]>=$from && $afs[$iafs]<$to ) { $af = $afs[$iafs]; $m += $$afs{$af}{m}; $mi += $$afs{$af}{mi}; $iafs++; } if ( !($m+$mi) ) { next; } printf $fh "$af\t%.2f\n", $m+$mi ? $mi*100.0/($m+$mi) : 0; push @dp, sprintf "$af\t%d",$m+$mi; } print $fh "end\n"; print $fh join("\n",@dp), "\nend\n"; close($fh); plot("$$opts{plot}-ndr.gp"); } sub plot_eq { my ($opts,$stats) = @_; if ( !scalar keys %$stats ) { return; } open(my $fh,'>',"$$opts{plot}-eq.gp") or error("$$opts{plot}-eq.gp: $!"); if ( exists($$opts{title}) ) { print $fh qq[set title "$$opts{title}"\n]; } print $fh q[ set terminal png size 550,400 truecolor font "DejaVuSansMono,9" set output "] . 
"$$opts{plot}-eq.png" . q[" set style line 1 linecolor rgb "#ff4400" set style line 2 linecolor rgb "#0084ff" set style increment user set grid back lc rgb "#dddddd" set xlabel "GL" set ylabel "Number of matches (log)" set y2label "Number of mismatches (log)" set y2tics set ytics nomirror set log y set log y2 plot '-' with lines lw 1 title "Matches", \ '-' axes x1y2 with lines lw 1 title "Mismatches" ]; for my $gl (sort {$a<=>$b} keys %$stats) { print $fh "$gl\t$$stats{$gl}{match}\n"; } print $fh "end\n"; for my $gl (sort {$a<=>$b} keys %$stats) { print $fh "$gl\t$$stats{$gl}{mismatch}\n"; } print $fh "end\n"; close($fh); plot("$$opts{plot}-eq.gp"); } sub do_region_stats { my ($opts,$vcfs) = @_; my $refseq; if ( $$opts{refseq} ) { $refseq = FaSlice->new(file=>$$opts{refseq}, size=>1_000_000); } my $nvcfs = scalar @$vcfs; my $debug = $$opts{debug} ? $$opts{debug} : 0; my $match = $$opts{match}; my $win = $$opts{win} ? $$opts{win} : 0; while (1) { my $grp = read_next_group($opts,$vcfs,$win); if ( !$grp || !scalar @$grp ) { last } if ( $debug>1 ) { print "Group:\n"; for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; } print "\n"; } my %files; for my $rec (@$grp) { $files{$$rec{vcf}{file}} = 1; } my $key = join(q['],sort(keys %files)); $$match{$key}++; my $npresent = scalar keys %files; if ( $npresent == $nvcfs ) { ref_alt_stats($opts,$grp); } if ( $npresent>1 && defined $refseq ) { cmp_sequence($opts,$grp,$refseq); } if ( $$opts{cmp_genotypes} ) { # Check that in the group there is one record for each file if ( $npresent==$nvcfs && scalar @$grp==$nvcfs ) { cmp_genotypes($opts,$grp); } } } } sub cmp_sequence { my ($opts,$grp,$fa_refseq) = @_; # Detailed comparison will be performed only if there are indels or complex # substitutions, SNPs are interesting only in their presence. There can be # more events from the same file present simultaneously and at multiple # positions. 
They all are treated as separate variants and if any of them # yields a haplotype present in all files, match is reported. # Note that the original version of the code expected all alternate # variants to be present on a single VCF line and was able to compare # consecutive non-overlapping events as one sequence. However, because the # the major producer of indel calls (Dindel) does report one variant per # line, this idea was abandoned. # Check if there are any interesting events. my %has_indels; my %events_per_file; my $vcf = $$grp[0]{vcf}; for (my $igrp=0; $igrp<@$grp; $igrp++) { my $rec = $$grp[$igrp]; my $ifile = $$rec{vcf}{vcf_compare_ID}; my $ref_len = length($$rec{ref}); my @alts = split(/,/,$$rec{alt}); for my $alt (@alts) { if ( $alt eq '.' ) { next; } if ( $alt=~/^$$rec{pos}, alt=>$alt, ref_len=>$ref_len }; # Do complex checking of event type only if it is still not certain if this is waste of time or not if ( exists($has_indels{$ifile}) ) { next; } if ( $ref_len!=$alt_len ) { $has_indels{$ifile} = $$rec{vcf}{file}; } elsif ( $ref_len>1 ) { my ($type,$len,$ht) = $vcf->event_type($$rec{ref},$alt); if ( $type eq 'o' ) { $has_indels{$ifile} = $$rec{vcf}{file}; } } } } # Return if there is nothing interesting if ( scalar keys %has_indels < 2 ) { return; } for my $ifile (keys %events_per_file) { if ( !exists($has_indels{$ifile}) ) { delete($events_per_file{$ifile}); } } # Cache the reference sequence chunk my $ref_from = $$grp[0]{pos} - $$opts{win}; my $ref_to = $$grp[-1]{pos} + $$opts{win}; my $refseq = $fa_refseq->get_slice($$grp[0]{chr},$ref_from,$ref_to); # For each file get all possible sequences for my $events (values %events_per_file) { for my $variant (@$events) { my $pos = $$variant{pos}; my $len = $pos - $ref_from; my $seq = $len>0 ? 
substr($refseq,0,$len) : ''; $seq .= $$variant{alt}; $pos += $$variant{ref_len}; if ( $pos<=$ref_to ) { $seq .= substr($refseq,$pos-$ref_from); } $$variant{seq} = $seq; $$variant{length} = length($seq); } } # Now compare the variants: is there a sequence shared across all files? my $match = 1; my @keys = keys %events_per_file; for (my $ikey=0; $ikey<@keys; $ikey++) { my $ivars = $events_per_file{$ikey}; for (my $jkey=0; $jkey<$ikey; $jkey++) { my $jvars = $events_per_file{$jkey}; my $found = 0; for my $ivar (@$ivars) { for my $jvar (@$jvars) { if ( $$ivar{length} != $$jvar{length} ) { next; } if ( $$ivar{seq} ne $$jvar{seq} ) { next; } $found=1; last; } } if ( !$found ) { $match=0; last; } } if ( !$match ) { last; } } my $key = join(q['],sort(values %has_indels)); if ( $match ) { $$opts{indels}{$key}{match}++; } else { $$opts{indels}{$key}{mismatch}++; } } sub ref_alt_stats { my ($opts,$grp) = @_; my $ref = $$grp[0]{ref}; my $alt = join(',',sort split(/,/,$$grp[0]{alt})); my $alt_mismatch; for (my $i=1; $i<@$grp; $i++) { my $rec = $$grp[$i]; if ( $ref ne $$rec{ref} ) { $$opts{ref_mismatch}++; if ( $$opts{debug} ) { print "RM\t$$grp[0]{chr}\t$$grp[0]{pos}\t$$grp[0]{ref}\t$$rec{ref}\n"; } return; } my $tmp = join(',',sort split(/,/,$$rec{alt})); if ( $alt ne $tmp ) { $alt_mismatch = $tmp; } } if ( $alt ne '.' 
) { if ( defined $alt_mismatch ) { $$opts{alt_mismatch}++; if ( $$opts{debug} ) { print "AM\t$$grp[0]{chr}\t$$grp[0]{pos}\t$alt\t$alt_mismatch\n"; } } else { $$opts{alt_match}++; } } $$opts{ref_match}++; } sub snp_type { my ($als,$ref) = @_; # Determine SNP type: hom(RR),het(RA),hom(AA) or het(AA) if ( $$als[0] eq $$als[1] ) { if ( $$als[0] eq $ref ) { return 'hom_RR_'; } else { return 'hom_AA_'; } } else { if ( $$als[0] eq $ref or $$als[1] eq $ref ) { return 'het_RA_'; } else { return 'het_AA_'; } } } sub cmp_genotypes { my ($opts,$grp) = @_; my $nrecs = @$grp; my $hapls = $$opts{hapls}; # Break the VCF lines into hashes (required by parse_haplotype) for my $grp_rec (@$grp) { $$grp_rec{rec} = $$grp_rec{vcf}->next_data_hash($$grp_rec{line}); if ( $$opts{ignore_indels} && exists($$grp_rec{rec}{INFO}{INDEL}) ) { return; } if ( exists($$grp_rec{vcf}{_col_mapping}) ) { my %new_cols; while (my ($name_ori,$name_new) = each %{$$grp_rec{vcf}{_col_mapping}}) { $new_cols{$name_new} = $$grp_rec{rec}{gtypes}{$name_ori}; delete($$grp_rec{rec}{gtypes}{$name_ori}); } while (my ($name,$hash) = each %new_cols) { $$grp_rec{rec}{gtypes}{$name} = $hash; } } } if ( $$grp[0]{vcf}{vcf_compare_ID} != 0 ) { error("FIXME: different order than expected: $$grp[0]{vcf}{vcf_compare_ID}\n"); } my $ref = $$grp[0]{rec}{REF}; my %gtype_matches = (); my %gtype_mismatches = (); my $min_dp; my $ndp3 = 0; for my $id (keys %{$$grp[0]{rec}{gtypes}}) { my (@sorted_als1,$nploid,$type,$max_gl); my ($als1,$seps1,$is_phased1,$is_empty1) = $$grp[0]{vcf}->parse_haplotype($$grp[0]{rec},$id); if ( !$is_empty1 ) { @sorted_als1 = sort @$als1; $nploid = scalar @sorted_als1; $type = snp_type($als1,$ref); } if ( exists($$opts{high_confidence_gls}) ) { my @gls = split(/,/,$$grp[1]{rec}{gtypes}{$id}{GL}); if ( @gls!=3 or $gls[0] eq '.' 
) { next; } @gls = sort {$b<=>$a} @gls; if ( abs($gls[0]-$gls[1])<$$opts{high_confidence_gls} ) { next; } } if ( exists($$opts{err_by_gl}) && exists($$grp[0]{rec}{gtypes}{$id}{GL}) ) { for my $gl (split(/,/,$$grp[0]{rec}{gtypes}{$id}{GL})) { if ( !defined $max_gl or $gl>$max_gl ) { $max_gl = $gl; } } } # There may be multiple files entering the comparison. Report match only if all are present and all match. # Report mismatch if all are present and they do not match. Otherwise report lost/gained event. my $phase_match = 1; my $phase_mismatch = 0; my $gtype_match = 1; my $gtype_lost = 0; my $gtype_gained = 0; my $phase_lost = 0; my $phase_gained = 0; my $type2; for (my $i=1; $i<$nrecs; $i++) { my ($als2,$seps2,$is_phased2,$is_empty2) = $$grp[$i]{vcf}->parse_haplotype($$grp[$i]{rec},$id); if ( $is_empty1 ) { $gtype_match = 0; if ( !$is_empty2 ) { $gtype_gained = 1; $type = snp_type($als2,$ref); } if ( !$is_phased1 && $is_phased2 ) { $phase_gained = 1; } last; } elsif ( $is_empty2 ) { $gtype_match = 0; $gtype_lost = 1; last; } if ( $is_phased1 ) { if ( !$is_phased2 ) { $phase_lost = 1; $phase_match = 0; } } elsif ( $is_phased2 ) { $phase_gained = 1; $phase_match = 0; } else { $phase_match = 0; } # Consider different number of alleles as mismatch (C vs C/C) if ( scalar @$als1 != scalar @$als2 ) { $gtype_match = 0; if ( $$opts{debug} ) { $$opts{mismatch_types}{$type}{'Allele_Count'}++ } last; } if ( exists($$opts{err_by_gl}) && exists($$grp[$i]{rec}{gtypes}{$id}{GL}) ) { for my $gl (split(/,/,$$grp[$i]{rec}{gtypes}{$id}{GL})) { if ( !defined $max_gl or $gl>$max_gl ) { $max_gl = $gl; } } } my @sorted_als2 = sort @$als2; for (my $ial=0; $ial<$nploid; $ial++) { if ( $sorted_als1[$ial] ne $sorted_als2[$ial] ) { $gtype_match = 0; if ( $$opts{debug} ) { my $type2 = snp_type($als2,$ref); $$opts{mismatch_types}{$type}{$type2}++; } last; } } if ( !$gtype_match ) { if ( !defined $type2 && !$is_empty2 ) { $type2 = snp_type($als2,$ref); } last; } # They match, check also if their 
phase agrees if ( $phase_match && $is_phased1 && $is_phased2 ) { for (my $ial=0; $ial<$nploid; $ial++) { if ( $$als1[$ial] ne $$als2[$ial] ) { $phase_mismatch=1; last; } } } } if ( $gtype_gained ) { $$hapls{$id}{$type.'gtype_gained'}++; if ( $phase_gained ) { $$hapls{$id}{phased_gtype_gained}++ } next; } if ( $gtype_lost ) { $$hapls{$id}{$type.'gtype_lost'}++; next; } if ( $phase_mismatch ) { $$hapls{$id}{$type.'phase_mismatch'}++; } if ( $phase_gained ) { $$hapls{$id}{phase_gained}++ } elsif ( $phase_lost ) { $$hapls{$id}{$type.'phase_lost'}++ } my $dp = exists($$grp[1]{rec}{gtypes}{$id}{DP}) ? $$grp[1]{rec}{gtypes}{$id}{DP} : -1; if ( $gtype_match ) { $$hapls{$id}{$type.'gtype_match'}++; if ( $phase_match ) { $$hapls{$id}{$type.'phase_match'}++ } $gtype_matches{$type}++; $$opts{counts_by_dp}{$dp}{$type.'-'.$type}++; if ( defined $max_gl ) { my $gl = sprintf "%.2f", $max_gl; $$opts{counts_by_gl}{$gl}{match}++; } } elsif ( defined $type ) { $$hapls{$id}{$type.'gtype_mismatch'}++; $gtype_mismatches{$type}++; $$opts{counts_by_dp}{$dp}{$type.'-'.$type2}++; if ( defined $max_gl ) { my $gl = sprintf "%.2f", $max_gl; $$opts{counts_by_gl}{$gl}{mismatch}++; } } } $$opts{hapls_ncmp}++; my %infoGroup; for my $infoTag ( @{$$opts{INFOgroup}} ) { if ( exists($$grp[1]{rec}{INFO}{$infoTag}) ) { if( exists($$opts{INFOgroupIdx}{$infoTag}) ) { my @arr = split(/,/,$$grp[1]{rec}{INFO}{$infoTag}); $infoGroup{$infoTag} = sprintf "%.2f", $arr[$$opts{INFOgroupIdx}{$infoTag}]; } else { $infoGroup{$infoTag} = sprintf "%.2f", $$grp[1]{rec}{INFO}{$infoTag}; } } } # Store the number of matching types by AC my $af; if ( $$opts{INFO_AF1_af} && exists($$grp[1]{rec}{INFO}{AF1}) ) { $af = sprintf "%.2f", $$grp[1]{rec}{INFO}{AF1}; } elsif ( !$$opts{all_samples_af} ) { my $ac = 0; my $an = 0; if ( exists($gtype_matches{hom_AA_}) ) { $ac += 2*$gtype_matches{hom_AA_}; $an += 2*$gtype_matches{hom_AA_}; } if ( exists($gtype_mismatches{hom_AA_}) ) { $ac += 2*$gtype_mismatches{hom_AA_}; $an += 
2*$gtype_mismatches{hom_AA_}; } if ( exists($gtype_matches{het_RA_}) ) { $ac += $gtype_matches{het_RA_}; $an += 2*$gtype_matches{het_RA_}; } if ( exists($gtype_mismatches{het_RA_}) ) { $ac += $gtype_mismatches{het_RA_}; $an += 2*$gtype_mismatches{het_RA_}; } if ( exists($gtype_matches{hom_RR_}) ) { $an += 2*$gtype_matches{hom_RR_}; } if ( exists($gtype_mismatches{hom_RR_}) ) { $an += 2*$gtype_mismatches{hom_RR_}; } $af = sprintf "%.2f", $an>0 ? $ac/$an : 0; } else { my ($an,$ac) = $$grp[0]{vcf}->calc_an_ac($$grp[0]{rec}{gtypes}); $af = sprintf "%.2f", $an>0 ? $ac/$an : 0; } for my $type (keys %gtype_matches) { for my $infoTag ( @{$$opts{INFOgroup}} ) { $$opts{counts_by_INFO}{$infoTag}{$infoGroup{$infoTag}}{$type}{matches} += $gtype_matches{$type}; } $$opts{counts_by_af}{$af}{$type}{matches} += $gtype_matches{$type}; $$opts{gtypes_cmp_total} += $gtype_matches{$type}; } for my $type (keys %gtype_mismatches) { for my $infoTag ( @{$$opts{INFOgroup}} ) { $$opts{counts_by_INFO}{$infoTag}{$infoGroup{$infoTag}}{$type}{mismatches} += $gtype_mismatches{$type}; } $$opts{counts_by_af}{$af}{$type}{mismatches} += $gtype_mismatches{$type}; $$opts{gtypes_cmp_total} += $gtype_mismatches{$type}; } if ( $$opts{debug} ) { my $hom_rr_mm = $gtype_mismatches{hom_RR_} ? $gtype_mismatches{hom_RR_} : 0; my $het_ra_mm = $gtype_mismatches{het_RA_} ? $gtype_mismatches{het_RA_} : 0; my $hom_aa_mm = $gtype_mismatches{hom_AA_} ? $gtype_mismatches{hom_AA_} : 0; my $hom_rr_m = $gtype_matches{hom_RR_} ? $gtype_matches{hom_RR_} : 0; my $het_ra_m = $gtype_matches{het_RA_} ? $gtype_matches{het_RA_} : 0; my $hom_aa_m = $gtype_matches{hom_AA_} ? $gtype_matches{hom_AA_} : 0; my $denom = $het_ra_m+$hom_aa_m+$hom_rr_mm+$het_ra_mm+$hom_aa_mm; my $ndr = sprintf "%.2f", $denom ? 
($hom_rr_mm+$het_ra_mm+$hom_aa_mm)*100.0/$denom : 0; print "SD\t$$grp[0]{rec}{CHROM}\t$$grp[0]{rec}{POS}\t$hom_rr_m\t$het_ra_m\t$hom_aa_m\t$hom_rr_mm\t$het_ra_mm\t$hom_aa_mm\t$ndr\n"; } } sub read_next_group { my ($opts,$vcfs,$win) = @_; my @grp; my $prev_vcf; my $start; while (1) { my $min_vcf = get_min_position($opts,$vcfs); if ( !$min_vcf ) { last; } if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; } $prev_vcf = $$min_vcf{buf}[0]; if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} ) { my $rec = shift(@{$$min_vcf{buf}}); push @grp,$rec; $start = $$rec{pos}; next; } } return \@grp; } sub get_min_position { my ($opts,$vcfs) = @_; my ($min_pos,$min_vcf); for my $vcf (@$vcfs) { # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($opts,$vcf); } if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; } my $line = $$vcf{buf}[0]; # Designate this position as the minimum of all the files if: # .. is this the first file? if ( !$min_pos ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } # .. has this file lower position? if ( $min_pos>$$line{pos} ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } } return $min_vcf; } sub read_line { my ($opts,$vcf) = @_; if ( $$vcf{eof} ) { return; } my @items; my $line; while ( !defined $line ) { $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } @items = split(/\t/,$line); if ( $$opts{apply_filters} ) { if ( $items[6] ne 'PASS' && $items[6] ne '.' 
) { $line = undef; next; } } } $$vcf{nread}++; my $chr = $items[0]; my $pos = $items[1]; my $ref = uc($items[3]); my $alt = uc($items[4]); if ( $$vcf{buf} && @{$$vcf{buf}} ) { my $prev = $$vcf{buf}[-1]; if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); } } push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf }; return; } vcftools_0.1.11/perl/vcf-fix-ploidy0000755000000000000000000002446512156354770015766 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); fix_ploidy($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "Usage: cat broken.vcf | vcf-fix-ploidy [OPTIONS] > fixed.vcf\n", "Options:\n", " -a, --assumed-sex M or F, required if the list is not complete in -s\n", " -l, --fix-likelihoods Add or remove het likelihoods (not the default behaviour)\n", " -p, --ploidy Ploidy definition. 
The default is shown below.\n", " -s, --samples List of sample sexes (sample_name [MF]).\n", " -h, -?, --help This help message.\n", "Default ploidy definition:\n", " ploidy =>\n", " {\n", " X =>\n", " [\n", " # The pseudoautosomal regions 60,001-2,699,520 and 154,931,044-155,270,560 with the ploidy 2\n", " { from=>1, to=>60_000, M=>1 },\n", " { from=>2_699_521, to=>154_931_043, M=>1 },\n", " ],\n", " Y =>\n", " [\n", " # No chrY in females and one copy in males\n", " { from=>1, to=>59_373_566, M=>1, F=>0 },\n", " ],\n", " MT =>\n", " [\n", " # Haploid MT in males and females\n", " { from=>1, to => 16_569, M=>1, F=>1 },\n", " ],\n", " }\n", "\n"; exit -1; } sub parse_params { my $opts = { ploidy => { X => [ { from=>1, to=>60_000, M=>1 }, { from=>2_699_521, to=>154_931_043, M=>1 }, ], Y => [ { from=>1, to=>59_373_566, M=>1, F=>0 }, ], MT => [ { from=>1, to => 16_569, M=>1, F=>1 }, ], }, }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-p' || $arg eq '--ploidy' ) { my $file=shift(@ARGV); my $x=do $file; $$opts{ploidy}=$x; next } if ( $arg eq '-s' || $arg eq '--samples' ) { $$opts{samples}=shift(@ARGV); next } if ( $arg eq '-a' || $arg eq '--assumed-sex' ) { $$opts{assumed_sex}=shift(@ARGV); next } if ( $arg eq '-l' || $arg eq '--fix-likelihoods' ) { $$opts{fix_likelihoods}=1; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{samples}) ) { error("Missing the -s option.\n") } return $opts; } sub fix_ploidy { my ($opts) = @_; my $vcf = $$opts{vcf} = Vcf->new(fh=>\*STDIN); $vcf->parse_header(); init_regions($opts); print $vcf->format_header; my @samples = $vcf->get_samples; my ($prev_chr,$prev_pos,$regions,$iregion,$nregions); my %nchanged; while (my $line = $vcf->next_line) { my $rec = $vcf->next_data_array($line); if ( !defined $prev_chr or $$rec[0] ne $prev_chr ) { $prev_chr = $$rec[0]; $prev_pos = $$rec[1]; if ( exists($$opts{regions}{$prev_chr}) ) { $regions = $$opts{regions}{$prev_chr}; $iregion = 0; $nregions = @$regions; } else { $regions = undef; } } $prev_chr = $$rec[0]; $prev_pos = $$rec[1]; my $samples; if ( defined $regions ) { if ( $prev_pos >= $$regions[$iregion]{from} && $prev_pos <= $$regions[$iregion]{to} ) { $samples = $$regions[$iregion]{samples}; } else { while ( $iregion<$nregions && $$regions[$iregion]{to}<$prev_pos ) { $iregion++; } if ( $iregion>=$nregions ) { undef $regions; } elsif ( $prev_pos >= $$regions[$iregion]{from} && $prev_pos <= $$regions[$iregion]{to} ) { $samples = $$regions[$iregion]{samples}; } } } if ( !defined $samples ) { print $line; next; } my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $ipl = $vcf->get_tag_index($$rec[8],'PL',':'); my $igl = $vcf->get_tag_index($$rec[8],'GL',':'); if ( $igt==-1 ) { print $line; next; } my @alt = split(/,/,$$rec[4]); my $nals = $alt[0] eq '.' ? 
1 : 1 + scalar @alt; my $changed = 0; my $nrec = @$rec; for (my $isample=9; $isample<$nrec; $isample++) { my $sample = $samples[$isample-9]; if ( !exists($$samples{$sample}) ) { next; } my $gt = $vcf->get_field($$rec[$isample],$igt); my ($pl,$gl); if ( $$opts{fix_likelihoods} && $ipl != -1 ) { $pl = $vcf->get_field($$rec[$isample],$ipl); } if ( $$opts{fix_likelihoods} && $igl != -1 ) { $gl = $vcf->get_field($$rec[$isample],$igl); } my ($new_gt, $new_pl, $new_gl); if ( !$$samples{$sample} ) { # missing genotype - leave it as it is unless it must be removed if ( $gt ne '.' && $gt ne './.' ) { my (@als) = $vcf->split_gt($gt); if ( defined $pl && $pl ne '.' ) { ($new_pl) = reploid_g($rec, 1, $nals, $pl, scalar @als, 1); } if ( defined $gl && $gl ne '.' ) { ($new_gl) = reploid_g($rec, -1, $nals, $gl, scalar @als, 1); } $new_gt = '.'; $nchanged{removed}{$sample}++; } } else { my (@als) = $vcf->split_gt($gt); if ( $$samples{$sample} != @als ) { $new_gt = join('/',($als[0]) x $$samples{$sample}); if ( defined $pl && $pl ne '.' ) { ($new_pl,$new_gt) = reploid_g($rec, 1, $nals, $pl, scalar @als, $$samples{$sample}); } if ( defined $gl && $gl ne '.' 
) { ($new_gl,$new_gt) = reploid_g($rec, -1, $nals, $gl, scalar @als, $$samples{$sample}); } } } if ( defined $new_gt ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_gt,$igt,':'); $changed++; } if ( defined $new_pl ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_pl,$ipl,':'); $changed++; } if ( defined $new_gl ) { $$rec[$isample] = $vcf->replace_field($$rec[$isample],$new_gl,$igl,':'); $changed++; } } if ( $changed ) { print join("\t",@$rec),"\n"; } else { print $line; } } # Output stats for my $key (sort keys %nchanged) { for my $sample (sort keys %{$nchanged{$key}}) { print STDERR "$sample\t$$opts{samples}{$sample}\t$key\t$nchanged{$key}{$sample}\n"; } } } sub reploid_g { my ($rec, $extr,$nals,$str,$n,$m) = @_; my @vals = split(/,/,$str); if ( $n==2 && $m==1 ) { my @out; my $d = 1; my $k = 0; my ($imin,$min); for (my $i=0; $i<$nals; $i++) { if ( $k>=@vals ) { error("Cannot reploid $$rec[0]:$$rec[1], too few values in $str: $nals, $n->$m ($i,$d,$k)\n"); } if ( $vals[$k] ne '.' && (!defined $min or $min>$extr*$vals[$k]) ) { $min = $extr*$vals[$k]; $imin = $i; } push @out, $vals[$k]; $d++; $k += $d; } my $gt = defined $imin ? $imin : 0; return (join(',',@out), $gt); } elsif ( $n==1 && $m==2 ) { my @out; my ($imin,$min); for (my $i=0; $i<$nals; $i++) { for (my $j=0; $j<=$i; $j++) { push @out, $i==$j ? $vals[$i] : '.'; if ( $vals[$i] ne '.' && (!defined $min or $min>$extr*$vals[$i]) ) { $min = $extr*$vals[$i]; $imin = $i; } } } my $gt = defined $imin ? 
$imin : 0; return (join(',',@out), "$gt/$gt" ); } else { error("Only diploid/haploid cases handled in this version, sorry."); } } sub init_regions { my ($opts) = @_; open(my $fh,'<',$$opts{samples}) or error("$$opts{samples}: $!"); my (%sexes,%samples); while (my $line=<$fh>) { $line =~ s/^\s*//; $line =~ s/\s*$//; if ( !($line=~/^(\S+)\s+(\S+)$/) ) { error("Could not parse the sample file $$opts{sample}, the offending line was: $line"); } push @{$sexes{$2}}, $1; $samples{$1} = $2; } close($fh); $$opts{samples} = \%samples; for my $sample ($$opts{vcf}->get_samples()) { if ( !exists($samples{$sample}) ) { if ( !exists($$opts{assumed_sex}) ) { error("Could not determine the sex of the sample \"$sample\". Would the -a option help here?\n"); } $samples{$sample} = $$opts{assumed_sex}; push @{$sexes{$$opts{assumed_sex}}}, $sample; } } # Create a quick look-up structure for my $chr (keys %{$$opts{ploidy}}) { if ( ref($$opts{ploidy}{$chr}) ne 'ARRAY' ) { error("Uh, expected list reference for $chr regions.\n"); } my $prev; for my $reg (sort { $$a{from}<=>$$b{to} } @{$$opts{ploidy}{$chr}}) { my $from = $$reg{from}; my $to = $$reg{to}; if ( defined $prev && $prev>=$from ) { error("FIXME: Overlapping regions $chr:$prev>=$from\n"); } $prev = $to; my $region; for my $sex (keys %sexes) { if ( !exists($$reg{$sex}) ) { next; } for my $sample (@{$sexes{$sex}}) { $$region{samples}{$sample} = $$reg{$sex}; } } if ( !defined $region ) { next; } $$region{from} = $from; $$region{to} = $to; push @{$$opts{regions}{$chr}}, $region; } } } vcftools_0.1.11/perl/vcf-subset0000755000000000000000000001703212156354770015177 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); vcf_subset($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-subset [OPTIONS] in.vcf.gz > out.vcf\n", "Options:\n", " -a, --trim-alt-alleles Remove alternate 
alleles if not found in the subset\n", " -c, --columns File or comma-separated list of columns to keep in the vcf file. If file, one column per row\n", " -e, --exclude-ref Exclude rows not containing variants.\n", " -f, --force Proceed anyway even if VCF does not contain some of the samples.\n", " -p, --private Print only rows where only the subset columns carry an alternate allele.\n", " -r, --replace-with-ref Replace the excluded types with reference allele instead of dot.\n", " -t, --type Comma-separated list of variant types to include: ref,SNPs,indels,MNPs,other.\n", " -u, --keep-uncalled Do not exclude rows without calls.\n", " -h, -?, --help This help message.\n", "Examples:\n", " cat in.vcf | vcf-subset -r -t indels -e -c SAMPLE1 > out.vcf\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { exclude_ref=>0, keep_uncalled=>0, replace_with_ref=>0, private=>0, args=>[$0, @ARGV] }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-t' || $arg eq '--type' ) { my %known = ( ref=>'r', SNPs=>'s', indels=>'i', MNPs=>'m', other=>'o' ); my $types = shift(@ARGV); for my $t (split(/,/,$types)) { if ( !(exists($known{$t})) ) { error("Unknown type [$t] with -t [$types]\n"); } $$opts{types}{$known{$t}} = 1; } next; } if ( $arg eq '-a' || $arg eq '--trim-alt-alleles' ) { $$opts{'trim_alts'} = 1; next } if ( $arg eq '-e' || $arg eq '--exclude-ref' ) { $$opts{'exclude_ref'} = 1; next } if ( $arg eq '-f' || $arg eq '--force' ) { $$opts{'force'} = 1; next } if ( $arg eq '-p' || $arg eq '--private' ) { $$opts{'private'} = 1; next } if ( $arg eq '-r' || $arg eq '--replace-with-ref' ) { $$opts{'replace_with_ref'} = 1; next } if ( $arg eq '-u' || $arg eq '--keep-uncalled' ) { $$opts{'keep_uncalled'} = 1; next } if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{'columns_file'} = shift(@ARGV); next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file} = $arg; next } if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !$$opts{exclude_ref} && !$$opts{'columns_file'} && !exists($$opts{'types'}) && !exists($$opts{trim_alts}) ) { error("Missing the -c or -t or -r option.\n") } return $opts; } sub read_columns { my ($fname) = @_; my @columns; if ( !-e $fname ) { @columns = split(/,/,$fname); return \@columns; } open(my $fh,'<',$fname) or error("$fname: $!"); while (my $line=<$fh>) { chomp($line); $line=~s/\s+//g; push @columns, $line; } close($fh); return \@columns; } sub check_columns { my ($opts,$vcf,$columns) = @_; my @out; for my $col (@$columns) { if ( exists($$vcf{has_column}{$col}) ) { push @out, $col; next; } my $msg = qq[No such column in the VCF file: "$col"\n]; if ( $$opts{force} ) { warn($msg); } else { error($msg); } } return \@out; } sub vcf_subset { my ($opts) = @_; my $vcf = $$opts{file} ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); my $AGtags; if ( $$opts{trim_alts} ) { $$vcf{trim_redundant_ALTs} = 1; $AGtags = $vcf->has_AGtags(); } # Init requested column info. If not present, include all columns. my $columns = exists($$opts{columns_file}) ? 
read_columns($$opts{columns_file}) : []; $columns = check_columns($opts,$vcf,$columns); if ( !@$columns && (my $ncols=@{$$vcf{columns}})>9 ) { push @$columns, @{$$vcf{columns}}[9..($ncols-1)]; } my $columns_to_keep = { map { $_ => 1 } @$columns }; my %has_col = map { $_ => 1 } @$columns; $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf->set_samples(include=>$columns) unless $$opts{private}; print $vcf->format_header($columns); my $check_private = $$opts{private}; while (my $x=$vcf->next_data_hash()) { my $site_has_call = 0; my $site_has_nonref = 0; my $site_is_private = 1; my $ref = $$x{REF}; for my $col (keys %{$$x{gtypes}}) { if ( !$has_col{$col} && ($site_is_private==0 || !$check_private) ) { # This column is not to be printed delete($$x{gtypes}{$col}); next; } my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); my $sample_has_call = 0; my $sample_has_nonref = 0; my @out_alleles; for (my $i=0; $i<@$alleles; $i++) { my ($type,$len,$ht) = $vcf->event_type($ref,$$alleles[$i]); $out_alleles[$i] = $$alleles[$i]; # Exclude unwanted variant types if requested if ( exists($$opts{types}) ) { if ( $type eq 's' && $len>1 ) { $type = 'm'; } elsif ( $type eq 'b' or $type eq 'u' ) { $type = 'o'; } if ( !exists($$opts{types}{$type}) ) { $out_alleles[$i] = $$opts{replace_with_ref} ? 
$ref : '.'; next; } $sample_has_call = 1; } elsif ( !$is_empty ) { $sample_has_call = 1; } if ( $type ne 'r' ) { $site_has_nonref = 1; $sample_has_nonref = 1; } } if ( $check_private && !$has_col{$col} ) { if ( $sample_has_nonref ) { $site_is_private=0; } delete($$x{gtypes}{$col}); next; } if ( !$sample_has_call ) { if ( $$opts{replace_with_ref} ) { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = $ref; } } else { for (my $i=0; $i<@$alleles; $i++) { $out_alleles[$i] = '.'; } } } else { $site_has_call = 1; } $$x{gtypes}{$col}{GT} = $vcf->format_haplotype(\@out_alleles,$seps); } if ( !$site_has_call && !$$opts{keep_uncalled} ) { next; } if ( !$site_has_nonref && $$opts{exclude_ref} ) { next; } if ( $check_private && (!$site_is_private || !$site_has_nonref) ) { next; } if ( $$opts{trim_alts} && defined $AGtags ) { $vcf->remove_columns($x, keep=>$columns_to_keep); $vcf->parse_AGtags($x); } $vcf->format_genotype_strings($x); print $vcf->format_line($x,$columns); } } vcftools_0.1.11/perl/vcf-validator0000755000000000000000000000643112156354770015660 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use IPC::Open3 'open3'; use IO::Select; my $opts = parse_params(); do_validation($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-validator [OPTIONS] file.vcf.gz\n", "Options:\n", " -d, --duplicates Warn about duplicate positions.\n", " -u, --unique-messages Output all messages only once.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { unique=>0, duplicates=>0 }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--duplicates' ) { $$opts{duplicates}=1; next; } if ( $arg eq '-u' || $arg eq '--unique-messages' ) { $$opts{unique}=1; next; } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( (-e $arg or $arg=~m{^(?:ftp|http)://}) && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } error("Unknown parameter or non-existent file: \"$arg\". Run -h for help.\n"); } return $opts; } sub do_validation { my ($opts) = @_; my %opts = $$opts{file} ? (file=>$$opts{file}) : (fh=>\*STDIN); if ( !$$opts{unique} ) { my $vcf = Vcf->new(%opts, warn_duplicates=>$$opts{duplicates}); $vcf->run_validation(); return; } my ($kid_in,$kid_out,$kid_err); my $pid = open3($kid_in,$kid_out,$kid_err,'-'); if ( !defined $pid ) { error("Cannot fork: $!"); } if ($pid) { $$opts{known_lines} = []; my $sel = new IO::Select; $sel->add($kid_out,$kid_err); while(my @ready = $sel->can_read) { foreach my $fh (@ready) { my $line = <$fh>; if (not defined $line) { $sel->remove($fh); next; } print_or_discard_line($opts,$line); } } print_summary($opts); } else { my $vcf = Vcf->new(%opts, warn_duplicates=>$$opts{duplicates}); $vcf->run_validation(); return; } } sub print_or_discard_line { my ($opts,$line) = @_; my @items = split(/\s+/,$line); my $nitems = scalar @items; for my $known (@{$$opts{known_lines}}) { if ( @items != @{$$known{line}} ) { next; } my $nmatches = 0; for (my $i=0; $i<$nitems; $i++) { if ( $items[$i] eq $$known{line}[$i] ) { $nmatches++ } } if ( $nitems-$nmatches<3 ) { $$known{n}++; return; } } push @{$$opts{known_lines}}, { line=>\@items, n=>1 }; print $line; } sub print_summary { my ($opts) = @_; my $n = 0; for my $error (@{$$opts{known_lines}}) { $n += $$error{n}; } print "\n\n------------------------\n"; print "Summary:\n"; printf "\t%d errors total \n\n", $n; $n = 0; for my $error (sort {$$b{n}<=>$$a{n}} @{$$opts{known_lines}}) { if ( $n++ > 50 ) { print "\n\nand more...\n"; last; } printf "\t%d\t..\t%s\n", $$error{n},join(' ',@{$$error{line}}); } } vcftools_0.1.11/perl/vcf-indel-stats0000755000000000000000000001043012156354770016114 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my 
$opts = parse_params(); do_stats($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Currently calculates in-frame ratio.\n", "Usage: vcf-indel-stats [OPTIONS] < in.vcf > out.txt\n", "Options:\n", " -h, -?, --help This help message.\n", " -e, --exons Tab-separated file with exons (chr,from,to; 1-based, inclusive)\n", " -v, --verbose\n", "\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-e' || $arg eq '--exons' ) { $$opts{exons}=shift(@ARGV); next; } if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose}=1; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub init_regions { my ($opts) = @_; my $exname = $$opts{exons}=~/\.gz$/i ? "gunzip -c $$opts{exons} |" : "<$$opts{exons}"; open(my $exfh, $exname) or error("$exname: $!"); my %regs; while (my $line=<$exfh>) { my ($chr,$from,$to) = split(/\t/,$line); chomp($to); push @{$regs{$chr}}, [$from,$to]; } close($exfh); for my $chr (keys %regs) { $regs{$chr} = [ sort { if ($$a[0]==$$b[0]) {return $$a[1]<=>$$b[1]} else {return $$a[0]<=>$$b[0]} } @{$regs{$chr}} ]; } $$opts{regs} = \%regs; $$opts{iregs} = {}; } sub do_stats { my ($opts) = @_; init_regions($opts); my $vcf = Vcf->new(fh=>\*STDIN); $vcf->parse_header; $$opts{in_frame} = $$opts{out_frame} = 0; my ($prev_chr,$prev_pos); my $ntot=0; while (my $line=$vcf->next_line) { if ( substr($line,0,1) eq '#' ) { next; } my $i=0; my $j; $j=index($line,"\t",$i); my $chr=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); my $pos=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); $i=$j+1; $j=index($line,"\t",$i); my $ref=substr($line,$i,$j-$i); $i=$j+1; $j=index($line,"\t",$i); my $alt=substr($line,$i,$j-$i); $i=$j+1; if ( defined $prev_chr && $prev_chr eq $chr && $prev_pos>$pos ) { error("The VCF file must be sorted"); } $prev_chr = $chr; $prev_pos = 
$pos; if ( $alt eq '.' ) { next; } #print "[$chr] [$pos] [$ref] [$alt]\n"; my $is_indel; $i=0; while (($j=index($alt,',',$i))!=-1) { my ($type,$len,$ht) = $vcf->event_type($ref,substr($alt,$i,$j-$i)); if ( $type eq 'i' or $type eq 'o' ) { check_csq($opts,$chr,$pos,$len); } $i = $j+1; } my ($type,$len,$ht) = $vcf->event_type($ref,substr($alt,$i)); if ( $type eq 'i' or $type eq 'o' ) { $ntot++; check_csq($opts,$chr,$pos,$len); } } printf "total\t%d\n", $ntot; printf "in-frame\t%d\n", $$opts{in_frame}; printf "frameshift\t%d\n", $$opts{out_frame}; printf "ratio\t%f\n", ($$opts{out_frame}+$$opts{in_frame})?$$opts{out_frame}/($$opts{out_frame}+$$opts{in_frame}) : 0; } sub check_csq { my ($opts,$chr,$pos,$len) = @_; my $opos = $pos; if ( !exists($$opts{regs}{$chr}) ) { return; } my $regs = $$opts{regs}{$chr}; my $ir = exists($$opts{iregs}{$chr}) ? $$opts{iregs}{$chr} : 0; while ( $ir<@$regs && $$regs[$ir][1] <= $pos ) { $ir++; } $$opts{iregs}{$chr} = $ir; if ( $ir>=@$regs ) { return; } my $reg_to = $$regs[$ir][1]; if ( $reg_to<=$pos ) { return; } my $reg_from = $$regs[$ir][0]; my $to = $len<0 ? $pos-$len : $pos+1; if ( $to<$reg_from ) { return; } $pos++; if ( $pos<$reg_from ) { $len += $reg_from-$pos; $pos = $reg_from; } if ( $reg_to<$to && $len<0 ) { $len += $to-$reg_to; } #print "\tinside $$regs[$ir][0] - $$regs[$ir][1] ($pos,$to,$len)\n"; #if ( $len%3 || ($pos-$reg_from)%3 ) { $$opts{out_frame}++; } if ( $len%3 ) { $$opts{out_frame}++; } else { $$opts{in_frame}++; } if ( $$opts{verbose} ) { print "$chr\t$opos\t$$regs[$ir][0]\t$$regs[$ir][1]\t", ($len%3 ? 
'frameshift':'inframe') ,"\n"; } } vcftools_0.1.11/perl/fill-an-ac0000755000000000000000000000235512156354770015023 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); fill_an_ac($$opts{file}); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: fill-an-ac [OPTIONS] < in.vcf >out.vcf\n", "Options:\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( -e $arg ) { $$opts{file} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub fill_an_ac { my ($file) = @_; my $vcf = $file ? Vcf->new(file=>$file) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); $vcf->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); print $vcf->format_header(); $vcf->recalc_ac_an(2); while (my $rec=$vcf->next_data_hash()) { print $vcf->format_line($rec); } } vcftools_0.1.11/perl/fill-fs0000755000000000000000000001742512156354770014460 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; use FaSlice; my $opts = parse_params(); flanking_sequence($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: Annotate VCF with flanking sequence (INFO/FS tag)\n", "Usage: fill-fs [OPTIONS] file.vcf\n", "Options:\n", " -b, --bed-mask Regions to mask (tabix indexed), multiple files can be given\n", " -c, --cluster Do self-masking of clustered variants within this range.\n", " -l, --length Flanking sequence length [100]\n", " -m, --mask-char The character to use or \"lc\" for lowercase. 
This option must preceed\n", " -b, -v or -c in order to take effect. With multiple files works\n", " as a switch on the command line, see the example below [N]\n", " -r, --refseq The reference sequence.\n", " -v, --vcf-mask Mask known variants in the flanking sequence, multiple files can be given (tabix indexed)\n", " -h, -?, --help This help message.\n", "Example:\n", " # Mask variants from the VCF file with N's and use lowercase for the bed file regions\n", " fill-fs file.vcf -v mask.vcf -m lc -b mask.bed\n", "\n"; } sub parse_params { my $opts = { length=>100, mask=>[], cluster=>0 }; my $mask = $$opts{mask_char}{default} = 'N'; my $mask_changed = 0; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-c' || $arg eq '--cluster' ) { $$opts{cluster}=shift(@ARGV); $$opts{mask_char}{default}=$mask; $mask_changed=0; next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-l' || $arg eq '--length' ) { $$opts{length}=shift(@ARGV); next; } if ( $arg eq '-m' || $arg eq '--mask' ) { $mask=shift(@ARGV); check_mask_char($mask); $mask_changed=1; next; } if ( $arg eq '-b' || $arg eq '--bed-mask' ) { $arg=shift(@ARGV); push @{$$opts{bed_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-v' || $arg eq '--vcf-mask' ) { $arg=shift(@ARGV); push @{$$opts{vcf_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } if ( !($$opts{length}=~/^\d+$/) ) { error("Expected integer after -l, got $$opts{length}\n"); } if ( !exists($$opts{refseq}) ) { error("Missing the -r option.\n"); } if ( $mask_changed ) { error("The -m parameter must preceed -b, -v, or the file in order to take effect.\n"); } return $opts; } sub check_mask_char { my ($mask) = @_; if ( $mask eq 'lc' ) { return; } if ( length($mask) eq 1 ) { return; } error("Currently only \"lc\" or one-character mask is supported, got \"$mask\".\n"); } sub flanking_sequence { my ($opts) = @_; $$opts{faref} = FaSlice->new(file=>$$opts{refseq},size=>1_024,oob=>'N'); my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header; print $vcf->format_header; my (@lines,@mask); while (my $line=$vcf->next_data_array) { my $chr = $$line[0]; my $pos = $$line[1]; my $ref = $$line[3]; my $alt = $$line[4]; my $off; $alt =~ s/,.+$//; # first allele is used at multiallelic sites ($off,$ref,$alt) = $vcf->normalize_alleles_pos($ref,$alt); $pos += $off; push @lines, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line }; push @mask, { chr=>$chr, pos=>$pos, ref=>$ref }; flush_buffers($opts,\@lines,\@mask); } flush_buffers($opts,\@lines,\@mask,1); } sub flush_buffers { my ($opts,$lines,$mask,$force) = @_; if ( !@$lines ) { return; } if ( !$$opts{cluster} ) { shift(@$mask); output_line($opts,shift(@$lines),$mask); return; } while ( @$lines && ($force or $$mask[0]{chr} ne $$lines[-1]{chr} or $$mask[0]{pos}+2*$$opts{cluster}<=$$lines[-1]{pos}) ) { output_line($opts,$$lines[0],$mask); shift(@$lines); while ( @$mask && @$lines && ($$mask[0]{chr} ne $$lines[0]{chr} or $$mask[0]{pos}+$$opts{cluster}<=$$lines[0]{pos}) ) { shift(@$mask); } } } sub output_line { my ($opts,$hline,$mask) = @_; my $chr = $$hline{chr}; my $pos = $$hline{pos}; my $ref = $$hline{ref}; my $alt = $$hline{alt}; my $line = $$hline{line}; my $seq_pos = $$opts{length}; my $reflen = length($ref); my $from 
= $pos-$$opts{length}; my $to = $pos+($reflen-1)+$$opts{length}; my $seq = $$opts{faref}->get_slice($chr,$from,$to); $seq = mask_sequence($opts,$seq,$chr,$from,$to,$mask); my $reflen_ori = $reflen; my ($len,$indel,$off) = $$opts{vcf}->is_indel($ref,$alt); if ( $len<0 ) { $seq_pos += $off; $ref = $indel; $reflen = abs($len); $alt = '-'; } elsif ( $len>0 ) { $seq_pos += $off; $ref = '-'; $alt = $indel; $reflen = $off-1; } substr($seq,$seq_pos,$reflen,"[$ref/$alt]"); if ( $reflen_ori - $reflen > 0 ) { # for redundant pad bases which cannot be removed without changing the position, e.g. ACGT AC $seq = substr($seq,$reflen_ori-$reflen); } if ( $$line[7] eq '.' or !defined $$line[7] ) { $$line[7] = ''; } else { $$line[7] .= ';'; } $$line[7] .= "FS=$seq"; print join("\t",@$line),"\n"; } sub mask_sequence { my ($opts,$seq,$chr,$from,$to,$mask) = @_; for my $m (@$mask) { my $reflen = length($$m{ref}); if ( $$m{chr} ne $chr or $$m{pos}+$reflen<$from or $$m{pos}>$to ) { next; } apply_mask($opts,\$seq,$$m{pos}-$from,$$m{ref},$$opts{mask_char}{default}); } for my $file (@{$$opts{vcf_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my $items = $$opts{vcf}->split_mandatory($ret); # In different situations one may want to treat indels differently. For # now, mask the whole REF string as for primer design it is safer to # mask the whole thing; for example, a 2bp deletion can be reported by # samtools as REF=GACACACA ALT=GACACA, the script will mask it all. apply_mask($opts,\$seq,$$items[1]-$from,$$items[3],$$opts{mask_char}{$file}); } } for my $file (@{$$opts{bed_mask}}) { my @tabix = `tabix $file $chr:$from-$to`; for my $ret (@tabix) { my @items = split(/\t/,$ret); apply_mask($opts,\$seq,$items[1]-$from+1,$items[2]-$from,$$opts{mask_char}{$file}); } } return $seq; } sub apply_mask { my ($opts,$seq,$from,$ref,$mask_char) = @_; if ( $from<0 ) { $from=0; } my $ref_len = $ref=~/^\d+$/ ? 
$ref-$from+1 : length($ref); my $seq_len = length($$seq); if ( $from+$ref_len>=$seq_len ) { $ref_len = $seq_len - $from; } if ( $ref_len<0 ) { return; } if ( $ref_len==1 ) { my $rpl = substr($$seq,$from,1); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,1)) : $mask_char; substr($$seq,$from,1,$rpl); return; } my $rpl = substr($$seq,$from,$ref_len); $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,$ref_len)) : ($mask_char x $ref_len); substr($$seq,$from,$ref_len,$rpl); } vcftools_0.1.11/perl/Vcf.pm0000644000000000000000000031000012156354770014233 0ustar rootrootpackage Vcf; our $VERSION = 'r840'; # http://vcftools.sourceforge.net/specs.html # http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:variant_call_format # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf4.0 # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf_4.0_sv # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 # http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcfv3.2 # # Authors: petr.danecek@sanger # for VCF v3.2, v3.3, v4.0, v4.1 # =head1 NAME Vcf.pm. Module for validation, parsing and creating VCF files. Supported versions: 3.2, 3.3, 4.0, 4.1 =head1 SYNOPSIS From the command line: perl -MVcf -e validate example.vcf perl -I/path/to/the/module/ -MVcf -e validate_v32 example.vcf From a script: use Vcf; my $vcf = Vcf->new(file=>'example.vcf.gz',region=>'1:1000-2000'); $vcf->parse_header(); # Do some simple parsing. Most thorough but slowest way how to get the data. 
while (my $x=$vcf->next_data_hash()) { for my $gt (keys %{$$x{gtypes}}) { my ($al1,$sep,$al2) = $vcf->parse_alleles($x,$gt); print "\t$gt: $al1$sep$al2\n"; } print "\n"; } # This will split the fields and print a list of CHR:POS while (my $x=$vcf->next_data_array()) { print "$$x[0]:$$x[1]\n"; } # This will return the lines as they were read, including the newline at the end while (my $x=$vcf->next_line()) { print $x; } # Only the columns NA00001, NA00002 and NA00003 will be printed. my @columns = qw(NA00001 NA00002 NA00003); print $vcf->format_header(\@columns); while (my $x=$vcf->next_data_array()) { # this will recalculate AC and AN counts, unless $vcf->recalc_ac_an was set to 0 print $vcf->format_line($x,\@columns); } $vcf->close(); =cut use strict; use warnings; use Carp; use Exporter; use Data::Dumper; use POSIX ":sys_wait_h"; use vars qw/@ISA @EXPORT/; @ISA = qw/Exporter/; @EXPORT = qw/validate validate_v32/; =head2 validate About : Validates the VCF file. Usage : perl -MVcf -e validate example.vcf.gz # (from the command line) validate('example.vcf.gz'); # (from a script) validate(\*STDIN); Args : File name or file handle. When no argument given, the first command line argument is interpreted as the file name. =cut sub validate { my ($fh) = @_; if ( !$fh && @ARGV ) { $fh = $ARGV[0]; } my $vcf; if ( $fh ) { $vcf = fileno($fh) ? Vcf->new(fh=>$fh) : Vcf->new(file=>$fh); } else { $vcf = Vcf->new(fh=>\*STDIN); } $vcf->run_validation(); } =head2 validate_v32 About : Same as validate, but assumes v3.2 VCF version. Usage : perl -MVcf -e validate_v32 example.vcf.gz # (from the command line) Args : File name or file handle. When no argument given, the first command line argument is interpreted as the file name. =cut sub validate_v32 { my ($fh) = @_; if ( !$fh && @ARGV && -e $ARGV[0] ) { $fh = $ARGV[0]; } my %params = ( version=>'3.2' ); my $vcf; if ( $fh ) { $vcf = fileno($fh) ? 
Vcf->new(%params, fh=>$fh) : Vcf->new(%params, file=>$fh); } else { $vcf = Vcf->new(%params, fh=>\*STDIN); } $vcf->run_validation(); } =head2 new About : Creates new VCF reader/writer. Usage : my $vcf = Vcf->new(file=>'my.vcf', version=>'3.2'); Args : fh .. Open file handle. If neither file nor fh is given, open in write mode. file .. The file name. If neither file nor fh is given, open in write mode. region .. Optional region to parse (requires tabix indexed VCF file) silent .. Unless set to 0, warning messages may be printed. strict .. Unless set to 0, the reader will die when the file violates the specification. version .. If not given, '4.0' is assumed. The header information overrides this setting. =cut sub new { my ($class,@args) = @_; my $self = {@args}; bless $self, ref($class) || $class; $$self{silent} = 0 unless exists($$self{silent}); $$self{strict} = 0 unless exists($$self{strict}); $$self{buffer} = []; # buffer stores the lines in the reverse order $$self{columns} = undef; # column names $$self{mandatory} = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] unless exists($$self{mandatory}); $$self{reserved}{cols} = {CHROM=>1,POS=>1,ID=>1,REF=>1,ALT=>1,QUAL=>1,FILTER=>1,INFO=>1,FORMAT=>1} unless exists($$self{reserved_cols}); $$self{recalc_ac_an} = 1; $$self{has_header} = 0; $$self{default_version} = '4.1'; $$self{versions} = [ qw(Vcf3_2 Vcf3_3 Vcf4_0 Vcf4_1) ]; if ( !exists($$self{max_line_len}) && exists($ENV{MAX_VCF_LINE_LEN}) ) { $$self{max_line_len} = $ENV{MAX_VCF_LINE_LEN} } $$self{fix_v40_AGtags} = $ENV{DONT_FIX_VCF40_AG_TAGS} ? 
0 : 1; my %open_args = (); if ( exists($$self{region}) ) { $open_args{region}=$$self{region}; if ( !exists($$self{print_header}) ) { $$self{print_header}=1; } } if ( exists($$self{print_header}) ) { $open_args{print_header}=$$self{print_header}; } return $self->_open(%open_args); } sub throw { my ($self,@msg) = @_; confess @msg,"\n"; } sub warn { my ($self,@msg) = @_; if ( $$self{silent} ) { return; } if ( $$self{strict} ) { $self->throw(@msg); } warn @msg; } sub _open { my ($self,%args) = @_; if ( !exists($$self{fh}) && !exists($$self{file}) ) { # Write mode, the version must be supplied by the user return $self->_set_version(exists($$self{version}) ? $$self{version} : $$self{default_version}); } # Open the file unless filehandle is provided if ( !exists($$self{fh}) ) { if ( !defined $$self{file} ) { $self->throw("Undefined value passed to Vcf->new(file=>undef)."); } my $cmd = "<$$self{file}"; my $tabix_args = ''; if ( exists($args{print_header}) && $args{print_header} ) { $tabix_args .= ' -h '; } $tabix_args .= qq['$$self{file}']; if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= qq[ '$args{region}']; } if ( -e $$self{file} && $$self{file}=~/\.gz/i ) { if ( exists($args{region}) && defined($args{region}) ) { $cmd = "tabix $tabix_args |"; } else { $cmd = "gunzip -c '$$self{file}' |"; } } elsif ( $$self{file}=~m{^(?:http|ftp)://} ) { if ( !exists($args{region}) ) { $tabix_args .= ' .'; } $cmd = "tabix $tabix_args |"; } open($$self{fh},$cmd) or $self->throw("$cmd: $!"); } # Set the correct VCF version, but only when called for the first time my $vcf = $self; if ( !$$self{_version_set} ) { my $first_line = $self->next_line(); $vcf = $self->_set_version($first_line); $self->_unread_line($first_line); } return $vcf; } =head2 open About : (Re)Open file. No need to call this explicitly unless reading from a different region is requested. Usage : $vcf->open(); # Read from the start $vcf->open(region=>'1:12345-92345'); Args : region .. 
Supported only for tabix indexed files =cut sub open { my ($self,%args) = @_; $self->close(); $self->_open(%args); } =head2 close About : Close the filehandle Usage : $vcf->close(); Args : none Returns : close exit status =cut sub close { my ($self) = @_; if ( !$$self{fh} ) { return; } my $ret = close($$self{fh}); delete($$self{fh}); return $ret; } =head2 next_line About : Reads next VCF line. Usage : my $vcf = Vcf->new(); my $x = $vcf->next_line(); Args : none =cut sub next_line { my ($self) = @_; if ( @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); } my $line; if ( !exists($$self{max_line_len}) ) { $line = readline($$self{fh}); } else { while (1) { $line = readline($$self{fh}); if ( !defined $line ) { last; } my $len = length($line); if ( $len>$$self{max_line_len} && !($line=~/^#/) ) { if ( !($line=~/^([^\t]+)\t([^\t]+)/) ) { $self->throw("Could not parse the line: $line"); } $self->warn("The VCF line too long, ignoring: $1 $2 .. len=$len\n"); next; } last; } } return $line; } sub _unread_line { my ($self,$line) = @_; unshift @{$$self{buffer}}, $line; return; } =head2 next_data_array About : Reads next VCF line and splits it into an array. The last element is chomped. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $x = $vcf->next_data_array(); Args : Optional line to parse =cut sub next_data_array { my ($self,$line) = @_; if ( !$line ) { $line = $self->next_line(); } if ( !$line ) { return undef; } if ( ref($line) eq 'ARRAY' ) { return $line; } my @items = split(/\t/,$line); if ( @items<8 ) { $line=~s/\n/\\n/g; $self->throw("Could not parse the line, wrong number of columns: [$line]"); } chomp($items[-1]); return \@items; } =head2 set_samples About : Parsing big VCF files with many sample columns is slow, not parsing unwanted samples may speed things a bit. Usage : my $vcf = Vcf->new(); $vcf->set_samples(include=>['NA0001']); # Exclude all but this sample. When the array is empty, all samples will be excluded. 
$vcf->set_samples(exclude=>['NA0003']); # Include only this sample. When the array is empty, all samples will be included. my $x = $vcf->next_data_hash(); Args : Optional line to parse =cut sub set_samples { my ($self,%args) = @_; if ( exists($args{include}) ) { for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 0; } for my $sample (@{$args{include}}) { if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); } my $idx = $$self{has_column}{$sample} - 1; $$self{samples_to_parse}[$idx] = 1; } } if ( exists($args{exclude}) ) { for (my $i=0; $i<@{$$self{columns}}; $i++) { $$self{samples_to_parse}[$i] = 1; } for my $sample (@{$args{exclude}}) { if ( !exists($$self{has_column}{$sample}) ) { $self->throw("The sample not present in the VCF file: [$sample]\n"); } my $idx = $$self{has_column}{$sample} - 1; $$self{samples_to_parse}[$idx] = 0; } } } sub _set_version { my ($self,$version_line) = @_; if ( $$self{_version_set} ) { return $self; } $$self{_version_set} = 1; $$self{version} = $$self{default_version}; if ( $version_line ) { if ( $version_line=~/^(\d+(?:\.\d+)?)$/ ) { $$self{version} = $1; undef $version_line; } elsif ( !($version_line=~/^##fileformat=/i) or !($version_line=~/(\d+(?:\.\d+)?)\s*$/i) ) { chomp($version_line); $self->warn("Could not parse the fileformat version string [$version_line], assuming VCFv$$self{default_version}\n"); undef $version_line; } else { $$self{version} = $1; } } my $reader; if ( $$self{version} eq '3.2' ) { $reader=Vcf3_2->new(%$self); } elsif ( $$self{version} eq '3.3' ) { $reader=Vcf3_3->new(%$self); } elsif ( $$self{version} eq '4.0' ) { $reader=Vcf4_0->new(%$self); } elsif ( $$self{version} eq '4.1' ) { $reader=Vcf4_1->new(%$self); } else { $self->warn(qq[The version "$$self{version}" not supported, assuming VCFv$$self{default_version}\n]); $$self{version} = '4.1'; $reader = Vcf4_1->new(%$self); } $self = $reader; # When changing version, change also 
the fileformat header line if ( exists($$self{header_lines}) && exists($$self{header_lines}[0]{key}) && $$self{header_lines}[0]{key} eq 'fileformat' ) { shift(@{$$self{header_lines}}); } return $self; } #--------------------------------------- package VcfReader; use base qw(Vcf); use strict; use warnings; use Carp; use Data::Dumper; sub new { my ($class,@args) = @_; my $self = {@args}; bless $self, ref($class) || $class; return $self; } =head2 next_data_hash About : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $x = $vcf->next_data_hash(); # Or having a VCF data line $line my $x = $vcf->next_data_hash($line); Args : Optional line to parse. =cut sub next_data_hash { my ($self,$line) = @_; if ( !$line ) { $line = $self->next_line(); } if ( !$line ) { return undef; } my @items; if ( ref($line) eq 'ARRAY' ) { @items = @$line; } else { @items = split(/\t/,$line); } chomp($items[-1]); my $cols = $$self{columns}; if ( !$cols ) { $self->_fake_column_names(scalar @items - 9); $cols = $$self{columns}; } # Check the number of columns if ( scalar @items != scalar @$cols ) { if ( $line=~/^\s*$/ ) { $self->throw("Sorry, empty lines not allowed.\n"); } my $c = substr($line,0,1); if ( $c eq '#' ) { if ( !$$self{header_parsed} ) { $self->throw("FIXME: parse_header must be called before next_data_hash.\n"); } else { $self->throw("Multiple header blocks (^#) not allowed.\n"); } } if ( $items[-1] eq '' ) { my $nremoved = 0; while ( $items[-1] eq '' ) { pop(@items); $nremoved++; } if ( $nremoved && !$$self{trailing_tabs_warned} ) { $self->warn("Broken VCF: empty columns (trailing TABs) starting at $items[0]:$items[1].\n"); $$self{trailing_tabs_warned} = 1; } } if ( scalar @items != scalar @$cols ) { my @test = split(/\s+/,$line); if ( scalar @test == scalar @$cols ) { $self->warn("(Were spaces used instead of tabs?)\n\n"); } else { $self->throw(sprintf "Wrong number of fields%s; expected 
%d, got %d. The offending line was:\n[%s]\n\n", exists($$self{file}) ? "in $$self{file}" : '', scalar @$cols, scalar @items, join("\t",@items)); } @items = @test; } } my %out; # Mandatory fields $out{CHROM} = $items[0]; $out{POS} = $items[1]; $out{ID} = $items[2]; $out{REF} = $items[3]; $out{ALT} = [ split(/,/,$items[4]) ]; $out{QUAL} = $items[5]; $out{FILTER} = [ split(/;/,$items[6]) ]; # INFO, e.g. NS=58;DP=258;AF=0.786;DB;H2 if ( defined $items[7] ) { my %hash; for my $info (split(/;/,$items[7])) { my ($key,$val) = split(/=/,$info); if ( !defined $key ) { $self->warn("Broken VCF file, empty INFO field at $items[0]:$items[1]\n"); next; } if ( defined $val ) { $hash{$key} = $val; } elsif ( exists($$self{header}{INFO}{$key}) ) { $hash{$key} = $$self{header}{INFO}{$key}{default}; } else { $hash{$key} = undef; } } $out{INFO} = \%hash; } # The FORMAT field may not be present. GT:GQ:DP:HQ my $format; if ( $$cols[8] || $items[8] ) { $format = $out{FORMAT} = [ split(/:/,$items[8]) ]; if ( (!$$format[0] || $$format[0] ne 'GT') && !$$self{ignore_missing_GT} ) { $self->warn("Expected GT as the first genotype field at $items[0]:$items[1]\n"); } } # Genotype fields my %gtypes; my $check_nformat = $$self{drop_trailings} ? 0 : 1; for (my $icol=9; $icol<@items; $icol++) { if ( $items[$icol] eq '' ) { $self->warn("Empty column $$cols[$icol] at $items[0]:$items[1]\n"); next; } if ( exists($$self{samples_to_parse}) && !$$self{samples_to_parse}[$icol] ) { next; } my @fields = split(/:/, $items[$icol]); if ( $check_nformat && @fields != @$format ) { $self->warn("Different number of fields in the format and the column $$cols[$icol] at $items[0]:$items[1] (" .scalar @fields." 
vs ".scalar @$format.": [",join(',',@fields),"] vs [",join(',',@$format),"])\n"); } my %hash; for (my $ifield=0; $ifield<@fields; $ifield++) { $hash{$$format[$ifield]} = $fields[$ifield]; } $gtypes{$$cols[$icol]} = \%hash; } $out{gtypes} = \%gtypes; return \%out; } =head2 parse_header About : Reads (and stores) the VCF header. Usage : my $vcf = Vcf->new(); $vcf->parse_header(); Args : silent .. do not warn about duplicate header lines =cut sub parse_header { my ($self,%args) = @_; # First come the header lines prefixed by ## while ($self->_next_header_line(%args)) { ; } # Now comes the column names line prefixed by # $self->_read_column_names(); $$self{header_parsed} = 1; } =head2 _next_header_line About : Stores the header lines and meta information, such as fields types, etc. Args : silent .. do not warn about duplicate column names =cut sub _next_header_line { my ($self,%args) = @_; my $line = $self->next_line(); if ( !defined $line ) { return undef; } if ( substr($line,0,2) ne '##' ) { $self->_unread_line($line); return undef; } my $rec = $self->parse_header_line($line); if ( $rec ) { $self->add_header_line($rec,%args); } return $rec; } =head2 get_header_line Usage : $vcf->get_header_line(key=>'INFO', ID=>'AC') $vcf->get_header_line(key=>'FILTER', ID=>'q10') $vcf->get_header_line(key=>'reference') $vcf->get_header_line(key=>'contig',ID=>'20') Args : Header line filter as in the example above Returns : List ref of header line hashes matching the filter =cut sub get_header_line { my ($self,%filter) = @_; my $key = $filter{key}; delete($filter{key}); my $id = $filter{ID}; my @out; while (my ($hline_key,$hline_hash) = each %{$$self{header}}) { if ( $key ne $hline_key ) { next; } if ( defined $id ) { if ( !exists($$hline_hash{$id}) ) { next; } $hline_hash = $$hline_hash{$id}; } my $match = 1; while (my ($fkey,$fval) = each %filter) { if ( !exists($$hline_hash{$fkey}) or $$hline_hash{$fkey} ne $fval ) { $match=0; last; } } if ( $match ) { push @out,$hline_hash } } 
return \@out; } =head2 add_header_line Usage : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}) $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'}) Args : Header line hash as in the example above Hash with additional parameters [optional] silent .. do not warn about existing header keys append .. append timestamp to the name of the new one Returns : =cut sub add_header_line { my ($self,$rec,%args) = @_; if ( !%args ) { $args{silent}=0; } my $key = $$rec{key}; if ( !$key ) { $self->throw("Missing key: ",Dumper($rec)); } if ( exists($$rec{Type}) ) { if ( !exists($$rec{default}) ) { my $type = $$rec{Type}; if ( exists($$self{defaults}{$type}) ) { $$rec{default}=$$self{defaults}{$type}; } else { $$rec{default}=$$self{defaults}{default}; } } if ( !exists($$rec{handler}) ) { my $type = $$rec{Type}; if ( !exists($$self{handlers}{$type}) ) { $self->warn("Unknown type [$type]\n"); $type = 'String'; $$rec{Type} = $type; } if ( exists($$self{handlers}{$type}) ) { $$rec{handler}=$$self{handlers}{$type}; } else { $self->throw("Unknown type [$type].\n"); } } } if ( exists($$rec{ID}) ) { my $id = $$rec{ID}; if ( exists($$self{header}{$key}{$id}) ) { $self->remove_header_line(%$rec); } $$self{header}{$key}{$id} = $rec; push @{$$self{header_lines}}, $rec; return; } if ( $args{append} ) { my @tm = gmtime(time); $key = sprintf "%s_%d%.2d%.2d", $key,$tm[5]+1900,$tm[4]+1,$tm[3]; my $i = 1; while ( exists($$self{header}{$key.'.'.$i}) ) { $i++; } $key = $key.'.'.$i; $$rec{key} = $key; } if ( $self->_header_line_exists($key,$rec) ) { $self->remove_header_line(%$rec); } push @{$$self{header}{$key}}, $rec; if ( $$rec{key} eq 'fileformat' ) { unshift @{$$self{header_lines}}, $rec; } else { push @{$$self{header_lines}}, $rec; } } sub _header_line_exists { my ($self,$key,$rec) = @_; if ( !exists($$self{header}{$key}) ) { return 0; } if ( $key eq 'fileformat' ) { return 1; } for my $hrec 
(@{$$self{header}{$key}}) { my $differ = 0; for my $item (keys %$rec) { if ( !exists($$hrec{$item}) ) { $differ=1; last; } if ( $$hrec{$item} ne $$rec{$item} ) { $differ=1; last; } } if ( !$differ ) { return $hrec; } } return 0; } =head2 remove_header_line Usage : $vcf->remove_header_line(key=>'INFO', ID=>'AC') Args : Returns : =cut sub remove_header_line { my ($self,%args) = @_; my $key = $args{key}; my %to_be_removed; for (my $i=0; $i<@{$$self{header_lines}}; $i++) { my $line = $$self{header_lines}[$i]; if ( $$line{key} ne $key ) { next; } if ( exists($args{ID}) ) { if ( $args{ID} ne $$line{ID} ) { next; } delete($$self{header}{$key}{$args{ID}}); splice(@{$$self{header_lines}},$i--,1); } elsif ( scalar keys %args==1 && exists($$self{header}{$key}) ) { splice(@{$$self{header_lines}},$i--,1); $to_be_removed{$key} = 1; } else { my $to_be_removed = $self->_header_line_exists($key,\%args); if ( !$to_be_removed ) { next; } for (my $j=0; $j<@{$$self{header}{$key}}; $j++) { if ( $$self{header}{$key}[$j] eq $to_be_removed ) { splice(@{$$self{header}{$key}},$j,1); last; } } splice(@{$$self{header_lines}},$i--,1); } } for my $key (keys %to_be_removed) { delete($$self{header}{$key}); } } =head2 parse_header_line Usage : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36]) $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"]) Args : Returns : =cut sub parse_header_line { my ($self,$line) = @_; chomp($line); $line =~ s/^##//; if ( !($line=~/^([^=]+)=/) ) { return { key=>$line, value=>'' }; } my $key = $1; my $value = $'; my $desc; if ( $value=~/,\s*\"([^\"]+)\"\s*$/ ) { $desc=$1; $value=$`; } if ( !$desc ) { return { key=>$key, value=>$value }; } if ( $key eq 'INFO' or $key eq 'FORMAT' ) { my ($id,$number,$type,@rest) = split(/,\s*/,$value); if ( !$type or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); } return { key=>$key, ID=>$id, Number=>$number, Type=>$type, Description=>$desc }; } if ( $key eq 'FILTER' ) { my 
($id,@rest) = split(/,\s*/,$value); if ( !$id or scalar @rest ) { $self->throw("Could not parse the header line: $line\n"); } return { key=>$key, ID=>$id, Description=>$desc }; } $self->throw("Could not parse the header line: $line\n"); } =head2 _read_column_names About : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index. The indexes go from 1. Usage : $vcf->_read_column_names(); Args : none =cut sub _read_column_names { my ($self) = @_; my $line = $self->next_line(); if ( !defined $line or substr($line,0,1) ne '#' ) { $self->throw("Broken VCF header, no column names?"); } $$self{column_line} = $line; my @cols = split(/\t/, substr($line,1)); chomp($cols[-1]); my $nremoved = 0; for (my $i=0; $i<@cols; $i++) { if ( !($cols[$i]=~/^\s*$/) ) { next; } $self->warn(sprintf "Empty fields in the header line, the column %d is empty, removing.\n",$i+1+$nremoved); $nremoved++; splice(@cols,$i,1); } my $ncols = scalar @cols; if ( $ncols == 1 ) { # If there is only one name, it can be space-separated instead of tab separated @cols = split(/\s+/, $cols[0]); $ncols = scalar @cols; chomp($line); if ( $ncols <= 1 ) { $self->warn("Could not parse the column names. [$line]\n"); return; } $self->warn("The column names not tab-separated? 
[$line]\n"); } my $fields = $$self{mandatory}; my $nfields = scalar @$fields; # Check the names of the mandatory columns if ( $ncols < $nfields ) { chomp($line); $self->warn("Missing some of the mandatory column names.\n\tGot: $line\n\tExpected: #", join("\t",@{$$self{mandatory}}),"\n"); return; } for (my $i=0; $i<$ncols; $i++) { if ( $cols[$i]=~/^\s+/ or $cols[$i]=~/\s+$/ ) { $self->warn("The column name contains leading/trailing spaces, removing: '$cols[$i]'\n"); $cols[$i] =~ s/^\s+//; $cols[$i] =~ s/\s+$//; } if ( $i<$nfields && $cols[$i] ne $$fields[$i] ) { $self->warn("Expected mandatory column [$$fields[$i]], got [$cols[$i]]\n"); $cols[$i] = $$fields[$i]; } $$self{has_column}{$cols[$i]} = $i+1; } $$self{columns} = \@cols; return; } =head2 _fake_column_names About : When no header is present, fake column names as the default mandatory ones + numbers Args : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present =cut sub _fake_column_names { my ($self,$ncols) = @_; $$self{columns} = [ @{$$self{mandatory}} ]; if ( $ncols>=0 ) { push @{$$self{columns}}, 'FORMAT'; } for (my $i=1; $i<=$ncols; $i++) { push @{$$self{columns}}, $i; } } =head2 format_header About : Returns the header. 
Usage : print $vcf->format_header(); Args : The columns to include on output [optional] =cut sub format_header { my ($self,$columns) = @_; my $out = ''; for my $line (@{$$self{header_lines}}) { $out .= $self->format_header_line($line); } # This is required when using the API for writing new VCF files and the caller does not add the line explicitly if ( !exists($$self{header_lines}[0]{key}) or $$self{header_lines}[0]{key} ne 'fileformat' ) { $out = "##fileformat=VCFv$$self{version}\n" .$out; } if ( !$$self{columns} ) { return $out; } my @out_cols; if ( $columns ) { @out_cols = @{$$self{columns}}[0..8]; for my $col (@$columns) { if ( exists($$self{has_column}{$col}) ) { push @out_cols, $col; } } } else { @out_cols = @{$$self{columns}}; } $out .= "#". join("\t", @out_cols). "\n"; return $out; } =head2 format_line About : Returns the header. Usage : $x = $vcf->next_data_hash(); print $vcf->format_line($x); $x = $vcf->next_data_array(); print $vcf->format_line($x); Args 1 : The columns or hash in the format returned by next_data_hash or next_data_array. 2 : The columns to include [optional] =cut sub format_line { my ($self,$record,$columns) = @_; if ( ref($record) eq 'HASH' ) { return $self->_format_line_hash($record,$columns); } if ( ref($record) eq 'ARRAY' ) { return join("\t",@$record)."\n"; } $self->throw("FIXME: todo .. " .ref($record). "\n"); } =head2 recalc_ac_an About : Control if the AC and AN values should be updated. Usage : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x); Args 1 : 0 .. never recalculate 1 .. recalculate if present 2 .. 
recalculate if present and add if missing

=cut

# Setter for the AC/AN recalculation mode: 0..leave as is, 1..recalculate if
# present, 2..recalculate if present and add if missing.
sub recalc_ac_an
{
    my ($self,$value) = @_;
    if ( $value eq '0' || $value eq '1' || $value eq '2' ) { $$self{recalc_ac_an} = $value; }
    return;
}

=head2 get_tag_index

    Usage   : my $idx = $vcf->get_tag_index('GT:PL:DP:SP:GQ','PL',':');
    Arg 1   : Field
        2   : The tag to find
        3   : Tag separator (defaults to ':')
    Returns : Index of the tag or -1 when not found

=cut

sub get_tag_index
{
    my ($self,$field,$tag,$sep) = @_;
    if ( !defined $field ) { return -1; }
    # Fix: honour the documented separator argument instead of a hardcoded ':';
    # default to ':' for backward compatibility with callers passing undef.
    if ( !defined $sep ) { $sep = ':'; }
    my $idx = 0;
    my $prev_isep = 0;
    my $isep = 0;
    while (1)
    {
        $isep = index($field,$sep,$prev_isep);
        if ( $isep==-1 )
        {
            # Last field: compare the trailing chunk
            if ( substr($field,$prev_isep) eq $tag ) { return $idx; }
            else { return -1; }
        }
        if ( substr($field,$prev_isep,$isep-$prev_isep) eq $tag ) { return $idx; }
        $prev_isep = $isep+1;
        $idx++;
    }
}

=head2 remove_field

    Usage   : my $field = $vcf->remove_field('GT:PL:DP:SP:GQ',1,':');    # returns 'GT:DP:SP:GQ'
    Arg 1   : Field
        2   : The index of the field to remove
        3   : Field separator
    Returns : Modified string

=cut

sub remove_field
{
    my ($self,$string,$idx,$sep) = @_;
    my $isep = -1;
    my $prev_isep = 0;
    my $itag = 0;
    # Walk to the separator preceding the field to be removed
    while ($itag!=$idx)
    {
        $isep = index($string,$sep,$prev_isep);
        # The index may be out of range, VCFv4.1 allows omitting empty fields
        if ( $isep==-1 ) { return $string; }
        $prev_isep = $isep+1;
        $itag++;
    }
    my $out;
    if ( $isep>=0 ) { $out = substr($string,0,$isep); }
    my $ito=index($string,$sep,$isep+1);
    if ( $ito!=-1 )
    {
        # Fix: join with the caller-supplied separator, not a hardcoded ':'
        if ( $isep>=0 ) { $out .= $sep }
        $out .= substr($string,$ito+1);
    }
    # Removing the only field leaves the missing-value placeholder
    if ( !defined $out ) { return '.'; }
    return $out;
}

=head2 replace_field

    Usage   : my $col = $vcf->replace_field('GT:PL:DP:SP:GQ','XX',1,':');    # returns 'GT:XX:DP:SP:GQ'
    Arg 1   : Field
        2   : Replacement
        3   : 0-based index of the field to replace
        4   : Field separator
    Returns : Modified string

=cut

sub replace_field
{
    my ($self,$string,$repl,$idx,$sep) = @_;
    my $isep = -1;
    my $prev_isep = 0;
    my $itag = 0;
    while ($itag!=$idx)
    {
        $isep = index($string,$sep,$prev_isep);
        if ( $isep==-1 )
        {
            # the out of range index may be OK, VCFv4.1 allows omitting empty fields
            if ( $$self{version}<4.1 ) { $self->throw("The index out of range ($string,$repl,$idx,$sep), missing fields not supported in VCFv$$self{version}."); }
            # Fix: pad with the caller-supplied separator, not a hardcoded ':'
            while ( $itag<$idx )
            {
                $string .= $sep;
                $itag++;
            }
            $string .= $repl;
            return $string;
        }
        $prev_isep = $isep+1;
        $itag++;
    }
    my $out;
    if ( $isep>=0 ) { $out = substr($string,0,$isep+1); }
    my $ito = index($string,$sep,$isep+1);
    if ( $ito==-1 )
    {
        $out .= $repl;
    }
    else
    {
        $out .= $repl;
        # Fix: join with the caller-supplied separator, not a hardcoded ':'
        $out .= $sep;
        $out .= substr($string,$ito+1);
    }
    if ( !defined $out ) { return '.'; }
    return $out;
}

=head2 get_info_field

    Usage   : my $line = $vcf->next_line;
              my @items = split(/\t/,$line);
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','AF');    # returns 0.5
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','DB');    # returns 1
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','XY');    # returns undef
    Arg 1   : The VCF line broken into an array
        2   : The tag to retrieve
    Returns : undef when tag is not present, the tag value if present, or 1 if flag is present

=cut

sub get_info_field
{
    my ($self,$info,$tag) = @_;
    my $info_len = length($info);
    my $tag_len = length($tag);
    my $idx = 0;
    while (1)
    {
        $idx = index($info,$tag,$idx);
        if ( $idx==-1 ) { return undef; }
        # Reject matches that are only a suffix of a longer key (e.g. 'AF' in 'XAF')
        if ( $idx!=0 && substr($info,$idx-1,1) ne ';' ) { $idx += $tag_len; next; }
        # Tag at the very end of INFO: a flag
        if ( $tag_len+$idx >= $info_len ) { return 1; }
        my $follows = substr($info,$idx+$tag_len,1);
        # Tag followed by ';': a flag
        if ( $follows eq ';' ) { return 1; }
        $idx += $tag_len;
        # Matched only a prefix of a longer key (e.g. 'AF' in 'AF1='), keep searching
        if ( $follows ne '=' ) { next; }
        $idx++;
        my $to = index($info,';',$idx);
        return $to==-1 ? substr($info,$idx) : substr($info,$idx,$to-$idx);
    }
}

=head2 get_field

    Usage   : my $line = $vcf->next_line;
              my @items = split(/\t/,$line);
              my $idx = $vcf->get_tag_index($$line[8],'PL',':');
              my $pl  = $vcf->get_field($$line[9],$idx) unless $idx==-1;
    Arg 1   : The VCF line broken into an array
        2   : The index of the field to retrieve
        3   : The delimiter [Default is ':']
    Returns : The tag value

=cut

sub get_field
{
    my ($self,$col,$idx,$delim) = @_;
    if ( !defined $delim ) { $delim=':'; }
    my $isep = 0;
    my $prev_isep = 0;
    my $itag = 0;
    while (1)
    {
        $isep = index($col,$delim,$prev_isep);
        if ( $itag==$idx ) { last; }
        # This is valid, missing fields can be omitted from genotype columns
        if ( $isep==-1 ) { return '.'; }
        $prev_isep = $isep+1;
        $itag++;
    }
    return $isep<0 ? substr($col,$prev_isep) : substr($col,$prev_isep,$isep-$prev_isep);
}

=head2 get_sample_field

    Usage   : my $line = $vcf->next_line;
              my @items = split(/\t/,$line);
              my $idx = $vcf->get_tag_index($$line[8],'PL',':');
              my $pls = $vcf->get_sample_field(\@items,$idx) unless $idx==-1;
    Arg 1   : The VCF line broken into an array
        2   : The index of the field to retrieve
    Returns : Array of values

=cut

sub get_sample_field
{
    my ($self,$cols,$idx) = @_;
    my @out;
    my $n = @$cols;
    # Sample columns start at index 9 (after the 8 mandatory columns and FORMAT)
    for (my $icol=9; $icol<$n; $icol++)
    {
        my $col = $$cols[$icol];
        my $isep = 0;
        my $prev_isep = 0;
        my $itag = 0;
        while (1)
        {
            $isep = index($col,':',$prev_isep);
            if ( $itag==$idx ) { last; }
            if ( $isep==-1 ) { $self->throw("The index out of range: $col:$isep .. $idx"); }
            $prev_isep = $isep+1;
            $itag++;
        }
        my $val = $isep<0 ? substr($col,$prev_isep) : substr($col,$prev_isep,$isep-$prev_isep);
        push @out,$val;
    }
    return \@out;
}

=head2 split_mandatory

    About   : Faster alternative to regexs, extract the mandatory columns
    Usage   : my $line=$vcf->next_line;
              my @cols = $vcf->split_mandatory($line);
    Arg     :
    Returns : Pointer to the array of values

=cut

sub split_mandatory
{
    my ($self,$line) = @_;
    my @out;
    my $prev = 0;
    # The first seven columns: CHROM POS ID REF ALT QUAL FILTER
    for (my $i=0; $i<7; $i++)
    {
        my $isep = index($line,"\t",$prev);
        if ( $isep==-1 ) { $self->throw("Could not parse the mandatory columns: $line"); }
        push @out, substr($line,$prev,$isep-$prev);
        $prev = $isep+1;
    }
    # The eighth column (INFO), up to the next tab or the end of the line
    my $isep = index($line,"\t",$prev);
    if ( $isep!=-1 )
    {
        # Fix: the length was $isep-$prev-1, an off-by-one which silently
        # dropped the last character of the INFO column.
        push @out, substr($line,$prev,$isep-$prev);
    }
    else
    {
        push @out, substr($line,$prev);
    }
    return \@out;
}

=head2 split_gt

    About   : Faster alternative to regexs
    Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0/1'); # returns (0,0,1)
    Arg     : Diploid genotype to split into alleles
    Returns : Array of values

=cut

sub split_gt
{
    my ($self,$gt) = @_;
    my @als;
    my $iprev = 0;
    while (1)
    {
        # The next separator is whichever of '/' and '|' comes first
        my $isep = index($gt,'/',$iprev);
        my $jsep = index($gt,'|',$iprev);
        if ( $isep<0 or ($jsep>=0 && $jsep<$isep) ) { $isep = $jsep; }
        push @als, $isep<0 ? substr($gt,$iprev) : substr($gt,$iprev,$isep-$iprev);
        if ( $isep<0 ) { return (@als); }
        $iprev = $isep+1;
    }
    return (@als);
}

=head2 split_by

    About   : Generalization of split_gt
    Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0|1',qw(| /)); # returns (0,0,1)
    Arg     : Diploid genotype to split into alleles
    Returns : Array of values

=cut

sub split_by
{
    my ($self,$str,@seps) = @_;
    my @out;
    my $iprev = 0;
    while (1)
    {
        # Find the closest of all allowed separators
        my $min;
        for my $sep (@seps)
        {
            my $idx = index($str,$sep,$iprev);
            if ( $idx==-1 ) { next; }
            if ( !defined $min or $idx<$min ) { $min=$idx }
        }
        push @out, defined $min ? substr($str,$iprev,$min-$iprev) : substr($str,$iprev);
        if ( !defined $min ) { return @out; }
        $iprev = $min+1;
    }
    return (@out);
}

=head2 decode_genotype

    About   : Faster alternative to regexs
    Usage   : my $gt = $vcf->decode_genotype('G',['A','C'],'0/0'); # returns 'G/G'
    Arg 1   : Ref allele
        2   : Alt alleles
        3   : The genotype to decode
    Returns : Decoded GT string

=cut

sub decode_genotype
{
    my ($self,$ref,$alt,$gt) = @_;
    my $isep = 0;
    my $out;
    while (1)
    {
        my $i = index($gt,'/',$isep);
        my $j = index($gt,'|',$isep);
        if ( $i==-1 && $j==-1 )
        {
            # Last (or only) allele index
            my $idx = substr($gt,$isep);
            if ( $idx eq '.' )
            {
                $out .= $idx;
            }
            else
            {
                if ( $idx>@$alt ) { $self->throw("The genotype index $idx in $gt is out of bounds: ", join(',',@$alt)); }
                $out .= $idx==0 ? $ref : $$alt[$idx-1];
            }
            return $out;
        }
        # Use whichever separator comes first
        if ( $i!=-1 && $j!=-1 && $i>$j ) { $i=$j; }
        elsif ( $i==-1 ) { $i=$j }
        my $idx = substr($gt,$isep,$i-$isep);
        if ( $idx eq '.' )
        {
            $out .= $idx;
        }
        else
        {
            if ( $idx>@$alt ) { $self->throw("The genotype index $idx in $gt out of bounds: ", join(',',@$alt)); }
            $out .= $idx==0 ? $ref : $$alt[$idx-1];
        }
        # Keep the original separator ('/' or '|')
        $out .= substr($gt,$i,1);
        $isep = $i+1;
    }
}

# Serialize a record hash (as produced by next_data_hash) back into a VCF line.
# Optionally restricts the output to a subset of sample columns.
sub _format_line_hash
{
    my ($self,$record,$columns) = @_;

    if ( !$$self{columns} )
    {
        # No column names seen yet (e.g. header-less input) - fake them
        my $ngtypes = scalar keys %{$$record{gtypes}};
        if ( !$ngtypes && !exists($$record{FORMAT}) ) { $ngtypes--; }
        $self->_fake_column_names($ngtypes);
    }
    my $cols = $$self{columns};

    # CHROM  POS  ID  REF
    my $out;
    $out .= $$record{CHROM} . "\t";
    $out .= $$record{POS} . "\t";
    $out .= (defined $$record{ID} ? $$record{ID} : '.') . "\t";
    $out .= $$record{REF} . "\t";

    # ALT
    $out .= join(',',@{$$record{ALT}} ? @{$$record{ALT}} : '.');

    # QUAL
    $out .= "\t". $$record{QUAL};

    # FILTER
    $out .= "\t". join(';',$$record{FILTER} ? @{$$record{FILTER}} : '.');

    # Collect the gtypes of interest
    my $gtypes;
    if ( $columns )
    {
        # Select only those gtypes keys with a corresponding key in columns.
        for my $col (@$columns) { $$gtypes{$col} = $$record{gtypes}{$col}; }
    }
    else
    {
        $gtypes = $$record{gtypes};
    }

    # INFO
    # .. calculate NS, AN and AC, but only if recalc_ac_an is set
    my $needs_an_ac = $$self{recalc_ac_an}==2 ? 1 : 0;
    my @info;
    while (my ($key,$value) = each %{$$record{INFO}})
    {
        if ( $$self{recalc_ac_an}>0 )
        {
            if ( $key eq 'AN' ) { $needs_an_ac=1; next; }
            if ( $key eq 'AC' ) { $needs_an_ac=1; next; }
        }
        if ( defined $value )
        {
            push @info, "$key=$value";
        }
        elsif ( $key ne '.' )
        {
            push @info, $key;
        }
    }
    if ( $needs_an_ac )
    {
        my $nalt = scalar @{$$record{ALT}};
        if ( $nalt==1 && $$record{ALT}[0] eq '.' ) { $nalt=0; }
        my ($an,$ac) = $self->calc_an_ac($gtypes,$nalt);
        push @info, "AN=$an","AC=$ac";
    }
    if ( !@info ) { push @info, '.'; }
    $out .= "\t". join(';', sort @info);

    # FORMAT, the column is not required, it may not be present when there are no genotypes
    if ( exists($$cols[8]) && defined $$record{FORMAT} )
    {
        $out .= "\t". join(':',@{$$record{FORMAT}});
    }

    # Genotypes: output all columns or only a selection?
    my @col_names = $columns ? @$columns : @$cols[9..@$cols-1];
    my $nformat = defined $$record{FORMAT} ? @{$$record{FORMAT}} : 0;
    for my $col (@col_names)
    {
        my $gt = $$gtypes{$col};
        my $can_drop = $$self{drop_trailings};
        my @gtype;
        # Walk the FORMAT tags backwards so trailing missing fields can be dropped
        for (my $i=$nformat-1; $i>=0; $i--)
        {
            my $field = $$record{FORMAT}[$i];
            if ( $i==0 ) { $can_drop=0; }

            if ( exists($$gt{$field}) )
            {
                $can_drop = 0;
                if ( ref($$gt{$field}) eq 'HASH' )
                {
                    # Special treatment for Number=[AG] tags
                    unshift @gtype, $self->format_AGtag($record,$col,$$gt{$field},$field);
                }
                else
                {
                    unshift @gtype,$$gt{$field};
                }
            }
            elsif ( $can_drop ) { next; }
            elsif ( exists($$self{header}{FORMAT}{$field}{default}) )
            {
                unshift @gtype,$$self{header}{FORMAT}{$field}{default};
                $can_drop=0;
            }
            else { $self->throw(qq[No value for the field "$field" and no default available, column "$col" at $$record{CHROM}:$$record{POS}.\n]); }
        }
        $out .= "\t" . join(':',@gtype);
    }
    $out .= "\n";
    return $out;
}

# Count AN (called alleles) and AC (non-ref allele counts) from the genotype
# columns. NOTE(review): only the first two alleles of each GT are counted —
# assumes ploidy <= 2; confirm against callers before relying on this for
# polyploid data.
sub calc_an_ac
{
    my ($self,$gtypes,$nalleles) = @_;
    my $sep_re = $$self{regex_gtsep};
    my ($an,%ac_counts);
    if ( defined $nalleles )
    {
        # Make sure zero counts are reported for all ALT alleles
        for (my $i=1; $i<=$nalleles; $i++) { $ac_counts{$i}=0; }
    }
    $an = 0;
    for my $gt (keys %$gtypes)
    {
        my $value = $$gtypes{$gt}{GT};
        if ( !defined $value ) { next; }    # GT may not be present
        my ($al1,$al2) = split($sep_re,$value);
        if ( defined($al1) && $al1 ne '.' )
        {
            $an++;
            if ( $al1 ne '0' ) { $ac_counts{$al1}++; }
        }
        if ( defined($al2) && $al2 ne '.' )
        {
            $an++;
            if ( $al2 ne '0' ) { $ac_counts{$al2}++; }
        }
    }
    my @ac;
    for my $ac ( sort { $a <=> $b } keys %ac_counts) { push @ac, $ac_counts{$ac}; }
    if ( !@ac ) { @ac = ('0'); }
    return ($an,join(',',@ac),\@ac);
}

# Version-specific part of ALT validation: check uniqueness and that REF is
# not repeated among the ALTs.
sub _validate_alt_field
{
    my ($self,$values,$ref) = @_;
    for (my $i=0; $i<@$values; $i++)
    {
        for (my $j=0; $j<$i; $j++)
        {
            if ( $$values[$i] eq $$values[$j] ) { return "The alleles not unique: $$values[$i]"; }
        }
        if ( $$values[$i] eq $ref ) { return "REF allele listed in the ALT field??"; }
    }
    return undef;
}

=head2 validate_alt_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
    Args    : The ALT arrayref
    Returns : Error message in case of an error.

=cut

sub validate_alt_field
{
    my ($self,$values,$ref) = @_;

    if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }

    my $ret = $self->_validate_alt_field($values,$ref);
    if ( $ret ) { return $ret; }

    # Old-style (pre-v4.0) allele encoding: SNP base, I<seq> insertion, D<len> deletion
    my @err;
    for my $item (@$values)
    {
        if ( $item=~/^[ACTGN]$/ ) { next; }
        elsif ( $item=~/^I[ACTGN]+$/ ) { next; }
        elsif ( $item=~/^D\d+$/ ) { next; }

        push @err, $item;
    }
    if ( !@err ) { return undef; }
    return 'Could not parse the allele(s) [' .join(',',@err). ']';
}

=head2 event_type

    Usage   :   my $x = $vcf->next_data_hash();
                my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
                for my $allele (@$alleles)
                {
                    my ($type,$len,$ht) = $vcf->event_type($x,$allele);
                }
              or
                my ($type,$len,$ht) = $vcf->event_type($ref,$al);
    Args    : VCF data line parsed by next_data_hash or the reference allele
            : Allele
    Returns :   's' for SNP and number of SNPs in the record
                'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
                'r' identical to the reference, length 0
                'o' for other (complex events) and the number of affected bases
                'b' breakend
                'u' unknown

=cut

sub event_type
{
    my ($self,$rec,$allele) = @_;

    my $ref = $rec;
    if ( ref($rec) eq 'HASH' )
    {
        # Results are cached on the record
        if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
        $ref = $$rec{REF};
    }

    my ($type,$len,$ht);
    if ( $allele eq $ref or $allele eq '.' ) { $len=0; $type='r'; $ht=$ref; }
    elsif ( $allele=~/^[ACGT]$/ ) { $len=1; $type='s'; $ht=$allele; }
    elsif ( $allele=~/^I/ ) { $len=length($allele)-1; $type='i'; $ht=$'; }
    elsif ( $allele=~/^D(\d+)/ ) { $len=-$1; $type='i'; $ht=''; }
    else
    {
        my $chr = ref($rec) eq 'HASH' ? $$rec{CHROM} : 'undef';
        my $pos = ref($rec) eq 'HASH' ? $$rec{POS} : 'undef';
        $self->throw("Eh?: $chr:$pos .. $ref $allele\n");
    }
    if ( ref($rec) eq 'HASH' )
    {
        $$rec{_cached_events}{$allele} = [$type,$len,$ht];
    }
    return ($type,$len,$ht);
}

=head2 has_AGtags

    About   : Checks the header for the presence of tags with variable number of fields (Number=A or Number=G, such as GL)
    Usage   : $vcf->parse_header(); my $agtags = $vcf->has_AGtags();
    Args    : None
    Returns : Hash {fmtA=>[tags],fmtG=>[tags],infoA=>[tags],infoG=>[tags]} or undef if none is present

=cut

sub has_AGtags
{
    my ($self) = @_;
    my $out;
    if ( exists($$self{header}{FORMAT}) )
    {
        for my $tag (keys %{$$self{header}{FORMAT}})
        {
            if ( $$self{header}{FORMAT}{$tag}{Number} eq 'A' ) { push @{$$out{fmtA}},$tag; }
            if ( $$self{header}{FORMAT}{$tag}{Number} eq 'G' ) { push @{$$out{fmtG}},$tag; }
        }
    }
    if ( exists($$self{header}{INFO}) )
    {
        for my $tag (keys %{$$self{header}{INFO}})
        {
            if ( $$self{header}{INFO}{$tag}{Number} eq 'A' ) { push @{$$out{infoA}},$tag; }
            if ( $$self{header}{INFO}{$tag}{Number} eq 'G' ) { push @{$$out{infoG}},$tag; }
        }
    }
    if ( defined $out )
    {
        # Make sure all four keys are present, even if empty
        for my $key (qw(fmtA fmtG infoA infoG))
        {
            if ( !exists($$out{$key}) ) { $$out{$key}=[] }
        }
    }
    return $out;
}

=head2 parse_AGtags

    About   : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
    Usage   : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
    Args    : VCF data line parsed by next_data_hash
            : Mapping between ALT representations based on different REFs [optional]
            : New REF [optional]
    Returns : Hash {Allele=>Value}

=cut

sub parse_AGtags
{
    my ($self,$rec,$ref_alt_map,$new_ref) = @_;

    if ( !exists($$rec{gtypes}) ) { return; }

    my (@atags,@gtags);
    for my $fmt (@{$$rec{FORMAT}})
    {
        # These have been listed explicitly for proper merging of v4.0 VCFs
        if ( $$self{fix_v40_AGtags} )
        {
            if ( $fmt eq 'GL' or $fmt eq 'PL' ) { push @gtags,$fmt; next; }
            if ( $fmt eq 'AC' or $fmt eq 'AF' ) { push @atags,$fmt; next; }
        }
        if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
        if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { push @atags,$fmt; next; }
        if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { push @gtags,$fmt; next; }
    }
    my $missing = $$self{defaults}{default};
    if ( @atags )
    {
        # Parse Number=A tags
        my $alts;
        if ( defined $ref_alt_map )
        {
            # Translate the ALT alleles to the new REF's representation
            $alts = [];
            for my $alt (@{$$rec{ALT}})
            {
                if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: $new_ref $alt...?\n"); }
                push @$alts, $$ref_alt_map{$new_ref}{$alt};
            }
        }
        else
        {
            $alts = $$rec{ALT};
        }
        for my $tag (@atags)
        {
            for my $sample (values %{$$rec{gtypes}})
            {
                if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
                my @values = split(/,/,$$sample{$tag});
                $$sample{$tag} = {};
                for (my $i=0; $i<@values; $i++)
                {
                    $$sample{$tag}{$$alts[$i]} = $values[$i];
                }
            }
        }
    }
    if ( @gtags )
    {
        # Parse Number=G tags
        my @alleles;
        if ( defined $ref_alt_map )
        {
            push @alleles, $new_ref;
            for my $alt (@{$$rec{ALT}})
            {
                if ( !exists($$ref_alt_map{$new_ref}{$alt}) ) { $self->throw("FIXME: [$new_ref] [$alt]...?\n", Dumper($ref_alt_map,$rec)); }
                push @alleles, $$ref_alt_map{$new_ref}{$alt};
            }
        }
        else
        {
            @alleles = ($$rec{REF},@{$$rec{ALT}});
            if ( @alleles==2 && $alleles[1] eq '.' ) { pop(@alleles); }
        }
        # Pre-compute genotype orderings: $gtypes[0] for haploid, $gtypes[1] for diploid
        my @gtypes;
        for (my $i=0; $i<@alleles; $i++)
        {
            for (my $j=0; $j<=$i; $j++)
            {
                push @{$gtypes[1]}, $alleles[$i].'/'.$alleles[$j];
            }
            push @{$gtypes[0]}, $alleles[$i];
        }
        for my $tag (@gtags)
        {
            for my $name (keys %{$$rec{gtypes}})
            {
                my $sample = $$rec{gtypes}{$name};
                if ( !exists($$sample{$tag}) or $$sample{$tag} eq $missing ) { next; }
                my @values = split(/,/,$$sample{$tag});
                my $ploidy = $self->guess_ploidy(scalar @alleles, scalar @values) - 1;
                if ( $ploidy>1 ) { $self->throw("Sorry, not ready for ploidy bigger than 2\n"); }
                if ( $ploidy!=1 ) { $$rec{_cached_ploidy}{$name} = $ploidy; }
                $$sample{$tag} = {};
                for (my $i=0; $i<@values; $i++)
                {
                    $$sample{$tag}{$gtypes[$ploidy][$i]} = $values[$i];
                }
            }
        }
    }
}

=head2 format_AGtag

    About   : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
    Usage   :
    Args    :
            :
            :
    Returns :

=cut

sub format_AGtag
{
    my ($self,$record,$sample,$tag_data,$tag) = @_;

    # The FORMAT field is checked only once and the results are cached.
    if ( !exists($$record{_atags}) )
    {
        $$record{_atags} = {};

        # Check if there are any A,G tags
        for my $fmt (@{$$record{FORMAT}})
        {
            # These have been listed explicitly for proper merging of v4.0 VCFs
            if ( $$self{fix_v40_AGtags} )
            {
                if ( $fmt eq 'GL' or $fmt eq 'PL' ) { $$record{_gtags}{$fmt}=1; next; }
                if ( $fmt eq 'AC' or $fmt eq 'AF' ) { $$record{_atags}{$fmt}=1; next; }
            }
            if ( !exists($$self{header}{FORMAT}{$fmt}) ) { next; }
            if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'A' ) { $$record{_atags}{$fmt}=1; next; }
            if ( $$self{header}{FORMAT}{$fmt}{Number} eq 'G' ) { $$record{_gtags}{$fmt}=1; next; }
        }
    }

    my @out;
    if ( exists($$record{_atags}{$tag}) )
    {
        # Number=A: one value per ALT allele, in ALT order
        for my $alt (@{$$record{ALT}})
        {
            push @out, exists($$tag_data{$alt}) ? $$tag_data{$alt} : $$self{defaults}{default};
        }
    }
    if ( exists($$record{_gtags}{$tag}) )
    {
        # Number=G: one value per possible genotype; cache both allele orderings
        # (i/j and j/i) so either representation of a het genotype is found.
        my $gtypes  = $$record{_gtypes};
        my $gtypes2 = $$record{_gtypes2};
        if ( !defined $gtypes )
        {
            $gtypes  = [];
            $gtypes2 = [];
            my @alleles = ( $$record{REF}, @{$$record{ALT}} );
            for (my $i=0; $i<@alleles; $i++)
            {
                for (my $j=0; $j<=$i; $j++)
                {
                    push @{$$gtypes[1]}, $alleles[$i].'/'.$alleles[$j];
                    push @{$$gtypes2[1]}, $alleles[$j].'/'.$alleles[$i];
                }
                push @{$$gtypes[0]}, $alleles[$i];
            }
            $$record{_gtypes}  = $gtypes;
            $$record{_gtypes2} = $gtypes2;
        }
        my $ploidy = exists($$record{_cached_ploidy}{$sample}) ? $$record{_cached_ploidy}{$sample} : 1;
        for (my $i=0; $i<@{$$gtypes[$ploidy]}; $i++)
        {
            my $gt = $$gtypes[$ploidy][$i];
            if ( !exists($$tag_data{$gt}) ) { $gt = $$gtypes2[$ploidy][$i]; }
            push @out, exists($$tag_data{$gt}) ? $$tag_data{$gt} : $$self{defaults}{default};
        }
    }
    return join(',',@out);
}

=head2 parse_alleles

    About   : Deprecated, use parse_haplotype instead.
    Usage   : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
    Args    : VCF data line parsed by next_data_hash
            : The genotype column name
    Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.

=cut

sub parse_alleles
{
    my ($self,$rec,$column) = @_;
    if ( !exists($$rec{gtypes}) || !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }

    my $gtype = $$rec{gtypes}{$column}{GT};
    if ( !($gtype=~$$self{regex_gt}) ) { $self->throw("Could not parse gtype string [$gtype] [$$rec{CHROM}:$$rec{POS}]\n"); }
    my $al1 = $1;
    my $sep = $2;
    my $al2 = $3;

    if ( !$al1 ) { $al1 = $$rec{REF}; }
    elsif ( $al1 ne '.' )
    {
        if ( !($al1=~/^\d+$/) ) { $self->throw("Uh, what is this? [$al1] $$rec{CHROM}:$$rec{POS}\n"); }
        $al1 = $$rec{ALT}[$al1-1];
    }

    if ( !defined $al2 or $al2 eq '' )
    {
        $sep = '';
        $al2 = '';
    }
    else
    {
        if ( !$al2 ) { $al2 = $$rec{REF}; }
        elsif ( $al2 ne '.' ) { $al2 = $$rec{ALT}[$al2-1]; }
    }
    return ($al1,$sep,$al2);
}

=head2 parse_haplotype

    About   : Similar to parse_alleles, supports also multiploid VCFs.
    Usage   : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
    Args    : VCF data line parsed by next_data_hash
            : The genotype column name
    Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags.
              The values can be cashed and must be therefore considered read only!

=cut

sub parse_haplotype
{
    my ($self,$rec,$column) = @_;
    if ( !exists($$rec{gtypes}{$column}) ) { $self->throw("The column not present: '$column'\n"); }
    if ( !exists($$rec{gtypes}{$column}{GT}) ) { return (['.'],[],0,1); }
    my $gtype = $$rec{gtypes}{$column}{GT};
    # Results are cached per GT string on the record
    if ( exists($$rec{_cached_haplotypes}{$gtype}) ) { return (@{$$rec{_cached_haplotypes}{$gtype}}); }

    my @alleles   = ();
    my @seps      = ();
    my $is_phased = 0;
    my $is_empty  = 1;

    my $buf = $gtype;
    while ($buf ne '')
    {
        if ( !($buf=~m{^(\.|\d+)([|/]?)}) ) { $self->throw("Could not parse gtype string [$gtype] .. $$rec{CHROM}:$$rec{POS} $column\n"); }
        $buf = $';

        if ( $1 eq '.' ) { push @alleles,'.'; }
        else
        {
            $is_empty = 0;
            if ( $1 eq '0' ) { push @alleles,$$rec{REF}; }
            elsif ( exists($$rec{ALT}[$1-1]) ) { push @alleles,$$rec{ALT}[$1-1]; }
            else { $self->throw(qq[The haplotype indexes in "$gtype" do not match the ALT column .. $$rec{CHROM}:$$rec{POS} $column\n]); }
        }
        if ( $2 )
        {
            if ( $2 eq '|' ) { $is_phased=1; }
            push @seps,$2;
        }
    }
    $$rec{_cached_haplotypes}{$gtype} = [\@alleles,\@seps,$is_phased,$is_empty];
    return (@{$$rec{_cached_haplotypes}{$gtype}});
}

=head2 format_haplotype

    Usage   : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);

=cut

sub format_haplotype
{
    my ($self,$alleles,$seps) = @_;
    if ( @$alleles != @$seps+1 ) { $self->throw(sprintf("Uh: %d vs %d\n",scalar @$alleles,scalar @$seps),Dumper($alleles,$seps)); }
    my $out = $$alleles[0];
    for (my $i=1; $i<@$alleles; $i++)
    {
        $out .= $$seps[$i-1];
        $out .= $$alleles[$i];
    }
    return $out;
}

=head2 format_genotype_strings

    Usage   : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
              $vcf->format_genotype_strings($x);
              print $vcf->format_line($x);
    Args 1  : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
         2  : Optionally, a subset of columns can be supplied. See also format_line.
    Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref
              alleles numbers starting from 1. If the key $$vcf{trim_redundant_ALTs} is set, ALT alleles
              not appearing in any of the sample column will be removed.
=cut

# Renumber letter-coded sample genotypes (e.g. 'A/C') into numeric GT strings
# ('0/1') and rebuild the ALT list in first-seen order. See POD above.
sub format_genotype_strings
{
    my ($self,$rec,$columns) = @_;

    if ( !exists($$rec{gtypes}) ) { return; }

    my $ref = $$rec{REF};
    my $nalts = 0;
    # Maps allele string -> its 1-based ALT index, assigned in order of first use
    my %alts  = ();

    if ( !$columns ) { $columns = [keys %{$$rec{gtypes}}]; }

    for my $key (@$columns)
    {
        my $gtype = $$rec{gtypes}{$key}{GT};
        my $buf = $gtype;
        my $out = '';
        # Consume one allele (+ optional separator) per iteration
        while ($buf ne '')
        {
            $buf=~m{^([^/|]+)([/|]?)};
            $buf = $';

            my $al  = $1;
            my $sep = $2;
            if ( $al eq $ref or $al eq '0' or $al eq '*' ) { $al=0; }
            else
            {
                if ( $al=~/^\d+$/ )
                {
                    # Allele already numeric: translate via the current ALT list
                    if ( !exists($$rec{ALT}[$al-1]) ) { $self->throw("Broken ALT, index $al out of bounds\n"); }
                    $al = $$rec{ALT}[$al-1];
                }

                if ( exists($alts{$al}) ) { $al = $alts{$al} }
                elsif ( $al ne '.' )
                {
                    # New non-missing allele: assign the next ALT index
                    $alts{$al} = ++$nalts;
                    $al = $nalts;
                }
            }
            $out .= $al;
            if ( $sep ) { $out .= $sep; }
        }
        $$rec{gtypes}{$key}{GT} = $out;
    }

    # Unless trimming is requested, keep ALT alleles not seen in any sample
    if ( !$$self{trim_redundant_ALTs} && exists($$rec{ALT}) && @{$$rec{ALT}} )
    {
        for my $alt (@{$$rec{ALT}})
        {
            if ( !exists($alts{$alt}) ) { $alts{$alt} = ++$nalts; }
        }
    }
    $$rec{ALT} = [ sort { $alts{$a}<=>$alts{$b} } keys %alts ];
}

# Normalize a REF->ALT mapping in place so that every ALT maps to itself, and
# return the new REF. NOTE(review): the `$ref ne $new_ref` test is always false
# because $new_ref was just assigned from $ref — the prefix-agreement check it
# claims to perform can never fire; the original intent is unclear, confirm
# against upstream before relying on the error path.
sub fill_ref_alt_mapping
{
    my ($self,$map) = @_;

    my $new_ref;
    for my $ref (keys %$map)
    {
        $new_ref = $ref;
        if ( $ref ne $new_ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); }
        for my $alt (keys %{$$map{$ref}})
        {
            $$map{$ref}{$alt} = $alt;
        }
    }
    $$map{$new_ref}{$new_ref} = $new_ref;
    return $new_ref;
}

=head2 format_header_line

    Usage   : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
    Args    :
    Returns :

=cut

# Serialize a structured header record into a '##key=...' line. This is the
# old-style (pre-v4.0) comma-separated form without the <ID=...> wrapper;
# version-specific subclasses presumably override it — TODO confirm.
sub format_header_line
{
    my ($self,$rec) = @_;
    my $line = "##$$rec{key}";
    $line .= "=$$rec{value}" unless !exists($$rec{value});
    $line .= "=$$rec{ID}" unless !exists($$rec{ID});
    $line .= ",$$rec{Number}" unless !exists($$rec{Number});
    $line .= ",$$rec{Type}" unless !exists($$rec{Type});
    $line .= qq[,"$$rec{Description}"] unless !exists($$rec{Description});
    $line .= "\n";
    return $line;
}

=head2 remove_columns

    Usage   : my $rec=$vcf->next_data_hash();
$vcf->remove_columns($rec,remove=>['NA001','NA0002']);
    Args    : VCF hash pointer
            : list of columns to remove or a lookup hash with column names to keep (remove=>[] or keep=>{})
    Returns :

=cut

sub remove_columns
{
    my ($self,$rec,%args) = @_;
    if ( ref($rec) ne 'HASH' ) { $self->throw("TODO: rec for array"); }
    if ( exists($args{keep}) )
    {
        # keep=>{} is a lookup hash: drop every genotype column not listed in it
        for my $col (keys %{$$rec{gtypes}})
        {
            if ( !exists($args{keep}{$col}) ) { delete($$rec{gtypes}{$col}); }
        }
    }
    if ( exists($args{remove}) )
    {
        # remove=>[] is a list of column names to delete
        for my $col (@{$args{remove}})
        {
            if ( exists($$rec{gtypes}{$col}) ) { delete($$rec{gtypes}{$col}); }
        }
    }
}

=head2 add_columns

    Usage   : $vcf->add_columns('NA001','NA0002');
    Args    :
    Returns :

=cut

sub add_columns
{
    my ($self,@columns) = @_;
    if ( !$$self{columns} )
    {
        # The columns should be initialized de novo. Figure out if the @columns contain also the mandatory
        #   columns and if FORMAT should be present (it can be absent when there is no genotype column present).
        my $has_other = 0;
        for my $col (@columns)
        {
            if ( !exists($$self{reserved}{cols}{$col}) ) { $has_other=1; last; }
        }
        $$self{columns} = [ @{$$self{mandatory}} ];
        if ( $has_other ) { push @{$$self{columns}},'FORMAT'; }
        for my $col (@{$$self{columns}}) { $$self{has_column}{$col}=1; }
    }
    # Append only columns not already present.
    # (A dead local `$ncols` counter was removed here; it was incremented but
    # never read.)
    for my $col (@columns)
    {
        if ( $$self{has_column}{$col} ) { next; }
        push @{$$self{columns}}, $col;
    }
}

=head2 add_format_field

    Usage   : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
    Args    : The record obtained by next_data_hash
            : The field name
    Returns :

=cut

sub add_format_field
{
    my ($self,$rec,$field) = @_;
    if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
    # No-op when the tag is already listed
    for my $key (@{$$rec{FORMAT}})
    {
        if ( $key eq $field ) { return; }   # already there
    }
    push @{$$rec{FORMAT}}, $field;
}

=head2 remove_format_field

    Usage   : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
    Args    : The record obtained by next_data_hash
            : The field
name
    Returns :

=cut

sub remove_format_field
{
    my ($self,$rec,$field) = @_;
    if ( !$$rec{FORMAT} ) { $$rec{FORMAT}=[]; }
    # Fix: the original spliced the array while foreach-iterating it, which
    # skips the element that shifts into the removed slot (visible when the
    # tag is listed more than once). Filtering in place via grep removes all
    # occurrences and keeps the same arrayref alive for callers.
    @{$$rec{FORMAT}} = grep { $_ ne $field } @{$$rec{FORMAT}};
}

=head2 add_info_field

    Usage   : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
    Args    : The record obtained by next_data_array
            : The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
              it will be removed. To add fields without a value, use empty string ''.
    Returns : The formatted INFO.

=cut

sub add_info_field
{
    my ($self,$info,%fields) = @_;
    my @out = ();

    # First handle the existing values, keep everything unless in %fields
    for my $field (split(/;/,$info))
    {
        my ($key,$value) = split(/=/,$field);
        if ( $key eq '.' ) { next; }
        if ( !exists($fields{$key}) ) { push @out,$field; next; }
    }

    # Now add the new values and remove the unwanted ones
    while (my ($key,$value)=each %fields)
    {
        if ( !defined($value) ) { next; }           # this one should be removed
        if ( $value eq '' ) { push @out,$key; }     # this one is of the form HM2 in contrast to DP=3
        else { push @out,"$key=$value"; }           # this is the standard key=value pair
    }
    if ( !@out ) { push @out,'.'; }
    return join(';',@out);
}

=head2 add_filter

    Usage   : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
    Args    : The record obtained by next_data_array or next_data_hash
            : The key-value pairs for filter to be added. If value is 1, the filter will be added.
              If 0, the filter will be removed.
    Returns : The formatted filter field.

=cut

sub add_filter
{
    my ($self,$filter,%filters) = @_;
    my @out = ();
    my @filters = ref($filter) eq 'ARRAY' ? @$filter : split(/;/,$filter);

    # First handle the existing filters, keep everything unless in %filters
    for my $key (@filters)
    {
        if ( $key eq '.' or $key eq 'PASS' ) { next; }
        if ( !exists($filters{$key}) ) { push @out,$key; next; }
    }

    # Now add the new filters and remove the unwanted ones
    while (my ($key,$value)=each %filters)
    {
        if ( !$value ) { next; }    # this one should be removed
        push @out,$key;             # this one should be added
    }
    if ( !@out ) { push @out,'PASS'; }
    # Fix: the original ended with `return ref(...) ? return \@out : join(...)`,
    # a redundant nested return inside the ternary; same behavior, cleaner form.
    return ref($filter) eq 'ARRAY' ? \@out : join(';',@out);
}

=head2 validate_filter_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
    Args    : The FILTER arrayref
    Returns : Error message in case of an error.

=cut

sub validate_filter_field
{
    my ($self,$values) = @_;

    if ( @$values == 1 && $$values[0] eq '.' ) { return undef; }

    my @errs;
    my @missing;
    for my $item (@$values)
    {
        if ( $item eq $$self{filter_passed} ) { next; }
        if ( $item=~/,/ ) { push @errs,"Expected semicolon as a separator."; }
        if ( exists($$self{reserved}{FILTER}{$item}) ) { return qq[The filter name "$item" cannot be used, it is a reserved word.]; }
        if ( exists($$self{header}{FILTER}{$item}) ) { next; }
        # Side effect: unknown filters are added to the header with a stub description
        push @missing, $item;
        $self->add_header_line({key=>'FILTER',ID=>$item,Description=>'No description'});
    }
    if ( !@errs && !@missing ) { return undef; }
    # Old versions were lax about undeclared filters
    if ( $$self{version}<3.3 ) { return undef; }
    return join(',',@errs) .' '. 'The filter(s) [' . join(',',@missing) . '] not listed in the header.';
}

# Register a previously unseen INFO/FORMAT tag in the header with a stub
# String-typed declaration so later records validate consistently.
sub _add_unknown_field
{
    my ($self,$field,$key,$nargs) = @_;
    $self->add_header_line({key=>$field,ID=>$key,Number=>$nargs,Type=>'String',Description=>'No description'});
}

=head2 validate_header

    About   : Version specific header validation code.
    Usage   : my $vcf = Vcf->new(); $vcf->parse_header(); $vcf->validate_header();
    Args    :

=cut

# Base class does no header validation; subclasses may override.
sub validate_header
{
    my ($self) = @_;
}

=head2 validate_line

    About   : Version specific line validation code.
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x); Args : =cut sub validate_line { my ($self,$x) = @_; # Is the ID composed of alphanumeric chars if ( !($$x{ID}=~/^[\w;\.]+$/) ) { $self->warn("Expected alphanumeric ID at $$x{CHROM}:$$x{POS}, but got [$$x{ID}]\n"); } } =head2 validate_info_field Usage : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT}); Args : The INFO hashref Returns : Error message in case of an error. =cut sub validate_info_field { my ($self,$values,$alts) = @_; if ( !defined $values ) { return 'Empty INFO field.'; } # First handle the empty INFO field (.) if ( scalar keys %$values == 1 && exists($$values{'.'}) ) { return undef; } # Expected numbers my $ng = -1; my $na = -1; if ( $$self{version}>4.0 ) { if ( $$alts[0] eq '.' ) { $ng=1; $na=1; } else { $na = @$alts; $ng = (1+$na+1)*($na+1)/2; } } my @errs; while (my ($key,$value) = each %$values) { if ( !exists($$self{header}{INFO}{$key}) ) { push @errs, "INFO tag [$key] not listed in the header" unless $$self{version}<3.3; my $nargs = defined $value ? -1 : 0; $self->_add_unknown_field('INFO',$key,$nargs); next; } my $type = $$self{header}{INFO}{$key}; my @vals = defined $value ? 
split(/,/, $value) : (); if ( $$type{Number} eq 'G' ) { if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $ng, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'A' ) { if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "INFO tag [$key=$value] expected different number of values (expected $na, found ".scalar @vals.")"; } } elsif ( $$type{Number}==0 ) { if ( defined($value) ) { push @errs, "INFO tag [$key] did not expect any parameters, got [$value]"; } next; } elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} ) { push @errs, "INFO tag [$key=$value] expected different number of values ($$type{Number})"; } if ( !$$type{handler} ) { next; } for my $val (@vals) { my $err = &{$$type{handler}}($self,$val,$$type{default}); if ( $err ) { push @errs, $err; } } } if ( !@errs ) { return undef; } return join(',',@errs); } =head2 validate_gtype_field Usage : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT}); Args : The genotype data hashref The ALT arrayref Returns : Error message in case of an error. =cut sub guess_ploidy { my ($self, $nals, $nvals) = @_; if ( $nvals==$nals ) { return 1; } if ( $nvals==binom(1+$nals,2) ) { return 2; } $self->throw("Could not determine the ploidy (nals=$nals, nvals=$nvals). (TODO: ploidy bigger than 2)\n", binom(2+$nals,2)); } sub binom { my ($n, $k) = @_; my $b = 1; if ( $k > $n-$k ) { $k = $n-$k; } if ( $k < 1 ) { return 1; } for (my $i=1; $i<=$k; $i++) { $b *= ($n-$k+$i)/$i; } return $b; } sub validate_gtype_field { my ($self,$data,$alts,$format) = @_; my @errs; my $ploidy = 2; if ( !exists($$data{GT}) ) { push @errs, "The mandatory tag GT not present." unless $$self{ignore_missing_GT}; } else { my (@als) = $self->split_by($$data{GT},@{$$self{gt_sep}}); for my $al (@als) { if ( $al eq '.' 
or $al eq '0' ) { next; } if ( !($al=~/^[0-9]+$/) ) { push @errs, "Unable to parse the GT field [$$data{GT}], expected integers"; } if ( !exists($$alts[$al-1]) ) { push @errs, "Bad ALT value in the GT field, the index [$al] out of bounds [$$data{GT}]."; last; } } $ploidy = @als; } # Expected numbers my $ng = -1; my $na = -1; if ( $$self{version}>4.0 ) { if ( $$alts[0] eq '.' ) { $ng=1; $na=1; } else { $na = @$alts; $ng = binom($ploidy+$na,$ploidy); } } while (my ($key,$value) = each %$data) { if ( !exists($$self{header}{FORMAT}{$key}) ) { push @errs, "FORMAT tag [$key] not listed in the header" unless $$self{version}<3.3; $self->_add_unknown_field('FORMAT',$key,-1); next; } my $type = $$self{header}{FORMAT}{$key}; my @vals = split(/,/, $value); if ( $$type{Number} eq 'G' ) { if ( $ng != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $ng, found ".scalar @vals.")"; } } elsif ( $$type{Number} eq 'A' ) { if ( $na != @vals && !(@vals==1 && $vals[0] eq '.') ) { push @errs, "FORMAT tag [$key] expected different number of values (expected $na, found ".scalar @vals.")"; } } elsif ( $$type{Number}!=-1 && @vals!=$$type{Number} ) { push @errs, "FORMAT tag [$key] expected different number of values ($$type{Number})"; } if ( !$$type{handler} ) { next; } for my $val (@vals) { my $err = &{$$type{handler}}($self,$val,$$type{default}); if ( $err ) { push @errs, $err; } } } if ( !@errs ) { return undef; } return join(',',@errs); } sub validate_ref_field { my ($self,$ref) = @_; if ( !($ref=~/^[ACGTN]$/) ) { return "Expected one of A,C,G,T,N, got [$ref]\n"; } return undef; } sub validate_int { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( $value =~ /^-?\d+$/ ) { return undef; } return "Could not validate the int [$value]"; } sub validate_float { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( $value =~ 
/^-?\d+(?:\.\d*)$/ ) { return undef; } if ( $value =~ /^-?\d*(?:\.\d+)$/ ) { return undef; } if ( $value =~ /^-?\d+$/ ) { return undef; } if ( $value =~ /^-?\d*(?:\.?\d+)(?:[Ee][-+]?\d+)?$/ ) { return undef; } return "Could not validate the float [$value]"; } sub validate_char { my ($self,$value,$default) = @_; if ( defined($default) && $value eq $default ) { return undef; } if ( length($value)==1) { return undef; } return "Could not validate the char value [$value]"; } =head2 run_validation About : Validates the VCF file. Usage : my $vcf = Vcf->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz'); Args : File name or file handle. =cut sub run_validation { my ($self) = @_; $self->parse_header(); $self->validate_header(); if ( !exists($$self{header}) ) { $self->warn(qq[The header not present.\n]); } elsif ( !exists($$self{header}{fileformat}) ) { $self->warn(qq[The "fileformat" field not present in the header, assuming VCFv$$self{version}\n]); } elsif ( $$self{header_lines}[0]{key} ne 'fileformat' ) { $self->warn(qq[The "fileformat" not the first line in the header\n]); } if ( !exists($$self{columns}) ) { $self->warn("No column descriptions found.\n"); } my $default_qual = $$self{defaults}{QUAL}; my $warn_sorted=1; my $warn_duplicates = exists($$self{warn_duplicates}) ? $$self{warn_duplicates} : 1; my ($prev_chrm,$prev_pos); while (my $line=$self->next_data_array()) { for (my $i=0; $i<@$line; $i++) { if (!defined($$line[$i]) or $$line[$i] eq '' ) { my $colname = $i<@{$$self{columns}} ? $$self{columns}[$i] : $i+1; $self->warn("The column $colname is empty at $$line[0]:$$line[1].\n"); } } my $x = $self->next_data_hash($line); $self->validate_line($x); # Is the position numeric? 
# Per-record validation body of run_validation (interior of the while loop):
# checks POS, sortedness/duplicates, REF, ALT, QUAL, FILTER, INFO, per-sample
# genotype fields, and finally AN/AC consistency.
if ( !($$x{POS}=~/^\d+$/) ) { $self->warn("Expected integer for the position at $$x{CHROM}:$$x{POS}\n"); }
# Report duplicate positions only once per file.
if ( $warn_duplicates )
{
    if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos eq $$x{POS} )
    {
        $self->warn("Warning: Duplicate entries, for example $$x{CHROM}:$$x{POS}\n");
        $warn_duplicates = 0;
    }
}
# Is the file sorted?
# NOTE(review): $prev_chrm/$prev_pos are only updated while $warn_sorted is
# still set; after the first out-of-order warning the duplicate check above
# keeps comparing against a stale position — confirm whether that is intended.
if ( $warn_sorted )
{
    if ( $prev_chrm && $prev_chrm eq $$x{CHROM} && $prev_pos > $$x{POS} )
    {
        $self->warn("Warning: The file is not sorted, for example $$x{CHROM}:$$x{POS} comes after $prev_chrm:$prev_pos\n");
        $warn_sorted = 0;
    }
    $prev_chrm = $$x{CHROM};
    $prev_pos = $$x{POS};
}
# The reference base: one of A,C,G,T,N, non-empty.
my $err = $self->validate_ref_field($$x{REF});
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
# The ALT field (alternate non-reference base)
$err = $self->validate_alt_field($$x{ALT},$$x{REF});
if ( $err ) { $self->warn("$$x{CHROM}:$$x{POS} .. $err\n"); }
# The QUAL field
my $ret = $self->validate_float($$x{QUAL},$default_qual);
if ( $ret ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} .. $ret\n"); }
elsif ( $$x{QUAL}=~/^-?\d+$/ && $$x{QUAL}<-1 ) { $self->warn("QUAL field at $$x{CHROM}:$$x{POS} is negative .. $$x{QUAL}\n"); }
# The FILTER field
$err = $self->validate_filter_field($$x{FILTER});
if ( $err ) { $self->warn("FILTER field at $$x{CHROM}:$$x{POS} .. $err\n"); }
# The INFO field
$err = $self->validate_info_field($$x{INFO},$$x{ALT});
if ( $err ) { $self->warn("INFO field at $$x{CHROM}:$$x{POS} .. $err\n"); }
# Validate each sample's genotype column against FORMAT and ALT.
while (my ($gt,$data) = each %{$$x{gtypes}})
{
    $err = $self->validate_gtype_field($data,$$x{ALT},$$x{FORMAT});
    if ( $err ) { $self->warn("column $gt at $$x{CHROM}:$$x{POS} .. $err\n"); }
}
# Cross-check AN/AC INFO values against the genotypes actually present.
if ( scalar keys %{$$x{gtypes}} && (exists($$x{INFO}{AN}) || exists($$x{INFO}{AC})) )
{
    my $nalt = scalar @{$$x{ALT}};
    if ( $nalt==1 && $$x{ALT}[0] eq '.'
) { $nalt=0; }
my ($an,$ac) = $self->calc_an_ac($$x{gtypes},$nalt); # Allow alleles in ALT which are absent in samples
if ( exists($$x{INFO}{AN}) && $an ne $$x{INFO}{AN} ) { $self->warn("$$x{CHROM}:$$x{POS} .. AN is $$x{INFO}{AN}, should be $an\n"); }
if ( exists($$x{INFO}{AC}) && $ac ne $$x{INFO}{AC} ) { $self->warn("$$x{CHROM}:$$x{POS} .. AC is $$x{INFO}{AC}, should be $ac\n"); }
} } }

=head2 get_chromosomes

    About : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
    Usage : my $vcf = Vcf->new(); $vcf->get_chromosomes();
    Args  : none

=cut

sub get_chromosomes
{
    my ($self) = @_;
    if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); }

    # Ask tabix for the list of indexed sequence names, one per line.
    my @chroms = `tabix -l '$$self{file}'`;
    if ( $? )
    {
        # Distinguish "tabix not installed" from "file not indexed" in the error.
        my @has_tabix = `which tabix`;
        if ( !@has_tabix ) { $self->throw(qq[The command "tabix" not found, please add it to your PATH\n]); }
        $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]);
    }
    chomp(@chroms);
    return \@chroms;
}

=head2 get_samples

    About : Get list of samples.
Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples(); Args : none =cut sub get_samples { my ($self) = @_; my $n = @{$$self{columns}} - 1; return (@{$$self{columns}}[9..$n]); } =head2 get_column About : Convenient way to get data for a sample Usage : my $rec = $vcf->next_data_array(); my $sample_col = $vcf->get_column($rec, 'NA0001'); Args 1 : Array pointer returned by next_data_array 2 : Column/Sample name =cut sub get_column { my ($self,$line,$column) = @_; if ( !exists($$self{has_column}{$column}) ) { $self->throw("No such column: [$column]\n"); } my $idx = $$self{has_column}{$column}; return $$line[$idx-1]; } =head2 get_column_name About : Mapping between zero-based VCF column and its name Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $name = $vcf->get_column_name(1); # returns POS Args : Index of the column (0-based) =cut sub get_column_name { my ($self,$idx) = @_; if ( $idx >= @{$$self{columns}} ) { $self->throw("The index out of bounds\n"); } return $$self{columns}[$idx]; } =head2 get_column_index About : Mapping between VCF column name and its zero-based index Usage : my $vcf = Vcf->new(); $vcf->parse_header(); my $name = $vcf->get_column_index('POS'); # returns 1 Args : Name of the column =cut sub get_column_index { my ($self,$column) = @_; if ( !exists($$self{has_column}{$column}) ) { $self->throw("No such column: [$column]\n"); } return $$self{has_column}{$column}-1; } #------------------------------------------------ # Version 3.2 specific functions package Vcf3_2; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '3.2', drop_trailings => 1, filter_passed => 0, defaults => { QUAL => '-1', default => '.', Flag => undef, GT => '.', }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, 
regex_snp => qr/^[ACGTN]$/i, regex_ins => qr/^I[ACGTN]+$/, regex_del => qr/^D\d+$/, regex_gtsep => qr{[\\|/]}, regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$}, regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } #------------------------------------------------ # Version 3.3 specific functions package Vcf3_3; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '3.3', drop_trailings => 0, filter_passed => 0, defaults => { QUAL => '-1', Integer => '-1', Float => '-1', Character => '.', String => '.', Flag => undef, GT => './.', default => '.', }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$/i, regex_ins => qr/^I[ACGTN]+$/, regex_del => qr/^D\d+$/, regex_gtsep => qr{[\\|/]}, regex_gt => qr{^(\.|\d+)([\\|/]?)(\.?|\d*)$}, regex_gt2 => qr{^(\.|[0-9ACGTNIDacgtn]+)([\\|/]?)}, # . 
0/1 0|1 A/A A|A D4/IACGT gt_sep => [qw(\ | /)], }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } #------------------------------------------------ # Version 4.0 specific functions =head1 VCFv4.0 VCFv4.0 specific functions =cut package Vcf4_0; use base qw(VcfReader); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '4.0', drop_trailings => 1, filter_passed => 'PASS', defaults => { QUAL => '.', Flag => undef, GT => '.', default => '.', }, reserved => { FILTER => { 0=>1 }, }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i, regex_ins => qr/^[ACGTN]+$/, regex_del => qr/^[ACGTN]+$/, regex_gtsep => qr{[|/]}, # | / regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1 regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 0/1 0|1 A/A A|A 0| gt_sep => [qw(| /)], }; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } sub Vcf4_0::format_header_line { my ($self,$rec) = @_; my %tmp_rec = ( %$rec ); if ( exists($tmp_rec{Number}) && $tmp_rec{Number} eq '-1' ) { $tmp_rec{Number} = '.' } my $value; if ( exists($tmp_rec{ID}) or $tmp_rec{key} eq 'PEDIGREE' ) { my %has = ( key=>1, handler=>1, default=>1 ); # Internal keys not to be output my @items; for my $key (qw(ID Number Type Description), sort keys %tmp_rec) { if ( !exists($tmp_rec{$key}) or $has{$key} ) { next; } my $quote = ($key eq 'Description' or $tmp_rec{$key}=~/\s/) ? '"' : ''; push @items, "$key=$quote$tmp_rec{$key}$quote"; $has{$key}=1; } $value = '<' .join(',',@items). 
'>'; } else { $value = $tmp_rec{value}; } my $line = "##$tmp_rec{key}=".$value."\n"; return $line; } =head2 parse_header_line Usage : $vcf->parse_header_line(q[##FORMAT=]) $vcf->parse_header_line(q[reference=1000GenomesPilot-NCBI36]) Args : Returns : =cut sub Vcf4_0::parse_header_line { my ($self,$line) = @_; chomp($line); $line =~ s/^##//; if ( !($line=~/^([^=]+)=/) ) { $self->throw("Expected key=value pair in the header: $line\n"); } my $key = $1; my $value = $'; if ( !($value=~/^<(.+)>\s*$/) ) { # Simple sanity check for subtle typos if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' or $key eq 'ALT' ) { $self->throw("Hmm, is this a typo? [$key] [$value]"); } return { key=>$key, value=>$value }; } my $rec = { key=>$key }; my $tmp = $1; my ($attr_key,$attr_value,$quoted); while ($tmp ne '') { if ( !defined $attr_key ) { if ( $tmp=~/^([^=]+)="/ ) { $attr_key=$1; $quoted=1; $tmp=$'; next; } elsif ( $tmp=~/^([^=]+)=/ ) { $attr_key=$1; $quoted=0; $tmp=$'; next; } else { $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); } } if ( $tmp=~/^[^,\\"]+/ ) { $attr_value .= $&; $tmp = $'; } if ( $tmp=~/^\\\\/ ) { $attr_value .= '\\\\'; $tmp = $'; next; } if ( $tmp=~/^\\"/ ) { $attr_value .= '\\"'; $tmp = $'; next; } if ( $tmp eq '' or ($tmp=~/^,/ && !$quoted) or $tmp=~/^"/ ) { if ( $attr_key=~/^\s+/ or $attr_key=~/\s+$/ or $attr_value=~/^\s+/ or $attr_value=~/\s+$/ ) { $self->warn("Leading or trailing space in attr_key-attr_value pairs is discouraged:\n\t[$attr_key] [$attr_value]\n\t$line\n"); $attr_key =~ s/^\s+//; $attr_key =~ s/\s+$//; $attr_value =~ s/^\s+//; $attr_value =~ s/\s+$//; } $$rec{$attr_key} = $attr_value; $tmp = $'; if ( $quoted && $tmp=~/^,/ ) { $tmp = $'; } $attr_key = $attr_value = $quoted = undef; next; } if ( $tmp=~/^,/ ) { $attr_value .= $&; $tmp = $'; next; } $self->throw(qq[Could not parse header line: $line\nStopped at [$tmp].\n]); } if ( $key eq 'INFO' or $key eq 'FILTER' or $key eq 'FORMAT' ) { if ( $key ne 
'PEDIGREE' && !exists($$rec{ID}) ) { $self->throw("Missing the ID tag in $line\n"); } if ( !exists($$rec{Description}) ) { $self->warn("Missing the Description tag in $line\n"); } } if ( exists($$rec{Number}) && $$rec{Number} eq '-1' ) { $self->warn("The use of -1 for unknown number of values is deprecated, please use '.' instead.\n\t$line\n"); } if ( exists($$rec{Number}) && $$rec{Number} eq '.' ) { $$rec{Number}=-1; } return $rec; } sub Vcf4_0::validate_ref_field { my ($self,$ref) = @_; if ( !($ref=~/^[ACGTN]+$/) ) { my $offending = $ref; $offending =~ s/[ACGTN]+//g; return "Expected combination of A,C,G,T,N for REF, got [$ref], the offending chars were [$offending]\n"; } return undef; } sub Vcf4_0::validate_alt_field { my ($self,$values,$ref) = @_; if ( @$values == 1 && $$values[0] eq '.' ) { return undef; } my $ret = $self->_validate_alt_field($values,$ref); if ( $ret ) { return $ret; } my $ref_len = length($ref); my $ref1 = substr($ref,0,1); my @err; my $msg = ''; for my $item (@$values) { if ( !($item=~/^[ACTGN]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; } if ( $item=~/^<[^<>\s]+>$/ ) { next; } if ( $ref_len==length($item) ) { next; } if ( substr($item,0,1) ne $ref1 ) { $msg=', first base does not match the reference.'; push @err,$item; next; } } if ( !@err ) { return undef; } return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg; } =head2 fill_ref_alt_mapping About : A tool for merging VCFv4.0 records. The subroutine unifies the REFs and creates a mapping from the original haplotypes to the haplotypes based on the new REF. Consider the following example: REF ALT G GA GT G GT GA GT GAA GTC G G my $map={G=>{GA=>1},GT=>{G=>1,GA=>1,GAA=>1},GTC=>{G=>1},G=>{''=>1}}; my $new_ref=$vcf->fill_ref_alt_mapping($map); The call returns GTC and $map is now G GA -> GTC GATC GT G -> GTC GC GT GA -> GTC GAC GT GAA -> GTC GAAC GTC G -> GTC G G -> GTC Args : Returns : New REF string and fills the hash with appropriate ALT. 
=cut sub Vcf4_0::fill_ref_alt_mapping { my ($self,$map) = @_; my $max_len = 0; my $new_ref; for my $ref (keys %$map) { my $len = length($ref); if ( $max_len<$len ) { $max_len = $len; $new_ref = $ref; } $$map{$ref}{$ref} = 1; } for my $ref (keys %$map) { my $rlen = length($ref); if ( substr($new_ref,0,$rlen) ne $ref ) { $self->throw("The reference prefixes do not agree: $ref vs $new_ref\n"); } for my $alt (keys %{$$map{$ref}}) { # The second part of the regex is for VCF>4.0, but does no harm for v<=4.0 if ( $alt=~/^<.+>$/ or $alt=~/\[|\]/ ) { $$map{$ref}{$alt} = $alt; next; } my $new = $alt; if ( $rlen<$max_len ) { $new .= substr($new_ref,$rlen); } $$map{$ref}{$alt} = $new; } } return $new_ref; } =head2 normalize_alleles About : Makes REF and ALT alleles more compact if possible (e.g. TA,TAA -> T,TA) Usage : my $line = $vcf->next_data_array(); ($ref,@alts) = $vcf->normalize_alleles($$line[3],$$line[4]); =cut sub Vcf4_0::normalize_alleles { my ($self,$ref,$alt) = @_; my $rlen = length($ref); if ( $rlen==1 or length($alt)==1 ) { return ($ref,split(/,/,$alt)); } my @als = split(/,/,$alt); my $i = 1; my $done = 0; while ( $i<$rlen ) { my $r = substr($ref,$rlen-$i,1); for my $al (@als) { my $len = length($al); if ( $i>=$len ) { $done = 1; } my $c = substr($al,$len-$i,1); if ( $c ne $r ) { $done = 1; last; } } if ( $done ) { last; } $i++; } if ( $i>1 ) { $i--; $ref = substr($ref,0,$rlen-$i); for (my $j=0; $j<@als; $j++) { $als[$j] = substr($als[$j],0,length($als[$j])-$i); } } return ($ref,@als); } sub Vcf4_0::normalize_alleles_pos { my ($self,$ref,$alt) = @_; my @als; ($ref,@als) = $self->normalize_alleles($ref,$alt); my $rlen = length($ref); if ( $rlen==1 ) { return (0,$ref,@als); } my $i = 0; my $done = 0; while ( $i+1<$rlen ) { my $r = substr($ref,$i,1); for my $al (@als) { my $len = length($al); if ( $i+1>=$len ) { $done = 1; last; } my $c = substr($al,$i,1); if ( $c ne $r ) { $done = 1; last; } } if ( $done ) { last; } $i++; } if ( $i<0 ) { $i = 0; } if ( $i>0 ) { 
substr($ref,0,$i,''); for (my $j=0; $j<@als; $j++) { substr($als[$j],0,$i,''); } } return ($i,$ref,@als); }

# Classify the event described by $allele against the reference. Returns
# (type,len,ht): type 'u' = unknown/symbolic <...> allele, 'r' = reference
# match (len 0), 's' = one or more SNPs (len = number of mismatches),
# 'i' = indel (len = +insertion/-deletion length, ht = the inserted/deleted
# sequence), 'o' = other/complex. $rec may be either a next_data_hash record
# (results are then cached inside it) or the plain REF string.
sub Vcf4_0::event_type
{
    my ($self,$rec,$allele) = @_;

    my $ref = $rec;
    my $key = $allele;      # remember the allele as queried: it is the cache key
    if ( ref($rec) eq 'HASH' )
    {
        if ( exists($$rec{_cached_events}{$allele}) ) { return (@{$$rec{_cached_events}{$allele}}); }
        $ref = $$rec{REF};
    }

    if ( $allele=~/^<[^>]+>$/ )
    {
        if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['u',0,$allele]; }
        return ('u',0,$allele);
    }
    if ( $allele eq '.' )
    {
        if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$allele} = ['r',0,$ref]; }
        return ('r',0,$ref);
    }

    my $reflen = length($ref);
    my $len = length($allele);
    my $ht;
    my $type;
    if ( $len==$reflen )
    {
        # This can be a reference, a SNP, or multiple SNPs
        my $mism = 0;
        for (my $i=0; $i<$len; $i++)
        {
            if ( substr($ref,$i,1) ne substr($allele,$i,1) ) { $mism++; }
        }
        if ( $mism==0 ) { $type='r'; $len=0; }
        else { $type='s'; $len=$mism; }
    }
    else
    {
        ($len,$ht)=$self->is_indel($ref,$allele);
        if ( $len )
        {
            # Indel
            $type = 'i';
            $allele = $ht;
        }
        else
        {
            # NOTE(review): $len is 0 here (is_indel found no clean indel), so the
            # ternary always yields $reflen-1; possibly length($allele) was intended.
            $type = 'o'; $len = $len>$reflen ? $len-1 : $reflen-1;
        }
    }

    # Bug fix: cache under the allele the caller asked about ($key), not the
    # possibly rewritten indel sequence, so repeated queries actually hit the
    # cache instead of recomputing (the lookup above uses the original allele).
    if ( ref($rec) eq 'HASH' ) { $$rec{_cached_events}{$key} = [$type,$len,$allele]; }
    return ($type,$len,$allele);
}

# The sequences start at the same position, which simplifies things greatly.
# Returns length of the indel (+ insertion, - deletion), the deleted/inserted sequence # and the position of the first base after the shared sequence sub is_indel { my ($self,$seq1,$seq2) = @_; my $len1 = length($seq1); my $len2 = length($seq2); if ( $len1 eq $len2 ) { return (0,'',0); } my ($del,$len,$LEN); if ( $len1<$len2 ) { $len = $len1; $LEN = $len2; $del = 1; } else { $len = $len2; $LEN = $len1; $del = -1; my $tmp=$seq1; $seq1=$seq2; $seq2=$tmp; } my $ileft; for ($ileft=0; $ileft<$len; $ileft++) { if ( substr($seq1,$ileft,1) ne substr($seq2,$ileft,1) ) { last; } } if ( $ileft==$len ) { return ($del*($LEN-$len), substr($seq2,$ileft), $ileft); } my $iright; for ($iright=0; $iright<$len; $iright++) { if ( substr($seq1,$len-$iright,1) ne substr($seq2,$LEN-$iright,1) ) { last; } } if ( $iright+$ileft<=$len ) { return (0,'',0); } return ($del*($LEN-$len),substr($seq2,$ileft,$LEN-$len),$ileft); } #------------------------------------------------ # Version 4.1 specific functions =head1 VCFv4.1 VCFv4.1 specific functions =cut package Vcf4_1; use base qw(Vcf4_0); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); bless $self, ref($class) || $class; $$self{_defaults} = { version => '4.1', drop_trailings => 1, filter_passed => 'PASS', defaults => { QUAL => '.', Flag => undef, GT => '.', default => '.', }, reserved => { FILTER => { 0=>1 }, }, handlers => { Integer => \&VcfReader::validate_int, Float => \&VcfReader::validate_float, Character => \&VcfReader::validate_char, String => undef, Flag => undef, }, regex_snp => qr/^[ACGTN]$|^<[\w:.]+>$/i, regex_ins => qr/^[ACGTN]+$/i, regex_del => qr/^[ACGTN]+$/i, regex_gtsep => qr{[|/]}, # | / regex_gt => qr{^(\.|\d+)([|/]?)(\.?|\d*)$}, # . ./. 0/1 0|1 regex_gt2 => qr{^(\.|[0-9ACGTNacgtn]+|<[\w:.]+>)([|/]?)}, # . ./. 
0/1 0|1 A/A A|A 0| gt_sep => [qw(| /)], }; $$self{ignore_missing_GT} = 1; for my $key (keys %{$$self{_defaults}}) { $$self{$key}=$$self{_defaults}{$key}; } return $self; } sub Vcf4_1::validate_header { my ($self) = @_; my $lines = $self->get_header_line(key=>'reference'); if ( !@$lines ) { $self->warn("The header tag 'reference' not present. (Not required but highly recommended.)\n"); } } sub Vcf4_1::validate_line { my ($self,$line) = @_; if ( !$$self{_contig_validated}{$$line{CHROM}} ) { my $lines = $self->get_header_line(key=>'contig',ID=>$$line{CHROM}); if ( !@$lines ) { $self->warn("The header tag 'contig' not present for CHROM=$$line{CHROM}. (Not required but highly recommended.)\n"); } $$self{_contig_validated}{$$line{CHROM}} = 1; } if ( index($$line{CHROM},':')!=-1 ) { $self->warn("Colons not allowed in chromosome names: $$line{CHROM}\n"); } # Is the ID composed of alphanumeric chars if ( !($$line{ID}=~/^\S+$/) ) { $self->warn("Expected non-whitespace ID at $$line{CHROM}:$$line{POS}, but got [$$line{ID}]\n"); } } sub Vcf4_1::validate_alt_field { my ($self,$values,$ref) = @_; if ( @$values == 1 && $$values[0] eq '.' ) { return undef; } my $ret = $self->_validate_alt_field($values,$ref); if ( $ret ) { return $ret; } my $ref_len = length($ref); my $ref1 = substr($ref,0,1); my @err; my $msg = ''; for my $item (@$values) { if ( $item=~/^(.*)\[(.+)\[(.*)$/ or $item=~/^(.*)\](.+)\](.*)$/ ) { if ( $1 ne '' && $3 ne '' ) { $msg=', two replacement strings given (expected one)'; push @err,$item; next; } my $rpl; if ( $1 ne '' ) { $rpl = $1; if ( $rpl ne '.' ) { my $rref = substr($rpl,0,1); if ( $rref ne $ref1 ) { $msg=', the first base of the replacement string does not match the reference'; push @err,$item; next; } } } else { $rpl = $3; if ( $rpl ne '.' 
) { my $rref = substr($rpl,-1,1); if ( $rref ne $ref1 ) { $msg=', the last base of the replacement string does not match the reference'; push @err,$item; next; } } } my $pos = $2; if ( !($rpl=~/^[ACTGNacgtn]+$/) && $rpl ne '.' ) { $msg=', replacement string not valid (expected [ACTGNacgtn]+)'; push @err,$item; next; } if ( !($pos=~/^\S+:\d+$/) ) { $msg=', cannot parse sequence:position'; push @err,$item; next; } next; } if ( $item=~/^\.[ACTGNactgn]*([ACTGNactgn])$/ ) { next; } elsif ( $item=~/^([ACTGNactgn])[ACTGNactgn]*\.$/ ) { next; } if ( !($item=~/^[ACTGNactgn]+$|^<[^<>\s]+>$/) ) { push @err,$item; next; } } if ( !@err ) { return undef; } return 'Could not parse the allele(s) [' .join(',',@err). ']' . $msg; } sub Vcf4_1::next_data_hash { my ($self,@args) = @_; my $out = $self->SUPER::next_data_hash(@args); if ( !defined $out or $$self{assume_uppercase} ) { return $out; } # Case-insensitive ALT and REF bases $$out{REF} = uc($$out{REF}); my $nalt = @{$$out{ALT}}; for (my $i=0; $i<$nalt; $i++) { if ( $$out{ALT}[$i]=~/^SUPER::next_data_array(@args); if ( !defined $out or $$self{assume_uppercase} ) { return $out; } # Case-insensitive ALT and REF bases $$out[3] = uc($$out[3]); my $alt = $$out[4]; $$out[4] = ''; my $pos = 0; while ( $pos',$start+1); if ( $end==-1 ) { $self->throw("Could not parse ALT [$alt]\n") } if ( $start>$pos ) { $$out[4] .= uc(substr($alt,$pos,$start-$pos)); } $$out[4] .= substr($alt,$start,$end-$start+1); $pos = $end+1; } if ( $posSUPER::event_type($rec,$allele); } my $c = substr($allele,0,1); if ( $c eq '<' ) { return ('u',0,$allele); } elsif ( $c eq '[' or $c eq ']' or $c eq '.' ) { return 'b'; } $c = substr($allele,-1,1); if ( $c eq '[' or $c eq ']' or $c eq '.' 
) { return 'b'; } elsif ( index($allele,'[')!=-1 or index($allele,']')!=-1 ) { return 'b'; } return $self->SUPER::event_type($rec,$allele); } 1; vcftools_0.1.11/perl/vcf-phased-join0000755000000000000000000005524712156354770016105 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); if ( $$opts{split_size} ) { split_vcf($opts); } else { join_vcfs($opts); } exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: The script takes multiple overlapping pre-phased chunks and concatenates them into one VCF\n", " using heterozygous calls from the overlaps to determine correct phase.\n", "Usage: vcf-phased-join [OPTIONS] A.vcf B.vcf C.vcf\n", "Options:\n", " -j, --min-join-quality Quality threshold for gluing the pre-phased blocks together [10]\n", " -l, --list List of VCFs to join.\n", " -o, --output Output file name. When \"-\" is supplied, STDOUT and STDERR will be used\n", " -q, --min-PQ Break pre-phased segments if PQ value is lower in input VCFs [0.6]\n", " -h, -?, --help This help message\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], min_join_quality => 10, min_PQ => 0.6, min_BP => 1, }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-o' || $arg eq '--output' ) { $$opts{output}=shift(@ARGV); next; } if ( $arg eq '-j' || $arg eq '--min-join-quality' ) { $$opts{min_join_quality}=shift(@ARGV); next; } if ( $arg eq '-q' || $arg eq '--min-PQ' ) { $$opts{min_PQ}=shift(@ARGV); next; } if ( $arg eq '-l' || $arg eq '--list' ) { $$opts{list}=shift(@ARGV); next; } if ( $arg eq '--min-BP' ) { $$opts{min_BP}=shift(@ARGV); next; } if ( $arg eq '--split-size' ) { $$opts{split_size}=shift(@ARGV); next; } if ( $arg eq '--split-noise' ) { $$opts{split_noise}=shift(@ARGV); next; } if ( $arg eq '--split-overlap' ) { $$opts{split_overlap}=shift(@ARGV); next; } if ( $arg eq '--split-prefix' ) { 
$$opts{split_prefix}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{vcfs}}, $arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( exists($$opts{list}) ) { open(my $fh,'<',$$opts{list}) or error("$$opts{list}: $!"); while (my $line=<$fh>) { if ($line=~/^\s*$/) { next; } $line =~ s/^\s*//; $line =~ s/\s*$//; if ( ! -e $line ) { error("Some of the files in $$opts{list} do not exist\n"); } push @{$$opts{vcfs}},$line; } close($fh); } if ( !exists($$opts{vcfs}) ) { error("No VCF file given"); } if ( !exists($$opts{split_size}) ) { if ( @{$$opts{vcfs}}<1 ) { error("No input VCF given?\n"); } if ( @{$$opts{vcfs}}<2 ) { warn("Only one input VCF given, running in --min-PQ splitting mode.\n"); if ( $$opts{min_PQ}<=0.5 ) { warn("You better know what you're doing: --min-PQ set too low, will hardly find any split!"); } } if ( !exists($$opts{output}) ) { error("No output VCF file name given"); } } return $opts; } sub split_vcf { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcfs}[0]); $vcf->parse_header(); $$opts{vcf} = $vcf; my ($fh_next,$swap_next) = open_next_file($opts); my ($fh,$prev_boundary,$start_pos,@buffer,$prev_chr,$prev_pos,$swap); while (my $rec=$vcf->next_data_array) { my $rec_next = []; for my $col (@$rec) { push @{$rec_next}, "$col"; } my $chr = $$rec[0]; my $pos = $$rec[1]; if ( defined $prev_chr && $prev_chr ne $chr ) { last; } if ( defined $prev_pos && $pos<=$prev_pos ) { error("Not sorted or duplicate position: $chr:$prev_pos vs $chr:$pos"); } $prev_pos = $pos; $prev_chr = $chr; if ( !defined $start_pos ) { $start_pos = $pos; } my $bnd = $start_pos + int(($pos-$start_pos)/$$opts{split_size})*$$opts{split_size}; if ( $start_pos!=$bnd && abs($pos-$bnd)*2 <= $$opts{split_overlap} ) { # Known boundary if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); } if ( defined $fh ) { 
print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); } next; } $bnd += $$opts{split_size}; if ( abs($pos-$bnd)*2 >= $$opts{split_overlap} ) { print $fh_next swap_gts($vcf,$swap_next,$rec); next; } # New boundary if ( !defined $prev_boundary || $prev_boundary ne $bnd ) { close($fh) unless !defined $fh; $fh = $fh_next; $swap = $swap_next; $prev_boundary = $bnd; $fh_next = undef; } if ( !defined $fh_next ) { ($fh_next,$swap_next) = open_next_file($opts); $prev_boundary = $bnd; } if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); } if ( defined $fh ) { print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); } } if ( defined $fh ) { close($fh); } if ( defined $fh_next ) { close($fh_next); } } sub open_next_file { my ($opts) = @_; $$opts{split_ifile}++; my $fname = sprintf "%s%02d.vcf", $$opts{split_prefix},$$opts{split_ifile}; open(my $fh,'>',$fname) or error("$fname: $!"); print $fh $$opts{vcf}->format_header; my @swap; for (my $i=9; $i<@{$$opts{vcf}{columns}}; $i++) { if ( $$opts{split_ifile}==1 ) { $swap[$i-9] = -1; } else { $swap[$i-9] = int(rand(2)) ? 1 : -1; } if ( $swap[$i-9]==1 ) { printf "%s\t%s\tswapped\n",$fname,$$opts{vcf}{columns}[$i]; } } return ($fh,\@swap); } sub randomize { my ($opts,$swap,$vcf,$rec,$dist) = @_; if ( $dist>$$opts{split_overlap} ) { $dist = $$opts{split_overlap}; } my $noise = $dist/$$opts{split_overlap}; if ( exists($$opts{split_noise}) ) { $noise = $$opts{split_noise}; } my $na = 2 * (scalar @$rec - 9); my $nchanged = int($na*$noise); if ( !$nchanged ) { return swap_gts($vcf,$swap,$rec); } use List::Util 'shuffle'; my @errors = (1) x $nchanged; if ( $nchanged<$na ) { @errors = (@errors, (0) x ($na-$nchanged)); } @errors = shuffle(@errors); print "$$rec[1] .. 
dist=$dist, changed=$nchanged total=$na ($noise)\n"; my $itag = $vcf->get_tag_index($$rec[8],'GT',':'); my $i = -2; for (my $isample=9; $isample<@$rec; $isample++) { $i += 2; if ( !$errors[$i] && $errors[$i+1] ) { next; } my $gt = $vcf->get_field($$rec[$isample],$itag); my ($a1,$a2) = $vcf->split_gt($gt); if ( $errors[$i] ) { $a1 = $a1 ? 0 : 1; } if ( $errors[$i+1] ) { $a2 = $a2 ? 0 : 1; } $$rec[$isample] = $vcf->replace_field($$rec[$isample],"$a1|$a2",$itag,':'); } return swap_gts($vcf,$swap,$rec); } sub swap_gts { my ($vcf,$swap,$rec) = @_; my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $gts = $vcf->get_sample_field($rec,$igt); for (my $i=0; $i<@$gts; $i++) { if ( $$swap[$i]==-1 ) { next; } my ($a1,$a2) = $vcf->split_gt($$gts[$i]); $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':'); } return $vcf->format_line($rec); } sub check_columns { my ($opts) = @_; my @columns; for my $file (@{$$opts{vcfs}}) { my $vcf = Vcf->new(file=>$file); $vcf->parse_header(); if ( @columns ) { if ( @columns != @{$$vcf{columns}} ) { warn("Different number of columns in [$file].\n"); } for (my $i=0; $i<@columns; $i++) { if ( $$vcf{columns}[$i] ne $columns[$i] ) { warn("The column names do not agree in [$file].\n"); last; } } } else { @columns = @{$$vcf{columns}}; } $vcf->close(); } $$opts{nsamples} = @columns-9; } sub log_msg { my ($opts,@msg) = @_; print {$$opts{log_fh}} @msg; } sub next_vcf_file { my ($opts) = @_; if ( !exists($$opts{ifile}) ) { $$opts{ifile}=-1; } my $chr = $$opts{current_chr}; my @vcfs = @{$$opts{chroms}{$chr}}; while (1) { $$opts{ifile}++; if ( $$opts{ifile} >= @vcfs ) { return (undef,undef); } $$opts{ivcf_fname} = $vcfs[$$opts{ifile}]; my $vcf = Vcf->new(file=>$$opts{ivcf_fname}, region=>$chr, print_header=>1); $vcf->parse_header(); my $rec = $vcf->next_data_array(); if ( !defined $rec ) { next; } return ($vcf,$rec); } } sub join_vcfs { my ($opts) = @_; # Determine the chromosomes for my $vcf (@{$$opts{vcfs}}) { my @chroms = `tabix -l $vcf`; if 
( $? ) { error(qq[The command "tabix -l $vcf" exited with an error. Is the file tabix indexed?\n]); } if ( !@chroms ) { warn(qq[Warning: Is the VCF file $vcf empty?\n]); } for my $chr (@chroms) { chomp($chr); push @{$$opts{chroms}{$chr}},$vcf; } } check_columns($opts); $$opts{phased_blocks} = [ (0) x $$opts{nsamples} ]; $$opts{broken_blocks} = [ (0) x $$opts{nsamples} ]; for my $chr (sort keys %{$$opts{chroms}}) { $$opts{current_chr} = $chr; join_vcfs_chr($opts); } report_stats($opts); } sub join_vcfs_chr { my ($opts) = @_; delete($$opts{ifile}); $$opts{swapped} = [ (0) x $$opts{nsamples} ]; $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; my ($vcf1,$rec1) = next_vcf_file($opts); if ( !defined $rec1 ) { error("Broken/Empty VCFs?"); } if ( $$opts{output} ne '-' ) { my $logfile = $$opts{output}; if ( $$opts{output}=~/\.[^\.]+$/ ) { $logfile = $`; } $logfile .= '.plog'; open($$opts{log_fh},'>',$logfile) or error("$logfile: $!"); open($$opts{out_fh},'>',$$opts{output}) or error("$$opts{output}: $!"); } else { $$opts{log_fh} = \*STDERR; $$opts{out_fh} = \*STDOUT; } $$opts{vcf} = $vcf1; if ( !$$opts{header_printed} ) { $$opts{header_printed} = 1; $$opts{vcf}->add_header_line({key=>'FORMAT',ID=>'PS',Number=>1,Type=>'Integer',Description=>'Phase set'}); $$opts{vcf}->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print {$$opts{out_fh}} $$opts{vcf}->format_header(); log_msg($opts, "# This file was generated by vcf-phased-join.\n"); log_msg($opts, "# The command line was: ", join(' ',@{$$opts{args}}), "\n"); log_msg($opts, "#\n"); log_msg($opts, "#PS 'Phasing Summary'. Use `grep ^PS | cut -f 2-` to extract this part.\n"); log_msg($opts, "#PS The columns are:\n"); log_msg($opts, "#PS 1,2 .. the pair of files being joined\n"); log_msg($opts, "#PS 3 .. the overlapping region used for determining the phase\n"); log_msg($opts, "#PS 4 .. sample name\n"); log_msg($opts, "#PS 5 .. did a swap occur?\n"); log_msg($opts, "#PS 6 .. 
quality of phase assignment\n"); log_msg($opts, "#PS 7 .. number of het genotypes used for phasing\n"); log_msg($opts, "#PS 8,9 .. log10 likelihood of phase match/mismatch\n"); } $$opts{file1} = $$opts{ivcf_fname}; my ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { # Only one non-empty VCF file present, running in --min-PQ splitting mode while ( defined($rec1 = $vcf1->next_data_array()) ) { output_line($opts,$rec1,$$opts{swapped}); } return; } else { $$opts{file2} = $$opts{ivcf_fname}; if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } my @buffer; while (1) { # is vcf1 ahead of vcf2? while ( $$rec1[1] < $$rec2[1] ) { output_line($opts,$rec1,$$opts{swapped}); $rec1 = $vcf1->next_data_array(); if ( !defined $rec1 ) { last; } if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } if ( defined $rec1 ) { while ( $$rec1[1] eq $$rec2[1] ) { push @buffer, [$rec1,$rec2]; $rec1 = $vcf1->next_data_array(); $rec2 = $vcf2->next_data_array(); if ( !defined $rec1 ) { last; } if ( !defined $rec2 ) { error("The file $$opts{file1} ended before $$opts{file2}."); } if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } } if ( defined $rec1 && $$rec1[1] ne $$rec2[1] ) { error("ERROR\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n"); } } # is vcf1 done? 
if ( !defined $rec1 ) { flush_buffer($opts,$vcf1,\@buffer); $vcf1->close(); if ( !defined $rec2 ) { # Yes, this can happen when file1 ends exactly where file2 does $vcf2->close(); ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { last; } } $vcf1 = $vcf2; $rec1 = $rec2; $$opts{file1} = $$opts{ivcf_fname}; ($vcf2,$rec2) = next_vcf_file($opts); if ( !defined $rec2 ) { last; } $$opts{file2} = $$opts{ivcf_fname}; if ( wrong_order($opts,$rec1,$rec2) ) { $vcf1->_unread_line($rec1); $rec1 = $rec2; } next; } } if ( @buffer ) { flush_buffer($opts,$vcf1,\@buffer); } do { output_line($opts,$rec1,$$opts{swapped}) unless !defined $rec1; } while ( exists($$vcf1{fh}) && defined($rec1 = $vcf1->next_data_array()) ); } sub wrong_order { my ($opts,$rec1,$rec2) = @_; if ( $$rec1[0] ne $$rec2[0] ) { error("Encountered different chromosomes in $$opts{file1} and $$opts{file2}: \"$$rec1[0]:$$rec1[1]\" vs \"$$rec2[0]:$$rec2[1]\"\n"); } if ( $$rec1[1] > $$rec2[1] ) { log_msg($opts,"WARNING\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n"); return 1; } return 0; } sub output_line { my ($opts,$rec,$swap) = @_; my $vcf = $$opts{vcf}; my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); my $ips = $vcf->get_tag_index($$rec[8],'PS',':'); if ( $ips==-1 ) { $$rec[8] .= ':PS'; } my $ipq = exists($$opts{min_PQ}) ? $vcf->get_tag_index($$rec[8],'PQ',':') : -1; my $breakpoints = 0; for (my $i=0; $i<@$swap; $i++) { if ( $$swap[$i]==1 ) { my $gt = $vcf->get_field($$rec[$i+9],$igt); my ($a1,$a2) = $vcf->split_gt($gt); if ( defined $a2 ) { $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':'); } } if ( $ipq!=-1 ) { my $pq = $vcf->get_field($$rec[$i+9],$ipq); if ( $pq ne '.' 
&& $pq<$$opts{min_PQ} ) { $$opts{phasing_set}[$i]=0; $$opts{broken_blocks}[$i]++; } } if ( !$$opts{phasing_set}[$i] ) { $$opts{phasing_set}[$i] = $$rec[1]; $$opts{phased_blocks}[$i]++; $breakpoints++; } if ( $ips==-1 ) { $$rec[$i+9] .= ':'.$$opts{phasing_set}[$i]; } else { $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],$$opts{phasing_set}[$i],$ips,':'); } } print {$$opts{out_fh}} $vcf->format_line($rec); $breakpoints *= 100./$$opts{nsamples}; if ( $breakpoints>$$opts{min_BP} ) { push @{$$opts{breakpoints}}, sprintf("BP\t%s\t%d\t%.1f\n", $$rec[0],$$rec[1],$breakpoints); } } sub flush_buffer { my ($opts,$vcf,$buffer) = @_; if ( !@$buffer ) { $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; return; } my $chr = $$buffer[0][0][0]; my $from = $$buffer[0][0][1]; my $to = $$buffer[-1][0][1]; # Determine likelihoods of genotypes being swapped my @lks_match = (); my @lks_mism = (); my @nsites = (0) x $$opts{nsamples}; for my $site (@$buffer) { my $rec1 = $$site[0]; my $rec2 = $$site[1]; my $igt1 = $vcf->get_tag_index($$rec1[8],'GT',':'); my $igt2 = $vcf->get_tag_index($$rec2[8],'GT',':'); my $gts1 = $vcf->get_sample_field($rec1,$igt1); my $gts2 = $vcf->get_sample_field($rec2,$igt2); my $ngts = $$opts{nsamples}; my $nerrors = 0; my (@als1,@als2,@phased); for (my $i=0; $i<@$gts1; $i++) { if ( index($$gts1[$i],'|')==-1 or index($$gts2[$i],'|')==-1 ) { push @phased, 0; next; } push @phased, 1; my ($a1,$a2) = $vcf->split_gt($$gts1[$i]); my ($b1,$b2) = $vcf->split_gt($$gts2[$i]); if ( !defined $a2 ) { $a2 = $a1; } # haploid genotypes if ( !defined $b2 ) { $b2 = $b1; } if ( !(($a1 eq $b1 && $a2 eq $b2) or ($a1 eq $b2 && $a2 eq $b1)) ) { $nerrors++ } push @als1, $a1,$a2; push @als2, $b1,$b2; } my $dist = $to-$$site[0][1] < $$site[0][1]-$from ? 
$to-$$site[0][1] : $$site[0][1]-$from; $$opts{dist_errors}{$nerrors}++; my $p = $nerrors/$ngts; if ( $p==0 ) { $p=1./$ngts; } elsif ( $p==1 ) { $p=1 - 1./$ngts; } for (my $i=0; $i<@$gts1; $i++) { if ( !$phased[$i] ) { next; } my $a1 = $als1[2*$i]; my $a2 = $als1[2*$i+1]; my $b1 = $als2[2*$i]; my $b2 = $als2[2*$i+1]; if ( $a1 eq $a2 or $b1 eq $b2 ) { next; } # homozygous GT if ( $a1 eq $b1 && $a2 eq $b2 ) { #print STDERR "$i .. counting match $a1/$a2 $b1/$b2\n"; $lks_match[$i] += log($p*$p + (1-$p)*(1-$p)); $lks_mism[$i] += log($p*(1-$p) + (1-$p)*$p); } elsif ( $a1 eq $b2 && $a2 eq $b1 ) { #print STDERR "$i .. counting mismatch $a1/$a2 $b1/$b2\n"; $lks_match[$i] += log($p*(1-$p) + (1-$p)*$p); $lks_mism[$i] += log($p*$p + (1-$p)*(1-$p)); } else { next; } # different alleles might have been selected at multiallelic sites $nsites[$i]++; } } my $file1 = $$opts{file1}; my $file2 = $$opts{file2}; my @swapped = ( (0) x $$opts{nsamples} ); my @quals = (); my $log10 = log(10); for (my $i=0; $i<$$opts{nsamples}; $i++) { if ( !defined $lks_match[$i] ) { $lks_match[$i] = $lks_mism[$i] = log(0.5); } $swapped[$i] = $lks_match[$i]>$lks_mism[$i] ? $$opts{swapped}[$i] : -1*$$opts{swapped}[$i]; $quals[$i] = abs($lks_match[$i]-$lks_mism[$i])/$log10; log_msg($opts, sprintf "PS\t%s\t%s\t$chr:$from-$to\t%s\t%d\t%.1f\t%d\t%f\t%f\n", $file1,$file2, $$vcf{columns}[$i+9], $swapped[$i]==-1?0:1, $quals[$i], $nsites[$i], $lks_match[$i]/$log10, $lks_mism[$i]/$log10); } # Do not allow segment breaking while processing the buffer: this may help sometimes, but may also make things worse. my $min_PQ = $$opts{min_PQ}; delete($$opts{min_PQ}); # In case there is no overlap, reset the phasing set if ( !@quals ) { $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; } # Output the VCF line and quality for my $site (@$buffer) { # Which of the two overlapping VCF lines to output? Take the one farther from the end. 
my ($rec,$swap); if ( $to-$$site[0][1] > $$site[0][1]-$from ) { $rec = $$site[0]; $swap = $$opts{swapped}; } else { # Update the phasing set ID if ( @quals ) { for (my $i=0; $i<@quals; $i++) { if ( $quals[$i] < $$opts{min_join_quality} ) { $$opts{phasing_set}[$i]=0; } } @quals = (); } $rec = $$site[1]; $swap = \@swapped; } output_line($opts,$rec,$swap); } $$opts{min_PQ} = $min_PQ; @$buffer = (); $$opts{swapped} = \@swapped; } sub report_stats { my ($opts) = @_; log_msg($opts, "#NS Number of phased segments. Use `grep ^NS | cut -f 2-` to extract this part.\n"); log_msg($opts, "#NS The columns are:\n"); log_msg($opts, "#NS 1 .. sample\n"); log_msg($opts, "#NS 2 .. number of phased blocks\n"); log_msg($opts, "#NS 3 .. number of blocks created because of low PQ\n"); log_msg($opts, "#NS 4 .. number of blocks created because of low joining quality\n"); for my $i (sort { $$opts{phased_blocks}[$b] <=> $$opts{phased_blocks}[$a] } (0..($$opts{nsamples}-1))) { log_msg($opts,sprintf "NS\t%s\t%d\t%d\t%d\n", $$opts{vcf}{columns}[9+$i],$$opts{phased_blocks}[$i],$$opts{broken_blocks}[$i],$$opts{phased_blocks}[$i]-$$opts{broken_blocks}[$i]); } log_msg($opts, "#BP Break Points. Use `grep ^BP | cut -f 2-` to extract this part.\n"); log_msg($opts, "#BP The columns are:\n"); log_msg($opts, "#BP 1 .. chromosome\n"); log_msg($opts, "#BP 2 .. position\n"); log_msg($opts, "#BP 3 .. percent of samples with breakpoint at that position\n"); for my $break (@{$$opts{breakpoints}}) { log_msg($opts,$break); } log_msg($opts, "#ED Error Distribution. Use `grep ^ED | cut -f 2-` to extract this part.\n"); log_msg($opts, "#ED The columns are:\n"); log_msg($opts, "#ED 1 .. number of GT mismatches per site not attributable to phasing\n"); log_msg($opts, "#ED 2 .. 
frequency \n"); for my $nerrors (sort {$a<=>$b} keys %{$$opts{dist_errors}}) { log_msg($opts, "ED\t$nerrors\t$$opts{dist_errors}{$nerrors}\n"); } } vcftools_0.1.11/perl/fill-ref-md50000755000000000000000000001464712156354770015312 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use IPC::Open2; use Vcf; my $opts = parse_params(); fill_ref_md5($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "About: The script computes MD5 sum of the reference sequence and inserts\n", " 'reference' and 'contig' tags into header as recommended by VCFv4.1.\n", " The VCF file must be compressed and tabix indexed, as it takes advantage\n", " of the lightning fast tabix reheader functionality.\n", "Usage: fill-ref-md5 [OPTIONS] in.vcf.gz out.vcf.gz\n", "Options:\n", " -d, --dictionary Where to read/write computed MD5s. Opened in append mode, existing records are not touched.\n", " -i, --info Optional info on reference assembly (AS), species (SP), taxonomy (TX)\n", " -r, --refseq The reference sequence in fasta format indexed by samtools faidx\n", " -h, -?, --help This help message.\n", "Examples:\n", " fill-ref-md5 -i AS:NCBIM37,SP:\"Mus\\ Musculus\" -r NCBIM37_um.fa -d NCBIM37_um.fa.dict in.vcf.gz out.vcf.gz\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-i' || $arg eq '--info' ) { $$opts{info}=shift(@ARGV); next; } if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-d' || $arg eq '--dictionary' ) { $$opts{dictionary}=shift(@ARGV); next; } if ( -e $arg && !exists($$opts{file}) ) { $$opts{file} = $arg; next } if ( exists($$opts{file}) && !exists($$opts{outfile}) ) { $$opts{outfile} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\" or non-existent file. 
Run -h for help.\n"); } if ( !exists($$opts{refseq}) && !exists($$opts{dictionary}) ) { error("Expected one of -d or -r options\n"); } if ( !exists($$opts{file}) ) { error("No input VCF file given.\n"); } if ( !exists($$opts{outfile}) ) { error("No output VCF file given.\n"); } return $opts; } sub read_dict { my ($dict) = @_; my $out = {}; if ( !$dict or !-e $dict ) { return $out } open(my $fh,'<',$dict) or error("$dict: $!"); my $line=<$fh>; if ( $line ne "\@HD\tVN:1.0\tSO:unsorted\n" ) { error("Could not parse $dict: $line"); } while (my $line=<$fh>) { chomp($line); # @SQ SN:5 LN:152537259 UR:file:/lustre/scratch102/projects/mouse/ref/NCBIM37_um.fa M5:f90804fb8fe9cb06076d51a710fb4563 my @items = split(/\t/,$line); if ( @items != 5 ) { error("Could not parse $dict: $line"); } my $item = shift(@items); if ( $item ne '@SQ' ) { next; } my $rec = {}; for my $item (@items) { if ( !($item=~/^([^:]+):(.+)$/) ) { error("Could not parse $dict: [$item] [$line]"); } $$rec{$1} = $2; } if ( !exists($$rec{SN}) ) { error("No SN in [$dict] [$line]?"); } $$out{$$rec{SN}} = $rec; } close($fh); return $out; } sub add_to_dictionary { my ($opts,$dict,$chr) = @_; if ( !exists($$opts{refseq}) ) { error("The chromosome [$chr] not present in the dictionary and no reference sequence given.\n"); } my($md5_in,$md5_out,$ok,$len); eval { open2($md5_out,$md5_in,'md5sum'); $ok=1; }; if ( !$ok ) { error("md5sum: $!"); } my $cmd = "samtools faidx $$opts{refseq} $chr"; open(my $refseq,"$cmd |") or error("$cmd: $!"); # get rid of the first ">$chr" line. 
<$refseq>; while (my $line=<$refseq>) { chomp($line); print $md5_in $line; $len += length($line); } close($refseq); close($md5_in); my @md5 = <$md5_out>; close($md5_out); $md5[0] =~ s/\s+.*$//; chomp($md5[0]); if ( !$len ) { error("The sequence [$chr] not present in $$opts{refseq}\n"); } $$dict{$chr} = { dirty=>1, SN=>$chr, LN=>$len, UR=>'file://'.$$opts{refseq}, M5=>$md5[0] }; $$dict{dirty} = 1; } sub write_dictionary { my ($opts,$dict) = @_; if ( !$$dict{dirty} or !exists($$opts{dictionary}) ) { return } my $needs_header = !-e $$opts{dictionary} ? 1 : 0; open(my $fh,'>>',$$opts{dictionary}) or error("$$opts{dictionary}: $!"); print $fh "\@HD\tVN:1.0\tSO:unsorted\n" unless !$needs_header; for my $key (sort keys %$dict) { if ( ref($$dict{$key}) ne 'HASH' or !$$dict{$key}{dirty} ) { next; } my $sn = $$dict{$key}{SN}; my $ln = $$dict{$key}{LN}; my $ur = $$dict{$key}{UR}; my $m5 = $$dict{$key}{M5}; print $fh "\@SQ\tSN:$sn\tLN:$ln\tUR:$ur\tM5:$m5\n"; } close($fh); } sub write_header { my ($opts,$dict,$chroms) = @_; my %info; if ( exists($$opts{info}) ) { $$opts{info} =~ s/AS:/assembly:/; $$opts{info} =~ s/SP:/species:/; $$opts{info} =~ s/TX:/taxonomy:/; for my $item (split(/,/,$$opts{info})) { my ($key,$value) = split(/:/,$item); if ( !defined $value ) { error("Could not parse the info: [$item] [$$opts{info}]"); } $info{$key} = $value; } } my $vcf = Vcf->new(file=>$$opts{file}); $vcf->parse_header(); my $uri = $$opts{refseq}=~m{^[^/:]+:} ? '' : 'file:'; $vcf->add_header_line({key=>'reference', value=>"$uri$$opts{refseq}"}); for my $chrom (@$chroms) { my %line = ( key => 'contig', ID => $$dict{$chrom}{SN}, length => $$dict{$chrom}{LN}, md5 => $$dict{$chrom}{M5}, %info ); $vcf->add_header_line(\%line); } open(my $out,'>',"$$opts{outfile}.header") or error("$$opts{outfile}.header: $!"); print $out $vcf->format_header(); close($out); } sub fill_ref_md5 { my ($opts) = @_; # List chromosomes my @chroms = `tabix -l $$opts{file}`; if ( $? 
) { error("The command failed: tabix -l $$opts{file}\n"); } # Read dictionary my $dict = read_dict($$opts{dictionary},\@chroms); for my $chr (@chroms) { chomp($chr); if ( !exists($$dict{$chr}) ) { add_to_dictionary($opts,$dict,$chr); } } write_dictionary($opts,$dict); write_header($opts,$dict,\@chroms); `tabix -r $$opts{outfile}.header $$opts{file} > $$opts{outfile}`; } vcftools_0.1.11/perl/vcf-shuffle-cols0000755000000000000000000000437512156354770016272 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); concat($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Reorder columns to match the order in the template VCF.\n", "Usage: vcf-shuffle-cols [OPTIONS] -t template.vcf.gz file.vcf.gz > out.vcf\n", "Options:\n", " -t, --template The file with the correct order of the columns.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-t' || $arg eq '--template' ) { $$opts{template}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file}=$arg; next } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{template}) ) { error("Missing the -t option.\n"); } return $opts; } sub concat { my ($opts) = @_; my $tmpl = Vcf->new(file=>$$opts{template}); $tmpl->parse_header(); $tmpl->close(); my $vcf = $$opts{file} ? 
Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); $vcf->parse_header(); # Check if one-to-one correspondence can be found and create a mapping my @new_to_old = (); for my $tcol (@{$$tmpl{columns}}) { if ( !exists($$vcf{has_column}{$tcol}) ) { error("TODO: the column names do not match\n"); } } for my $vcol (@{$$vcf{columns}}) { if ( !exists($$tmpl{has_column}{$vcol}) ) { error("TODO: the column names do not match\n"); } my $new = $$tmpl{has_column}{$vcol} - 1; my $old = $$vcf{has_column}{$vcol} - 1; $new_to_old[$new] = $old; } # Output the header with modified column order my $ncols = @{$$tmpl{columns}} - 1; my @cols = @{$$tmpl{columns}}[9..$ncols]; print $vcf->format_header(\@cols); while (my $x=$vcf->next_data_array()) { print $$x[0]; for (my $i=1; $i<=$ncols; $i++) { my $idx = $new_to_old[$i]; print "\t".$$x[$idx]; } print "\n"; } } vcftools_0.1.11/perl/vcf-tstv0000755000000000000000000000343412156354770014673 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; my $opts = parse_params(); calc_tstv(); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } print "Usage: cat file.vcf | vcf-tstv\n", "Options:\n", " -h, -?, --help This help message.\n", "\n"; exit -1; } sub parse_params { my $opts = {}; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub calc_tstv { my $stats; my $n=0; my $multiallelic=0; while (my $line=) { if ( substr($line,0,1) eq '#' ) { next; } $n++; my $i=-1; for (1..3) { $i=index($line,"\t",$i+1); } my $j = index($line,"\t",$i+1); my $ref = substr($line,$i+1,$j-$i-1); if ( length($ref)>1 ) { next; } $i = index($line,"\t",$j+1); my $alt = substr($line,$j+1,$i-$j-1); if ( $alt eq '.' 
) { next; } $i = index($alt,','); if ( $i!=-1 ) { $alt = substr($alt,0,$i); } # only first ALT is counted if ( length($alt)>1 ) { next; } if ( $i!=-1 ) { $multiallelic++ } $$stats{$ref.$alt}++; } my $ts = 0; for my $mut (qw(AG GA CT TC)) { if ( exists($$stats{$mut}) ) { $ts += $$stats{$mut}; } } my $tv = 0; for my $mut (qw(AC CA GT TG AT TA CG GC)) { if ( exists($$stats{$mut}) ) { $tv += $$stats{$mut}; } } my $ratio = $tv ? $ts/$tv : 0; printf "%.2f\t%d\t(ts=%d tv=%d total=%d skipped=%d multiallelic=%d)\n", $ratio,$ts+$tv, $ts,$tv,$n,$n-$ts-$tv,$multiallelic; } vcftools_0.1.11/perl/vcf-merge0000755000000000000000000005443612156354770015002 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); merge_vcf_files($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak join('',@msg); } die "About: Merges VCF files by position, creating multi-sample VCFs from fewer-sample VCFs.\n", " The tool requires bgzipped and tabix indexed VCF files on input. (E.g. bgzip file.vcf; tabix -p vcf file.vcf.gz)\n", " If you need to concatenate VCFs (e.g. files split by chromosome), look at vcf-concat instead.\n", "Usage: vcf-merge [OPTIONS] file1.vcf file2.vcf.gz ... > out.vcf\n", "Options:\n", " -c, --collapse treat as identical sites with differing alleles [any]\n", " -d, --remove-duplicates If there should be two consecutive rows with the same chr:pos, print only the first one.\n", " -H, --vcf-header Use the VCF header\n", " -h, -?, --help This help message.\n", " -r, --regions Do only the given regions (comma-separated list or one region per line in a file).\n", " -R, --ref-for-missing Use the REF allele instead of the default missing genotype. Because it is not obvious\n", " what ploidy should be used, a user-defined string is used instead (e.g. 
0/0).\n", " -s, --silent Try to be a bit more silent, no warnings about duplicate lines.\n", " -t, --trim-ALTs If set, redundant ALTs will be removed\n", "\n"; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args => [$0, @ARGV], joiner => { 'DP4' => \&joiner_dp4, 'DP' => \&joiner_sum, }, trim_redundant_ALTs => 0, collapse_any => 1, }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--remove-duplicates' ) { $$opts{rm_dups}=1; next; } if ( $arg eq '-R' || $arg eq '--ref-for-missing' ) { $$opts{ref_for_missing}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--trim-ALTs' ) { $$opts{trim_redundant_ALTs}=1; next; } if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{silent_dups}=1; next; } if ( $arg eq '-H' || $arg eq '--vcf-header' ) { $$opts{vcf_header}=shift(@ARGV); next; } if ( $arg eq '-c' || $arg eq '--collapse' ) { $$opts{collapse_any} = 1; my $c = shift(@ARGV); if ( $c eq 'snps' ) { $$opts{collapse_snps}=1; } elsif ( $c eq 'indels' ) { $$opts{collapse_indels}=1; } elsif ( $c eq 'both' ) { $$opts{collapse_snps}=1; $$opts{collapse_indels}=1; } elsif ( $c eq 'any' ) { $$opts{collapse_any}=1; } elsif ( $c eq 'none' ) { $$opts{collapse_any}=0; $$opts{collapse_snps}=0; $$opts{collapse_indels}=0; } else { error("Expected one of with -c"); } next; } if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{regions_list}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { push @{$$opts{files}},$arg; next; } error("Unknown parameter or non-existent file \"$arg\". Run -? for help.\n"); } if ( !exists($$opts{files}) ) { error() } return $opts; } # Returns the common prefix of the files. 
sub common_prefix { my ($files) = @_; my @paths; my $len = -1; for my $file (@$files) { my @path = split(m{/+},$file); if ( $len<0 || $len>scalar @path ) { $len=scalar @path; } push @paths, \@path; } my @common; for (my $i=0; $i<$len; $i++) { my $identical=1; for (my $ifile=1; $ifile[$i] ne $paths[0]->[$i] ) { $identical=0; last; } } if ( !$identical ) { last; } push @common, $paths[0]->[$i]; } return join('/+',@common); } sub read_region_list { my ($opts) = @_; my @regions = (); if ( exists($$opts{regions_list}) ) { if ( -e $$opts{regions_list} ) { open(my $rgs,'<',$$opts{regions_list}) or error("$$opts{regions_list}: $!"); while (my $line=<$rgs>) { chomp($line); push @regions, $line; } close($rgs); } else { @regions = split(/,/,$$opts{regions_list}); } } return (@regions); } sub check_AGtags_definition { my ($vcf) = @_; if ( $$vcf{version} >= 4.1 ) { return; } # Whatever is the value set to, the user takes the responsibility for the merging strategy used if ( exists($ENV{DONT_FIX_VCF40_AG_TAGS}) ) { return; } my @tags; if ( exists($$vcf{header}{INFO}{PL}) && $$vcf{header}{INFO}{PL}{Number} != -1 ) { push @tags, 'PL'; } if ( exists($$vcf{header}{INFO}{GL}) && $$vcf{header}{INFO}{GL}{Number} != -1 ) { push @tags, 'GL'; } if ( exists($$vcf{header}{INFO}{AC}) && $$vcf{header}{INFO}{AC}{Number} != -1 ) { push @tags, 'AC'; } if ( exists($$vcf{header}{INFO}{AF}) && $$vcf{header}{INFO}{AF}{Number} != -1 ) { push @tags, 'AF'; } if ( !@tags ) { return; } $ENV{DONT_FIX_VCF40_AG_TAGS} = 1; my $tags = join(',',@tags); print STDERR "Warning: The $tags tag(s) will not be merged correctly for multiallelic sites.\n", " To be handled correctly, please redefine with Number=. 
or set the environment\n", " variable DONT_FIX_VCF40_AG_TAGS=0.\n"; } sub init_cols { my ($opts,$vcf_out) = @_; my $prefix; my @regions = read_region_list($opts); my @vcfs; my @cols; my %has_chrom; my %col_names; my $icol = 9; my $ncols_total = 0; if ( !$$opts{has_col_names} ) { $prefix = common_prefix($$opts{files}); } # Go through all files and read header, obtain list of chromosomes. The file names will be used for columns, unless # they were read from the header. for my $file (@{$$opts{files}}) { my $vcf = Vcf->new(file=>$file); $$vcf{line_buffer} = []; $vcf->parse_header(); check_AGtags_definition($vcf); $vcf->close(); push @vcfs, $vcf; # Precompute the weighting factor for the QUAL column my $ncols = scalar @{$$vcf{columns}} - 9; if ( $ncols<=0 ) { $ncols = 1; } $$vcf{qual_weight} = 1.0*$ncols; $ncols_total += $ncols; # Update the list of known chromosomes if ( !exists($$opts{regions_list}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 1; push @regions, $chr; } } my $col_prefix = ''; if ( !$$opts{has_col_names} ) { # Make the column names nice - strip common prefix and the suffix .vcf.gz $col_prefix = $file; $col_prefix =~ s{^/*$prefix/*}{}; $col_prefix =~ s/\.gz$//i; $col_prefix =~ s/\.vcf$//i; $col_prefix .= '_'; } if ( !exists($$vcf{columns}) ) { error("No header present? 
$file\n"); } # Create good names for the columns in the merged vcf file my @vcf_cols = @{$$vcf{columns}}; $$vcf{__col_names} = []; for my $col (@vcf_cols[9..$#vcf_cols]) { my $col_name = $col; if ( $$opts{has_col_names} ) { if ( $icol >= @{$$vcf_out{columns}} ) { error("Fewer columns in the header than in the VCF files total.\n"); } $col_name = $$vcf_out{columns}[$icol]; $icol++; if ( exists($col_names{$col_name}) ) { error("The column names not unique in the header: $col_name\n"); } } else { if ( exists($col_names{$col_name}) ) { $col_name = $col_prefix.$col; } if ( exists($col_names{$col_name}) ) { warn("FIXME: the column name [$col_name] not unique.\n"); } } warn("Using column name '$col_name' for $file:$col\n"); $col_names{$col_name} = 1; push @cols, $col_name; push @{$$vcf{__col_names}}, $col_name; } } if ( $$opts{has_col_names} && $icol!=@{$$vcf_out{columns}} ) { error("More columns in the header than in the VCF files total.\n"); } # QUAL weighting for my $vcf (@vcfs) { $$vcf{qual_weight} /= $ncols_total; } $$opts{vcfs} = \@vcfs; $$opts{cols} = \@cols; $$opts{regions} = \@regions; } sub merge_vcf_files { my ($opts) = @_; # Create output VCF my $vcf_out; if ( $$opts{vcf_header} ) { $vcf_out = Vcf->new(file=>$$opts{vcf_header}); $vcf_out->parse_header(); if ( $$vcf_out{columns} && @{$$vcf_out{columns}} ) { $$opts{has_col_names}=1; } } else { $vcf_out = Vcf->new(); } $$vcf_out{trim_redundant_ALTs} = $$opts{trim_redundant_ALTs}; init_cols($opts,$vcf_out); my @regions = @{$$opts{regions}}; my @cols = @{$$opts{cols}}; my @vcfs = @{$$opts{vcfs}}; # Get the header of the output VCF ready $vcf_out->add_columns(@cols); if ( !$$vcf_out{has_header} ) { for my $vcf (@vcfs) { # To get the missig fields filled by the default values for my $hline (@{$$vcf{header_lines}}) { if ( $$hline{key} eq 'fileformat' ) { next; } $vcf_out->add_header_line($hline,silent=>1); } } } # List source files my $source; for (my $i=0; $i<@vcfs; $i++) { if ( $i ) { $source .= ','; } $source .= 
"$i:$vcfs[$i]{file}"; } $vcf_out->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf_out->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp'); $vcf_out->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'}); my $have_samples = @{$$vcf_out{columns}}>9 ? 1 : 0; $vcf_out->recalc_ac_an($have_samples ? 2 : 0); $vcf_out->add_header_line({key=>'INFO',ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf_out->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); print $vcf_out->format_header(); # Go through all VCF files simultaneously and output each line, one region at a time. for my $region (@regions) { # Open files for my $vcf (@vcfs) { delete($$vcf{done}); $vcf->open(region=>$region); } while ( my $pos=advance_position($opts,\@vcfs) ) { my %out; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{FORMAT} = []; my %format; my %info; my @src_files; my %filters; my (@quals,@qual_weights,$qual_weights_sum,%ac,$an); my %ref_alt_map = (); # Find out the REFs and ALTs: in VCFv4.0, the REFs can differ and ALTs must be converted for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } if ( !exists($out{CHROM}) ) { $out{CHROM} = $$line{CHROM}; } my $ref = $$line{REF}; for my $alt (@{$$line{ALT}}) { $ref_alt_map{$ref}{$alt}=$alt; } } # Do the REF,ALT conversion only when necessary my $new_ref; if ( scalar keys %ref_alt_map > 1 ) { $new_ref = $vcf_out->fill_ref_alt_mapping(\%ref_alt_map); } if ( !$have_samples or !$$opts{trim_redundant_ALTs} ) { # Do not loose information from the ALT column when samples are not present my %alts; for my $vcf (@vcfs) { my $line = $$vcf{last_line}; if ( !$line ) { next; } my $ref = $$line{REF}; for my $alt (@{$$line{ALT}}) { $alts{$ref_alt_map{$ref}{$alt}}=1; } delete($alts{'.'}); 
$out{ALT} = [ keys %alts ]; } } for (my $ivcf=0; $ivcf<@vcfs; $ivcf++) { my $vcf = $vcfs[$ivcf]; my $line = $$vcf{last_line}; # If this file does not have a record for this position, then for all its columns output undef gtype if ( !$line ) { for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $name = $$vcf{__col_names}->[$i]; $out{gtypes}{$name}{GT} = exists($$opts{ref_for_missing}) ? $$opts{ref_for_missing} : $$vcf_out{defaults}{GT}; } next; } # Check if the site has been filtered if ( scalar @{$$line{FILTER}}>1 or ($$line{FILTER}[0] ne $$vcf{filter_passed} && $$line{FILTER}[0] ne $$vcf{defaults}{default}) ) { push @src_files,$ivcf.'f'; } else { push @src_files,$ivcf; } # Collect information for the FILTER field for my $flt (@{$$line{FILTER}}) { if ( $flt eq $$vcf{filter_passed} ) { $filters{$$vcf_out{filter_passed}} = 1; } elsif ( $flt ne $$vcf{defaults}{default} ) { $filters{$flt} = 1; } } # Collect information for the QUAL field if ( $$line{QUAL} ne $$vcf{defaults}{QUAL} && $$line{QUAL} ne $$vcf{defaults}{default} && $$line{QUAL}>0 ) { push @quals,$$line{QUAL}; push @qual_weights,$$vcf{qual_weight}; $qual_weights_sum += $$vcf{qual_weight}; } if ( $$line{ID} ne '.' && $out{ID} eq '.' 
) { $out{ID}=$$line{ID}; } # Remember the FORMAT fields for my $field (@{$$line{FORMAT}}) { $format{$field} = 1; } # VCF without genotypes: calculate AC,AN if present if ( !$have_samples ) { if ( exists($$line{INFO}{AN}) ) { $an += $$line{INFO}{AN}; } if ( exists($$line{INFO}{AC}) ) { my (@acs) = split(/,/,$$line{INFO}{AC}); for (my $i=0; $i<@acs; $i++) { my $alt = $ref_alt_map{$$line{REF}}{$$line{ALT}[$i]}; $ac{$alt} += $acs[$i]; } } } # Join the INFO field for my $inf (keys %{$$line{INFO}}) { # When conflicting INFO fields are present, use the first one, unless a joining method exists if ( exists($info{$inf}) ) { if ( exists($$opts{joiner}{$inf}) ) { &{$$opts{joiner}{$inf}}(\$info{$inf},$$line{INFO}{$inf}); } next; } $info{$inf} = $$line{INFO}{$inf}; } my $ref = $$line{REF}; # The ALT column may change after the merge, take care of ALT dependent tags such as GL. if ( $have_samples ) { if ( defined $new_ref ) { $vcf->parse_AGtags($line,\%ref_alt_map,$$line{REF}); } else { $vcf->parse_AGtags($line); } } # Now fill in the genotype information for each column for (my $i=0; $i<@{$$vcf{__col_names}}; $i++) { my $ori_name = $$vcf{columns}->[$i+9]; my $out_name = $$vcf{__col_names}->[$i]; $out{gtypes}{$out_name} = $$line{gtypes}{$ori_name}; # This is to convert 0/1 to G/C my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($line,$ori_name); if ( defined $new_ref ) { my @als; for my $al (@$alleles) { push @als, exists($ref_alt_map{$ref}{$al}) ? $ref_alt_map{$ref}{$al} : '.'; } $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype(\@als,$seps); } else { $out{gtypes}{$out_name}{GT} = $vcf->format_haplotype($alleles,$seps); } } $out{REF} = defined $new_ref ? $new_ref : $ref; } $out{INFO} = { %info }; $out{INFO}{SF} = join(',',@src_files); # Output the QUAL information my $qual; for (my $i=0; $i<@quals; $i++) { $qual += $quals[$i] * $qual_weights[$i] * (1.0 / $qual_weights_sum); } $out{QUAL} = defined $qual ? 
sprintf("%.2f",$qual) : $$vcf_out{defaults}{QUAL}; # Output the FILTER information: remove PASS or missing value if some other information # is present. delete($filters{$$vcf_out{defaults}{default}}); if ( exists($filters{$$vcf_out{filter_passed}}) && scalar keys %filters > 1 ) { delete($filters{$$vcf_out{filter_passed}}); } $out{FILTER} = [ keys %filters ]; if ( !@{$out{FILTER}} ) { push @{$out{FILTER}},$$vcf_out{defaults}{default}; } # The GT field must come as first delete($format{GT}); $out{FORMAT} = ['GT']; for my $key (keys %format) { push @{$out{FORMAT}},$key; } if ( $have_samples ) { $vcf_out->format_genotype_strings(\%out); } else { if ( defined $an ) { $out{INFO}{AN}=$an; } if ( scalar keys %ac ) { my @acs; for my $alt (@{$out{ALT}}) { # Some of the files may not have AC, the AC count can be undefined in such a case. push @acs, exists($ac{$alt}) ? $ac{$alt} : 0; } $out{INFO}{AC} = join(',',@acs); } } print $vcf_out->format_line(\%out); } } for my $vcf (@vcfs) { $vcf->close() or error("close failed: $$vcf{file}\n"); } } sub advance_position { my ($opts,$vcfs) = @_; my $min_pos; for my $vcf (@$vcfs) { fill_buffer($opts,$vcf) unless $$vcf{done}; if ( @{$$vcf{line_buffer}} && (!defined $min_pos or $min_pos>$$vcf{line_buffer}[0]{POS}) ) { $min_pos = $$vcf{line_buffer}[0]{POS}; } } if ( !defined $min_pos ) { return undef; } my ($first,$has_snp,$has_indel); for my $vcf (@$vcfs) { delete($$vcf{last_line}); if ( @{$$vcf{line_buffer}} && $min_pos ne $$vcf{line_buffer}[0]{POS} ) { next; } if ( !defined $first ) { $$vcf{last_line} = shift @{$$vcf{line_buffer}}; $first = $$vcf{last_line}; next; } my $irec; for (my $i=0; $i<@{$$vcf{line_buffer}}; $i++) { my $line = $$vcf{line_buffer}[$i]; if ( $$line{POS} ne $$first{POS} ) { last; } if ( $$opts{collapse_any} ) { $irec=$i; last; } # checking position only if ( $$opts{collapse_snps} && $$first{variant_type}&1 && $$line{variant_type}&1 ) { $irec=$i; last; } if ( $$opts{collapse_indels} && $$first{variant_type}&2 && 
$$line{variant_type}&2 ) { $irec=$i; last; } if ( $$vcf{line_buffer}[$i]{REF} ne $$first{REF} ) { next; } # refs do not match for my $al1 (@{$$line{ALT}}) { for my $al2 (@{$$first{ALT}}) { if ( $al1 eq $al2 ) { $irec=$i; last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { last; } } if ( defined $irec ) { $$vcf{last_line} = splice(@{$$vcf{line_buffer}},$irec,1); } } return $min_pos; } sub fill_buffer { my ($opts,$vcf) = @_; if ( @{$$vcf{line_buffer}} && $$vcf{line_buffer}[0]{POS}!=$$vcf{line_buffer}[-1]{POS} ) { return; } while ( 1 ) { my $line = $vcf->next_data_hash(); if ( !$line ) { $$vcf{done} = 1; return; } if ( !$$opts{collapse_any} ) { for my $al (@{$$line{ALT}}) { my ($type,$len,$ht) = $vcf->event_type($$line{REF},$al); if ( $type eq 's' or $type eq 'r' ) { $$line{variant_type} |= 1; } if ( $type eq 'i' or $type eq 'o' ) { $$line{variant_type} |= 2; } } } push @{$$vcf{line_buffer}}, $line; if ( $$vcf{line_buffer}[0]{POS} != $$vcf{line_buffer}[-1]{POS} ) { return; } } } # Field joiner methods sub joiner_sum { my ($ori,$new) = @_; $$ori += $new; } sub joiner_dp4 { my ($ori,$new) = @_; my @vals1 = split(/,/,$$ori); my @vals2 = split(/,/,$new); if ( @vals1 != @vals2 ) { error("Cannot join: $$ori vs $new\n"); } for (my $i=0; $i<@vals1; $i++) { $vals1[$i] += $vals2[$i]; } $$ori = join(',',@vals1); } vcftools_0.1.11/perl/vcf-consensus0000755000000000000000000001355412156354770015717 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); do_consensus($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: cat ref.fa | vcf-consensus [OPTIONS] in.vcf.gz > out.fa\n", "Options:\n", " -h, -?, --help This help message.\n", " -H, --haplotype Apply only variants for the given haplotype (1,2)\n", " -s, --sample If not given, all variants are applied\n", "Examples:\n", " samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n", 
"\n"; } sub parse_params { my $opts = { }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-s' || $arg eq '--sample' ) { $$opts{sample}=shift(@ARGV); next; } if ( $arg eq '-H' || $arg eq '--haplotype' ) { $$opts{haplotype}=shift(@ARGV); next; } if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( exists($$opts{haplotype}) && !exists($$opts{sample}) ) { error("Expected -s option with -H.\n"); } return $opts; } sub do_consensus { my ($opts) = @_; my $vcf = Vcf->new(file=>$$opts{vcf_file}); $vcf->parse_header; if ( exists($$opts{sample}) ) { if ( !exists($$vcf{has_column}{$$opts{sample}}) ) { error("No such sample: $$opts{sample}"); } $$opts{vcf} = $vcf; $$opts{sample_col} = $$vcf{has_column}{$$opts{sample}}; } my $chrs = $vcf->get_chromosomes(); my %chrs = map { $_=>0 } @$chrs; my ($chr,$vcf_pos,$warned,$vcf_line); while (my $line=) { if ( $line=~/^>([^:\s]+)/ ) { flush_fa_buffer($opts,0); $chr = $1; my $rest = $'; $$opts{fa_pos} = ($rest=~/^:(\d+)-\d+$/) ? $1 : 1; $$opts{fa_idx} = 0; $$opts{fa_frz} = 0; if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; } my $region = $$opts{fa_pos} > 1 ? "$chr:$$opts{fa_pos}" : $chr; $vcf->open(region=>$region); print $line; next; } chomp($line); $$opts{fa_buf} .= $line; $$opts{fa_len} += length($line); while ( defined($vcf_line = $vcf->next_data_array()) ) { # can the beginning of the buffer be printed? if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1] ) { $vcf->_unread_line($vcf_line); flush_fa_buffer($opts,60); last; } # is the buffer long enough? 
if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1]+length($$vcf_line[3]) ) { $vcf->_unread_line($vcf_line); last; } apply_variant($opts,$vcf_line); } if ( !defined $vcf_line ) { flush_fa_buffer($opts,60); } } flush_fa_buffer($opts,0); for my $chr (keys %chrs) { if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); } } } sub flush_fa_buffer { my ($opts,$len) = @_; while ( $$opts{fa_len} && $$opts{fa_len}>=60 ) { print substr($$opts{fa_buf},0,60,''), "\n"; $$opts{fa_len} -= 60; $$opts{fa_pos} += 60 - $$opts{fa_idx}; $$opts{fa_idx} = 0; } if ( $len or !$$opts{fa_len} ) { return; } print $$opts{fa_buf},"\n"; $$opts{fa_pos} += $$opts{fa_len}-$$opts{fa_idx}; $$opts{fa_len} = 0; $$opts{fa_buf} = ''; $$opts{fa_idx} = 0; } sub apply_variant { my ($opts,$vline) = @_; if ( $$vline[4] eq '.' ) { return; } my $hap = exists($$opts{haplotype}) ? $$opts{haplotype} : 0; my $alt; if ( !exists($$opts{sample_col}) ) { my $idx; $alt = ($idx=index($$vline[4],','))==-1 ? $$vline[4] : substr($$vline[4],0,$idx); } else { my @als = $$opts{vcf}->split_gt($$vline[$$opts{sample_col}-1]); if ( $hap ) { # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect if ( $hap <= @als && $als[$hap-1] ne '0' ) { $alt = $$opts{vcf}->get_field($$vline[4],$als[$hap-1]-1,','); } } else { for my $al (@als) { if ( $al eq '0' ) { next; } $alt = $$opts{vcf}->get_field($$vline[4],$al-1,','); last; } } if ( !defined $alt ) { return; } } if ( $$vline[1] <= $$opts{fa_frz} ) { print STDERR "Note: Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n"; return; } my $pos = $$vline[1] - $$opts{fa_pos} + $$opts{fa_idx}; if ( $pos<0 or $pos>=$$opts{fa_len} ) { error("FIXME: $$vline[0]:$$vline[1] .. 
$$opts{fa_pos},$pos,$$opts{fa_len},$$opts{fa_frz}\n"); } # Sanity check my $ref_len = length($$vline[3]); if ( $$vline[3] ne substr($$opts{fa_buf},$pos,$ref_len) ) { error(sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n", substr($$opts{fa_buf},$pos,$ref_len), substr($$opts{fa_buf},$pos+1,$ref_len+5), $$vline[3], $$opts{fa_frz}?$$opts{fa_frz}:0); } my $alt_len = length($alt); substr($$opts{fa_buf},$pos,$ref_len,$alt); $$opts{fa_len} += $alt_len - $ref_len; $$opts{fa_pos} += $ref_len; # position with respect to the original reference sequence $$opts{fa_idx} += $alt_len; # position in the modified sequence $$opts{fa_frz} = $$vline[1] + $ref_len - 1; # freeze changes until this position } vcftools_0.1.11/perl/vcf-annotate0000755000000000000000000014406212156354770015507 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my %filters = ( MinAB => { dflt=>2, usage=>'INT', desc=>'Minimum number of alternate bases (INFO/DP4)', nick=>'a' }, SnpCluster => { dflt=>undef, usage=>'INT1,INT2', desc=>"Filters clusters of 'INT1' or more SNPs within a run of 'INT2' bases", nick=>'c' }, MinDP => { dflt=>2, usage=>'INT', desc=>"Minimum read depth (INFO/DP or INFO/DP4)", nick=>'d' }, MaxDP => { dflt=>10_000_000, usage=>'INT', desc=>"Maximum read depth (INFO/DP or INFO/DP4)", nick=>'D' }, MinMQ => { dflt=>10, usage=>'INT', desc=>"Minimum RMS mapping quality for SNPs (INFO/MQ)", nick=>'q' }, SnpGap => { dflt=>10, usage=>'INT', desc=>"SNP within INT bp around a gap to be filtered", nick=>'w' }, GapWin => { dflt=>3, usage=>'INT', desc=>"Window size for filtering adjacent gaps", nick=>'W' }, StrandBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for strand bias (INFO/PV4)", nick=>'1' }, BaseQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for baseQ bias (INFO/PV4)", nick=>'2' }, MapQualBias => { dflt=>0, usage=>'FLOAT', desc=>"Min P-value for mapQ 
bias (INFO/PV4)", nick=>'3' }, EndDistBias => { dflt=>1e-4, usage=>'FLOAT', desc=>"Min P-value for end distance bias (INFO/PV4)", nick=>'4' }, RefN => { dflt=>'', usage=>'', desc=>"Reference base is N", nick=>'r' }, Qual => { dflt=>'10', usage=>'INT', desc=>"Minimum value of the QUAL field", nick=>'Q' }, VDB => { dflt=>'0', usage=>'FLOAT', desc=>"Minimum Variant Distance Bias (INFO/VDB)", nick=>'v' }, HWE => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum P-value for HWE and F<0 (invokes --fill-HWE)", nick=>'H' }, HWE_G3 => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum P-value for HWE and F<0 (INFO/HWE and INFO/G3)", nick=>'HG' }, HWE2 => { dflt=>undef, usage=>'FLOAT', desc=>"Minimum P-value for HWE (plus F<0) (INFO/AC and INFO/AN or --fill-AC-AN)", nick=>'H2' }, ); my $opts = parse_params(); annotate($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } my @filters; for my $key (sort {lc($filters{$a}{nick}) cmp lc($filters{$b}{nick})} keys %filters) { push @filters, sprintf("\t%s, %-25s\t\t%s [%s]\n", $filters{$key}{nick},$key.' '.$filters{$key}{usage},$filters{$key}{desc},defined($filters{$key}{dflt})? $filters{$key}{dflt} : ''); } print "About: Annotates VCF file, adding filters or custom annotations. Requires tabix indexed file with annotations.\n", " Currently it can annotate ID, QUAL, FILTER and INFO columns, but will be extended on popular demand.\n", " For examples of user-defined filters see online documentation or examples/filters.txt in vcftools distribution.\n", "Usage: cat in.vcf | vcf-annotate [OPTIONS] > out.vcf\n", "Options:\n", " -a, --annotations The tabix indexed file with the annotations: CHR\\tFROM[\\tTO][\\tVALUE]+.\n", " -c, --columns The list of columns in the annotation file, e.g. CHROM,FROM,TO,-,QUAL,INFO/STR,INFO/GN. The dash\n", " in this example indicates that the third column should be ignored. If TO is not\n", " present, it is assumed that TO equals to FROM. 
When REF and ALT columns are present, only\n", " matching lines are annotated.\n", " -d, --description Header annotation, e.g. key=INFO,ID=HM2,Number=0,Type=Flag,Description='HapMap2 membership'.\n", " The descriptions can be read from a file, one annotation per line.\n", " --fill-AC-AN (Re)Calculate AC and AN tags\n", " --fill-HWE (Re)Calculate HWE, AC and AN tags\n", " --fill-ICF (Re)Calculate Inbreeding Coefficient F, HWE, AC and AN\n", " --fill-type Annotate INFO/TYPE with snp,del,ins,mnp,complex\n", " -f, --filter Apply filters, list is in the format flt1=value/flt2/flt3=value/etc. If argument to -f is a file,\n", " user-defined filters be applied. See User Defined Filters below.\n", " -H, --hard-filter Remove lines with FILTER anything else than PASS or \".\"\n", " -n, --normalize-alleles Make REF and ALT alleles more compact if possible (e.g. TA,TAA -> T,TA).\n", " -r, --remove Comma-separated list of tags to be removed (e.g. ID,INFO/DP,FORMAT/DP,FILTER).\n", " -h, -?, --help This help message.\n", "Filters:\n", sprintf("\t+ %-25s\t\tApply all filters with default values (can be overriden, see the example below).\n",''), sprintf("\t-X %-25s\t\tExclude the filter X\n",''), join('',@filters), "Examples:\n", " zcat in.vcf.gz | vcf-annotate -a annotations.gz -d descriptions.txt -c FROM,TO,CHROM,ID,INFO/DP | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -f +/-a/c=3,10/q=3/d=5/-D -a annotations.gz -d key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name' | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -a dbSNPv132.tab.gz -c CHROM,POS,REF,ALT,ID,-,-,- | bgzip -c >out.vcf.gz \n", " zcat in.vcf.gz | vcf-annotate -r FILTER/MinDP | bgzip -c >out.vcf.gz \n", "Where descriptions.txt contains:\n", " key=INFO,ID=GN,Number=1,Type=String,Description='Gene Name'\n", " key=INFO,ID=STR,Number=1,Type=Integer,Description='Strand'\n", "The file dbSNPv132.tab.gz with dbSNP IDs can be downloaded from\n", " 
ftp://ftp.sanger.ac.uk/pub/1000genomes/pd3/dbSNP/\n", "\n"; exit -1; } sub parse_params { $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)"; my $opts = { args=>[$0, @ARGV], }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '-d' || $arg eq '--description' ) { my $desc = shift(@ARGV); if ( -e $desc ) { open(my $fh,'<',$desc) or error("$desc: $!"); while (my $line=<$fh>) { if ( $line=~/^\s*$/ or $line=~/^#/ ) { next; } chomp($line); push @{$$opts{desc}}, $line; } close($fh); } else { push @{$$opts{desc}}, $desc; } next; } if ( $arg eq '-f' || $arg eq '--filter' ) { my $filter = shift(@ARGV); parse_filters($opts,$filter); next; } if ( $arg eq '-c' || $arg eq '--columns' ) { my $cols = shift(@ARGV); $$opts{cols} = [ split(/,/,$cols) ]; next; } if ( $arg eq '-r' || $arg eq '--remove' ) { my $tags = shift(@ARGV); my @tags = split(/,/,$tags); for my $tag (@tags) { my ($col,$tag) = split(m{/},$tag); if ( !defined $tag ) { if ( $col eq 'ID' ) { $$opts{remove}{$col}=1; next; } if ( $col eq 'QUAL' ) { $$opts{remove}{$col}=1; next; } if ( $col eq 'FILTER' ) { $$opts{remove}{$col}=1; next; } $$opts{remove}{INFO}{$col} = 1; $$opts{remove}{FORMAT}{$col} = 1; } elsif ( $col eq 'FILTER' ) { $$opts{remove}{$col}{$tag} = 0; } else { $$opts{remove}{$col}{$tag} = 1; } } next; } if ( $arg eq '-n' || $arg eq '--normalize-alleles' ) { $$opts{normalize} = 1; next } if ( $arg eq '-a' || $arg eq '--annotations' ) { $$opts{annotations} = shift(@ARGV); next } if ( $arg eq '--fill-type' ) { $$opts{fill_type}=1; $$opts{fill}=1; next } if ( $arg eq '--fill-AC-AN' ) { $$opts{fill_ac_an} = 1; $$opts{fill}=1; next } if ( $arg eq '--fill-HWE' ) { $$opts{fill_ac_an} = $$opts{fill_hwe} = 1; $$opts{fill}=1; next } if ( $arg eq '--fill-ICF' ) { $$opts{fill_ac_an} = $$opts{fill_hwe} = $$opts{fill_icf} = 1; $$opts{fill}=1; next } if ( $arg eq '-t' || $arg eq '--tag' ) { $$opts{tag} = shift(@ARGV); next } if ( $arg eq '-H' || $arg eq '--hard-filter' ) { $$opts{hard_filter} = 1; next } if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } if ( -e $arg ) { $$opts{file}=$arg; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) ) { if ( !exists($$opts{annotations}) && !exists($$opts{remove}) && !exists($$opts{fill}) && !exists($$opts{normalize}) && !exists($$opts{hard_filter}) ) { error("Missing one of the -a, -f, -n, -r or --fill-* options.\n") } } if ( exists($$opts{annotations}) && !exists($$opts{cols}) ) { error("Missing the -c option.\n"); } return $opts; } sub parse_user_defined_filters { my ($opts,$str) = @_; my $filters = [ do $str ]; if ( $@ ) { error("do $str: $@"); } for my $filter (@$filters) { if ( !exists($$filter{tag}) ) { error("Missing 'tag' key for one of the filters in $str\n"); } if ( $$filter{tag}=~m{^INFO/(.+)$} ) { $$filter{info_tag} = $1; } elsif ( $$filter{tag}=~m{^FORMAT/(.+)$} ) { $$filter{format_tag} = $1; } elsif ( $$filter{tag} eq 'Dummy' ) { $$filter{any_tag} = $1; $$filter{name} = 'Dummy'; $$filter{desc} = 'Dummy'; } else { error("Currently only INFO, FORMAT and Dummy tags are supported. Could not parse the tag [$$filter{tag}]\n"); } my $name = $$filter{name}; if ( !exists($$filter{name}) ) { error("Missing 'name' key for the filter [$$filter{tag}]\n"); } if ( !exists($$filter{desc}) ) { error("Missing 'desc' key for the filter [$$filter{tag}]\n"); } if ( exists($$filter{header}) ) { push @{$$opts{desc}}, ref($$filter{header}) eq 'ARRAY' ? 
@{$$filter{header}} : $$filter{header}; } elsif ( $$filter{tag} ne 'Dummy' ) { push @{$$opts{desc}}, "key=FILTER,ID=$name,Description='$$filter{desc}'"; } if ( !exists($$filter{apply_to}) or lc($$filter{apply_to}) eq 'all' ) { $$opts{udef_filters}{'all'}{$name} = $filter; $$opts{udef_filters}{'s'}{$name} = $filter; $$opts{udef_filters}{'i'}{$name} = $filter; } elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'snps' ) { $$opts{udef_filters}{'s'}{$name} = $filter; $$opts{udef_filters_typecheck_needed} = 1; } elsif ( exists($$filter{apply_to}) and lc($$filter{apply_to}) eq 'indels' ) { $$opts{udef_filters}{'i'}{$name} = $filter; $$opts{udef_filters_typecheck_needed} = 1; } } } sub parse_filters { my ($opts,$str) = @_; if ( -e $str ) { parse_user_defined_filters($opts,$str); return; } my $has_filters = 0; my $set_defaults = 0; my @filters = split(m{/},$str); for my $fltr (@filters) { if ( $fltr eq '+' ) { $set_defaults=1; last; } } my %mapping; for my $flt (keys %filters) { if ( exists($mapping{$filters{$flt}{nick}}) ) { error("FIXME: the nick $filters{$flt}{nick} is not unique.\n"); } $mapping{$filters{$flt}{nick}} = $flt; if ( !defined($filters{$flt}{dflt}) ) { next; } if ( $set_defaults ) { $$opts{filters}{$flt} = $filters{$flt}{dflt}; } } for my $filter (@filters) { my ($key,$val) = split(/=/,$filter); if ( $key eq '+' ) { next; } my $to_be_deleted = 0; if ( $key=~/^-(.+)$/ ) { $to_be_deleted=1; $key = $1; } if ( !exists($filters{$key}) ) { $key = $mapping{$key}; } if ( !exists($filters{$key}) && !exists($mapping{$key}) ) { error("The filter [$key] not recognised.\n"); } if ( $to_be_deleted ) { delete($$opts{filters}{$key}); next; } if ( $key eq 'c' || $key eq 'SnpCluster' ) { ($$opts{SnpCluster_count},$$opts{SnpCluster_win}) = split(/,/,$val); # Simple sanity check if ( $$opts{SnpCluster_count}>$$opts{SnpCluster_win} ) { error("Did you really mean snpCluster=$$opts{SnpCluster_count},$$opts{SnpCluster_win}? 
The win (INT2) must be bigger or equal to count (INT1)."); } $$opts{SnpCluster_buffer} = []; push @{$$opts{desc}}, "key=FILTER,ID=SnpCluster,Description='$filters{SnpCluster}{desc} [win=$$opts{SnpCluster_win},count=$$opts{SnpCluster_count}]'"; $has_filters = 1; next; } $$opts{filters}{$key} = $val; $has_filters = 1; } for my $key (keys %{$$opts{filters}}) { push @{$$opts{desc}}, "key=FILTER,ID=$key,Description='$filters{$key}{desc}" . (defined $$opts{filters}{$key} ? " [$$opts{filters}{$key}]'" : "'"); } if ( !$has_filters && !scalar keys %{$$opts{filters}} ) { delete($$opts{filters}); } if ( exists($$opts{filters}{HWE}) ) { $$opts{fill_ac_an}=$$opts{fill_hwe}=1; } } # Convert text descriptions given on command line to hashes which will be # passed to Vcf::add_header_line sub parse_descriptions { my ($descs) = @_; my @out; for my $str (@$descs) { my $desc = {}; my $tmp = $str; while ($tmp) { my ($key,$value); if ( $tmp=~/^([^=]+)=["']([^\"]+)["']/ ) { $key=$1; $value=$2; } elsif ( $tmp=~/^([^=]+)=([^,"]+)/ && $1 eq 'Description' ) { # The command line eats the quotes $key=$1; $value=$2.$'; $$desc{$key} = $value; last; } elsif ( $tmp=~/^([^=]+)=([^,"]+)/ ) { $key=$1; $value=$2; } else { error(qq[Could not parse the description: [$str]\n]); } $$desc{$key} = $value; $tmp = $'; if ( $tmp=~/^,/ ) { $tmp = $'; } } if ( !exists($$desc{ID}) ) { error("No ID in description? [$str]\n"); } push @out, $desc; } return \@out; } # Create mapping from the annotation IDs to column indexes. The mapping is used # to determine which columns should be used from the annotation file. The # following structure is returned: # { # CHROM => col_idx, # FROM => col_idx, # TO => col_idx, # annots => # [ # { col=>col_idx, id=>annot_id, vcf_col=>vcf_column, is_flag=>0 }, # ] # } # If {annots}{is_flag} is nonzero, "annot_id" will be written to VCF instead of "annot_id=value". # Currently only one VCF column (INFO) is supported. 
# sub parse_columns { my ($cols,$descriptions) = @_; my %desc = (); my %out = ( annots=>[] ); if ( !defined $cols ) { return \%out; } for my $d (@$descriptions) { $desc{$$d{key}.'/'.$$d{ID}} = $d; } for (my $i=0; $i<@$cols; $i++) { my $col = $$cols[$i]; if ( $col eq '-' ) { next; } elsif ( $col eq 'CHROM' ) { $out{$col}=$i; } elsif ( $col eq 'FROM' ) { $out{$col}=$i; } elsif ( $col eq 'POS' ) { $out{'FROM'}=$i; } elsif ( $col eq 'TO' ) { $out{$col}=$i; } elsif ( $col eq 'ID' ) { $out{$col}=$i; } elsif ( $col eq 'FILTER' ) { $out{$col}=$i; } elsif ( $col eq 'REF' ) { $out{$col}=$i; } elsif ( $col eq 'ALT' ) { $out{$col}=$i; } elsif ( $col eq 'QUAL' ) { $out{$col}=$i; } else { if ( !exists($desc{$col}) && exists($desc{"INFO/$col"}) ) { print STDERR qq[The description for "$col" does not exist, assuming "INFO/$col"\n]; $col = "INFO/$col"; } if ( !exists($desc{$col})) { error("Missing the -d parameter for the column [$col]\n"); } if ( !($col=~m{^(.+)/(.+)$}) ) { error("Could not parse the column [$col].\n"); } my $key = $1; my $id = $2; my $rec = { col=>$i, id=>$id, vcf_col=>$key, is_flag=>($desc{$col}{Type} eq 'Flag' ? 1 : 0) }; push @{$out{annots}}, $rec; if ( $key ne 'INFO' ) { error("TODO: other than INFO columns\n"); } } } if ( !exists($out{CHROM}) ) { $out{CHROM}=0; } if ( !exists($out{FROM}) ) { $out{FROM}=1; } if ( !exists($out{TO}) ) { $out{TO}=$out{FROM}; } if ( exists($out{REF}) && !exists($out{ALT}) or !exists($out{REF}) && exists($out{ALT}) ) { error("Expected both REF and ALT columns in the annotation file.\n"); } return \%out; } sub annotate { my ($opts) = @_; # Init the variables my $descs = parse_descriptions($$opts{desc}); my $cols = parse_columns($$opts{cols},$descs); # Open VCF file and add all required header lines my %args = exists($$opts{file}) ? 
(file=>$$opts{file}) : (fh=>\*STDIN); my $vcf = $$opts{vcf} = Vcf->new(%args); $vcf->parse_header(); if ( exists($$opts{remove}) ) { for my $col (keys %{$$opts{remove}}) { if ( ref($$opts{remove}{$col}) ne 'HASH' ) { # remove all filters at once if ( $col eq 'FILTER' ) { $vcf->remove_header_line(key=>$col); } next; } for my $tag (keys %{$$opts{remove}{$col}}) { $vcf->remove_header_line(key=>$col, ID=>$tag); } } } for my $desc (@$descs) { $vcf->add_header_line($desc,silent=>1); } if ( $$opts{fill_type} ) { $vcf->add_header_line({key=>'INFO',ID=>'TYPE',Number=>'A',Type=>'String',Description=>'Variant type'}); } if ( $$opts{fill_ac_an} ) { $vcf->add_header_line({key=>'INFO',ID=>'AC',Number=>'A',Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO',ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); } if ( $$opts{fill_hwe} ) { $vcf->add_header_line({key=>'INFO',ID=>'HWE',Number=>1,Type=>'Float',Description=>'Hardy-Weinberg equilibrium test (PMID:15789306)'}); $vcf->add_header_line({key=>'INFO',ID=>'ICF',Number=>1,Type=>'Float',Description=>'Inbreeding coefficient F'}); } $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); print $vcf->format_header(); my ($prev_chr,$prev_pos,$annot_from,$annot_to,$annot_line); my @annots = @{$$cols{annots}}; my $id_col = exists($$cols{ID}) ? $$cols{ID} : undef; my $fltr_col = exists($$cols{FILTER}) ? $$cols{FILTER} : undef; my $from_col = $$cols{FROM}; my $to_col = $$cols{TO}; my $ref_col = exists($$cols{REF}) ? $$cols{REF} : undef; my $alt_col = exists($$cols{ALT}) ? $$cols{ALT} : undef; my $qual_col = exists($$cols{QUAL}) ? 
$$cols{QUAL} : undef; # Initialize the annotation reader my $reader; if ( exists($$opts{annotations}) ) { $reader = Reader->new(file=>$$opts{annotations}); my $line = $vcf->next_line(); if ( !defined $line ) { # VCF file is empty undef $reader; } else { my @rec = split(/\t/,$line); $prev_chr = $rec[0]; $prev_pos = $rec[1]; $vcf->_unread_line($line); $reader->open(region=>"$prev_chr:$prev_pos"); } } while (defined $reader) { # Read next annotation group, i.e. all records with the same position (or overlapping in case of intervals) my (@annot_lines,$annot_prev_from,$annot_prev_to); while ($reader) { my $annot_line = $reader->next_line(); if ( !defined $annot_line ) { last; } my $annot_from = $$annot_line[$from_col]; my $annot_to = $$annot_line[$to_col]; if ( !@annot_lines ) { push @annot_lines, $annot_line; $annot_prev_from = $annot_from; $annot_prev_to = $annot_to; next; } if ( $annot_from <= $annot_prev_to or $annot_to <= $annot_prev_to ) { push @annot_lines, $annot_line; if ( $annot_prev_to < $annot_to ) { $annot_prev_to = $annot_to; } next; } $reader->unread_line($annot_line); last; } # Now loop through the VCF records my $line; while ($line = $vcf->next_line()) { my @rec = split(/\t/,$line); if ( $$opts{normalize} ) { my ($ref,@alts) = $vcf->normalize_alleles($rec[3],$rec[4]); $rec[3] = $ref; $rec[4] = join(',',@alts); } my $chr = $rec[0]; my $pos = $rec[1]; chomp($rec[-1]); if ( $chr ne $prev_chr ) { $vcf->_unread_line($line); $prev_chr = $chr; $prev_pos = $pos; $reader->open(region=>"$prev_chr:$prev_pos"); last; } if ( exists($$opts{remove}) ) { remove_tags($opts,\@rec); } # Quick position-based check: Is there an annotation for this record? if ( !defined $annot_prev_from or $pos < $annot_prev_from ) { output_line($opts,\@rec); next; } if ( $pos > $annot_prev_to ) { $vcf->_unread_line($line); last; } # Initialize the REF,ALT-based check. If there are multiple records with the same # position, they can appear in any order. 
A single ALT allele is expected in the # annot file but multiple ALTs can be present in the VCF. As long as one of them # matches the annot file, the record will be annotated. # The annot file can contain mutliallelic sites too. At least one ALT from the VCF # has to match an ALT from the annot file. my (%ref_alt_pairs); if ( defined $alt_col ) { my $ref = $rec[3]; for my $alt (split(/,/,$rec[4])) { my ($r,@a) = $vcf->normalize_alleles($ref,$alt); $ref_alt_pairs{$r.'-'.$a[0]} = 1; } } # Now fill the annotations; Existing annotations with the same tag will be overwritten my %values = (); my %ids = (); for my $annot_line (@annot_lines) { # Skip annotation lines which are not relevant to this VCF record if ( $$annot_line[$from_col] > $pos or $$annot_line[$to_col] < $pos ) { next; } if ( defined $alt_col && $$annot_line[$ref_col] ne '.' ) { my $alt_match = 0; for my $alt (split(/,/,$$annot_line[$alt_col])) { my ($r,@a) = $vcf->normalize_alleles($$annot_line[$ref_col],$alt); if ( exists($ref_alt_pairs{$r.'-'.$a[0]}) ) { $alt_match = 1; last; } } if ( !$alt_match ) { next; } } for my $info (@annots) { my $val = $$annot_line[$$info{col}]; if ( $val eq '' or $val eq '.' ) { $val=undef; } # Existing annotation should be removed elsif ( $$info{is_flag} ) { if ( $val ) { $val=''; } # Flag annotation should be added else { $val=undef; } # Flag annotation should be removed } # A single undef value can be overriden by other overlapping records (?) 
if ( !defined $val && exists($values{$$info{id}}) ) { next; } elsif ( exists($values{$$info{id}}) && !defined $values{$$info{id}}[0] ) { $values{$$info{id}}[0] = $val; next; } push @{$values{$$info{id}}}, $val; } if ( defined $id_col && $$annot_line[$id_col] ne '' ) { $ids{$$annot_line[$id_col]} = 1; } if ( defined $fltr_col && $$annot_line[$fltr_col] ne '' ) { $rec[6] = $$annot_line[$fltr_col]; } if ( defined $qual_col && $$annot_line[$qual_col] ne '' ) { $rec[5] = $$annot_line[$qual_col]; } } if ( scalar keys %ids ) { $rec[2] = join(';', keys %ids); } if ( scalar keys %values ) { for my $key (keys %values) { # Cannot use join on undef values $values{$key} = scalar @{$values{$key}} == 1 ? $values{$key}[0] : join(',', @{$values{$key}}); } $rec[7] = $vcf->add_info_field($rec[7],%values); } output_line($opts,\@rec); } if ( !defined $line ) { last; } } # Finish the VCF, no annotations for this part while (my $line=$vcf->next_line) { my @rec = split(/\t/,$line); if ( $$opts{normalize} ) { my ($ref,@alts) = $vcf->normalize_alleles($rec[3],$rec[4]); $rec[3] = $ref; $rec[4] = join(',',@alts); } chomp($rec[-1]); if ( exists($$opts{remove}) ) { remove_tags($opts,\@rec); } output_line($opts,\@rec); } # Output any lines left in the buffer output_line($opts); } sub fill_ac_an_hwe { my ($opts,$line) = @_; my $igt = $$opts{vcf}->get_tag_index($$line[8],'GT',':'); if ( $igt==-1 ) { return; } my %counts = ( 0=>0 ); my %dpl_counts = ( 0=>0 ); if ( $$line[4] ne '.' ) { my $idx=0; my $cnt=0; $counts{++$cnt} = 0; while ( ($idx=index($$line[4],',',$idx))>0 ) { $idx++; $counts{++$cnt} = 0; } } my $nhets = 0; my $ngts = 0; my $ncols = @$line; for (my $isample=9; $isample<$ncols; $isample++) { my $gt = $$opts{vcf}->get_field($$line[$isample],$igt); my ($a1,$a2) = $$opts{vcf}->split_gt($gt); if ( $a1 ne '.' ) { $counts{$a1}++ } if ( defined $a2 && $a2 ne '.' ) { $counts{$a2}++; if ( $a1 ne '.' 
) { $dpl_counts{$a1}++; $dpl_counts{$a2}++; if ( $a1 ne $a2 ) { $nhets++ } $ngts++; } } } my $an = 0; my $ac; my $max_ac = 0; for my $key (sort {$a<=>$b} keys %counts) { if ( $key eq 0 ) { $an += $counts{$key}; next; } if ( defined $ac ) { $ac .= ','; } $ac .= $counts{$key}; $an += $counts{$key}; if ( exists($dpl_counts{$key}) && $dpl_counts{$key}>$max_ac ) { $max_ac = $dpl_counts{$key}; } } my %tags = (AN=>$an); if ( defined $ac ) { $tags{AC}=$ac } my $nall = $dpl_counts{0} + $max_ac; if ( scalar keys %counts==2 ) { if ( $$opts{fill_hwe} && $nall && scalar keys %counts==2 ) { my $freq_obs = 2*$nhets/$nall; my $freq_exp = 2*($max_ac/$nall)*(1-($max_ac/$nall)); $$opts{icf} = $freq_exp ? 1-$freq_obs/$freq_exp : 0; $$opts{hwe} = eval_hwe(($max_ac-$nhets)/2,($dpl_counts{0}-$nhets)/2,$nhets ,$line); $tags{HWE} = sprintf "%.6f", $$opts{hwe}; if ( $$opts{fill_icf} ) { $tags{ICF} = sprintf "%.5f", $$opts{icf}; } } } $$line[7] = $$opts{vcf}->add_info_field($$line[7],%tags); } # Wigginton 2005, PMID: 15789306 sub eval_hwe { my ($obs_hom1,$obs_hom2,$obs_hets , $line) = @_; if ( $obs_hom1 + $obs_hom2 + $obs_hets == 0 ) { return 1; } my $obs_homc = $obs_hom1 < $obs_hom2 ? $obs_hom2 : $obs_hom1; my $obs_homr = $obs_hom1 < $obs_hom2 ? 
$obs_hom1 : $obs_hom2; my $rare_copies = 2 * $obs_homr + $obs_hets; my $genotypes = $obs_hets + $obs_homc + $obs_homr; my @het_probs = ((0) x ($rare_copies+1)); # start at midpoint my $mid = int($rare_copies * (2 * $genotypes - $rare_copies) / (2 * $genotypes)); # check to ensure that midpoint and rare alleles have same parity if (($rare_copies & 1) ^ ($mid & 1)) { $mid++; } my $curr_hets = $mid; my $curr_homr = ($rare_copies - $mid) / 2; my $curr_homc = $genotypes - $curr_hets - $curr_homr; $het_probs[$mid] = 1.0; my $sum = $het_probs[$mid]; for ($curr_hets=$mid; $curr_hets > 1; $curr_hets -= 2) { $het_probs[$curr_hets - 2] = $het_probs[$curr_hets] * $curr_hets * ($curr_hets - 1.0) / (4.0 * ($curr_homr + 1.0) * ($curr_homc + 1.0)); $sum += $het_probs[$curr_hets - 2]; # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote $curr_homr++; $curr_homc++; } $curr_hets = $mid; $curr_homr = int(($rare_copies - $mid) / 2); $curr_homc = $genotypes - $curr_hets - $curr_homr; for ($curr_hets = $mid; $curr_hets <= $rare_copies - 2; $curr_hets += 2) { $het_probs[$curr_hets + 2] = $het_probs[$curr_hets] * 4.0 * $curr_homr * $curr_homc /(($curr_hets + 2.0) * ($curr_hets + 1.0)); $sum += $het_probs[$curr_hets + 2]; # add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote $curr_homr--; $curr_homc--; } for (my $i = 0; $i <= $rare_copies; $i++) { $het_probs[$i] /= $sum; } my $p_hwe = 0.0; # p-value calculation for p_hwe for (my $i = 0; $i <= $rare_copies; $i++) { if ($het_probs[$i] > $het_probs[$obs_hets]) { next; } $p_hwe += $het_probs[$i]; } return $p_hwe > 1.0 ? 1.0 : $p_hwe; } sub fill_type { my ($opts,$line) = @_; my @types; for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt); if ( $type eq 'i' ) { push @types, $len>0 ? 'ins' : 'del'; } elsif ( $type eq 's' ) { push @types, $len==1 ? 
'snp' : 'mnp'; } elsif ( $type eq 'o' ) { push @types, 'complex'; } elsif ( $type eq 'b' ) { push @types, 'break'; } elsif ( $type eq 'u' ) { push @types, 'other'; } } $$line[7] = $$opts{vcf}->add_info_field($$line[7],TYPE=>(@types ? join(',',@types) : undef)); } # Stage the lines and then apply filtering if requested, otherwise just print the line sub output_line { my ($opts,$line) = @_; if ( defined $line ) { if ( $$opts{fill_ac_an} ) { fill_ac_an_hwe($opts,$line); } if ( $$opts{fill_type} ) { fill_type($opts,$line); } } if ( !exists($$opts{filters}) && !exists($$opts{udef_filters}) ) { # No filters requested, print the line print_line($opts, $line); return; } if ( defined $line ) { # Local filters return the line back immediately if ( scalar keys %{$$opts{filters}} ) { $line = apply_local_filters($opts,$line); } if ( exists($$opts{udef_filters}) ) { $line = apply_user_defined_filters($opts,$line); } } # Staging filters may return nothing or multiple lines. If $line is not defined, they will # empty the buffers my @lines; if ( exists($$opts{filters}{SnpGap}) ) { @lines = apply_snpgap_filter($opts,$line); if ( defined $line && !scalar @lines ) { return; } } elsif ( defined $line ) { @lines=($line); } if ( exists($$opts{filters}{GapWin}) ) { my @tmp; if ( !defined $line ) { push @lines,undef; } for my $line (@lines) { push @tmp, apply_gapwin_filter($opts,$line); } @lines = @tmp; } if ( exists($$opts{SnpCluster_count}) ) { my @tmp; if ( !defined $line ) { push @lines,undef; } for my $line (@lines) { push @tmp, apply_snpcluster_filter($opts,$line); } @lines = @tmp; } for my $line (@lines) { print_line($opts, $line); } } sub remove_tags { my ($opts,$line) = @_; # Remove INFO tags for my $tag (keys %{$$opts{remove}{INFO}}) { my $ifrom=0; my $ito; my $tag_len = length($tag); while (1) { $ifrom = index($$line[7],$tag,$ifrom); if ( $ifrom==-1 ) { last; } if ( $ifrom!=0 && substr($$line[7],$ifrom-1,1) ne ';' ) { $ifrom++; next; } if ( length($$line[7])!=$ifrom+$tag_len ) { 
my $c = substr($$line[7],$ifrom+$tag_len,1); if ( $c ne ';' && $c ne '=' ) { $ifrom+=$tag_len; next; } } $ito = index($$line[7],';',$ifrom+1); last; } if ( !defined $ito ) { next; } # not found my $out; if ( $ifrom>0 ) { $out .= substr($$line[7],0,$ifrom-1); if ( $ito!=-1 ) { $out .= ';'; } } if ( $ito!=-1 ) { $out .= substr($$line[7],$ito+1); } $$line[7] = defined $out ? $out : '.'; } # Remove FORMAT tags for my $tag (keys %{$$opts{remove}{FORMAT}}) { my $idx = $$opts{vcf}->get_tag_index($$line[8],$tag,':'); if ( $idx==-1 ) { next; } for (my $i=8; $i<@$line; $i++) { $$line[$i] = $$opts{vcf}->remove_field($$line[$i],$idx,':'); } } # Remove FILTER if ( exists($$opts{remove}{FILTER}) ) { $$line[6] = ref($$opts{remove}{FILTER}) eq 'HASH' ? $$opts{vcf}->add_filter($$line[6],%{$$opts{remove}{FILTER}}) : 'PASS'; } # Remove ID and QUAL if ( exists($$opts{remove}{ID}) ) { $$line[2] = '.' } if ( exists($$opts{remove}{QUAL}) ) { $$line[5] = '.' } } sub apply_user_defined_filters { my ($opts,$line) = @_; our($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF); $CHROM = $$line[0]; $POS = $$line[1]; $FAIL = 1; $PASS = 0; $RECORD = $line; $VCF = $$opts{vcf}; my %filters = (); if ( $$opts{udef_filters_typecheck_needed} ) { # Check if the line has an indel, SNP or both for my $alt (split(/,/,$$line[4])) { my ($type,$len,$ht) = $$opts{vcf}->event_type($$line[3],$alt); if ( exists($$opts{udef_filters}{$type}) ) { %filters = ( %filters, %{$$opts{udef_filters}{$type}} ); } } # Return if the line does not have the wanted variant type if ( !scalar %filters ) { return $line; } } else { %filters = %{$$opts{udef_filters}{all}}; } my %apply; for my $filter (values %filters) { if ( exists($$filter{info_tag}) ) { $MATCH = $$opts{vcf}->get_info_field($$line[7],$$filter{info_tag}); if ( !defined $MATCH ) { next; } } elsif ( exists($$filter{format_tag}) ) { my $idx = $$opts{vcf}->get_tag_index($$line[8],$$filter{format_tag},':'); if ( $idx<0 ) { next; } $MATCH = 
$$opts{vcf}->get_sample_field($line,$idx); } $apply{ $$filter{name} } = &{$$filter{test}} == $PASS ? 0 : 1; } if ( scalar keys %apply ) { $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply); } return $line; } sub apply_local_filters { my ($opts,$line) = @_; if ( !defined $line ) { return; } my $filters = $$opts{filters}; my %apply; if ( exists($$filters{RefN}) ) { $apply{RefN} = ($$line[3]=~/N/) ? 1 : 0; } if ( exists($$filters{Qual}) && $$line[5] ne '.' ) { $apply{Qual} = $$line[5] < $$filters{Qual} ? 1 : 0; } if ( exists($$filters{HWE_G3}) && $$line[7]=~/G3=([^,]+),([^,]+),/ ) { my ($rr,$ra); $rr = $1; $ra = $2; $apply{HWE_G3} = 0; if ( $$line[7]=~/HWE_G3=([^;\t]+)/ && $1<$$filters{HWE_G3} ) { my $p = 2*$rr + $ra; if ( $p>0 && $p<1 && (1-$ra)/($p*(1-$p))<0 ) { $apply{HWE_G3} = 1; } #printf "xHWE: f=%f rr=$rr ra=$ra hwe=$1 p=$p ($$line[1])\n"; } } if ( exists($$filters{HWE}) ) { $apply{HWE} = $$opts{hwe}<$$filters{HWE} && $$opts{icf}<0 ? 1 : 0; } if ( exists($$filters{VDB}) && $$line[7]=~/VDB=([^;,\t]+)/ ) { $apply{VDB} = $1 < $$filters{VDB} ? 1 : 0; } if ( exists($$filters{MinDP}) or exists($$filters{MaxDP}) ) { my $dp; if ( $$line[7]=~/DP=(\d+)/ ) { $dp = $1; } elsif ( $$line[7]=~/DP4=(\d+),(\d+),(\d+),(\d+)/ ) { $dp = $1 + $2 + $3 + $4; } if ( defined $dp ) { if ( exists($$filters{MinDP}) ) { $apply{MinDP} = $dp < $$filters{MinDP} ? 1 : 0; } if ( exists($$filters{MaxDP}) ) { $apply{MaxDP} = $dp > $$filters{MaxDP} ? 1 : 0; } } } if ( exists($$filters{MinAB}) && $$line[7]=~/DP4=\d+,\d+,(\d+),(\d+)/ ) { $apply{MinAB} = $1 + $2 < $$filters{MinAB} ? 1 : 0; } if ( exists($$filters{MinMQ}) && $$line[7]=~/MQ=(\d+)/ ) { $apply{MinMQ} = $1 < $$filters{MinMQ} ? 1 : 0; } if ( (exists($$filters{StrandBias}) or exists($$filters{BaseQualBias}) or exists($$filters{MapQualBias}) or exists($$filters{EndDistBias})) && $$line[7]=~/PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ ) { if ( exists($$filters{StrandBias}) ) { $apply{StrandBias} = $1 < $$filters{StrandBias} ? 
1 : 0;
        }
        if ( exists($$filters{BaseQualBias}) )
        {
            $apply{BaseQualBias} = $2 < $$filters{BaseQualBias} ? 1 : 0;
        }
        if ( exists($$filters{MapQualBias}) )
        {
            $apply{MapQualBias} = $3 < $$filters{MapQualBias} ? 1 : 0;
        }
        if ( exists($$filters{EndDistBias}) )
        {
            $apply{EndDistBias} = $4 < $$filters{EndDistBias} ? 1 : 0;
        }
    }
    if ( scalar keys %apply )
    {
        $$line[6] = $$opts{vcf}->add_filter($$line[6],%apply);
    }
    return $line;
}

# Buffering filter: flag non-reference sites which lie within SnpGap bases of an
# indel. Records are held in $$opts{SnpGap_buffer} until the window allows a
# decision; calling with an undefined $line flushes everything that is buffered.
sub apply_snpgap_filter
{
    my ($opts,$line) = @_;
    if ( !exists($$opts{SnpGap_buffer}) ) { $$opts{SnpGap_buffer}=[]; }

    my $vcf = $$opts{vcf};
    my $win = $$opts{filters}{SnpGap};
    my $buffer = $$opts{SnpGap_buffer};
    my ($indel_chr,$indel_pos,$to);

    if ( defined $line )
    {
        # There may be multiple variants, look for an indel. Anything that is not ref can be filtered.
        my $is_indel = 0;
        my $can_be_filtered = 0;
        for my $alt (split(/,/,$$line[4]))
        {
            my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
            if ( $type eq 'i' )
            {
                $is_indel = 1;
                $indel_chr = $$line[0];
                $indel_pos = $$line[1]+1;
            }
            elsif ( $type ne 'r' ) { $can_be_filtered = 1; }
        }
        # The indel boundaries are based on REF (POS+1,POS+rlen-1). This is not
        #   correct as the indel can begin anywhere in the VCF4.x record with
        #   respect to POS. Specifically mpileup likes to write REF=CAGAGAGAGA
        #   ALT=CAGAGAGAGAGA. Thus this filtering is more strict and may remove
        #   some valid SNPs.
        $to = $is_indel ? $$line[1]+length($$line[3])-1 : $$line[1];
        push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, exclude=>0, can_be_filtered=>$can_be_filtered, is_indel=>$is_indel };
    }

    my $n = @$buffer;

    # Is the new line an indel? If yes, check the distance to all previous lines
    if ( defined $indel_chr )
    {
        for (my $i=0; $i<$n-1; $i++)
        {
            my $buf = $$buffer[$i];
            if ( $$buf{chr} ne $indel_chr ) { next; }
            if ( !$$buf{can_be_filtered} ) { next; }
            if ( $$buf{is_indel} ) { next; }
            if ( $$buf{to}>=$indel_pos-$win ) { $$buf{exclude}=1; }
        }
    }

    if ( defined $line && $$buffer[0]{chr} eq $$buffer[-1]{chr} && $win+$$buffer[0]{to}>=$$buffer[-1]{from} )
    {
        # There are not enough rows in the buffer: the SnpGap window spans them all. Wait until there are more rows
        #   or a new chromosome
        return ();
    }

    # 'Look-behind' filtering was done above, now comes 'look-ahead' filtering
    my $indel_to;
    for (my $i=0; $i<$n; $i++)
    {
        my $buf = $$buffer[$i];
        if ( $$buf{is_indel} )
        {
            $indel_to = $$buf{to};
            $indel_chr = $$buf{chr};
            next;
        }
        if ( !defined $indel_to ) { next; }
        if ( !$$buf{can_be_filtered} ) { next; }
        if ( $$buf{chr} ne $indel_chr )
        {
            undef $indel_to;
            next;
        }
        if ( $$buf{from}<=$indel_to+$win ) { $$buf{exclude}=1; }
    }

    # Output. If no $line was given, output everything
    $to = $$buffer[-1]{from}-$win;
    my $chr = $$buffer[-1]{chr};
    my @out;
    while (@$buffer)
    {
        if ( $$buffer[0]{chr} eq $chr && $$buffer[0]{to}+$win>=$to && defined $line ) { last; }
        my $buf = shift(@$buffer);
        if ( $$buf{exclude} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>1); }
        else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpGap'=>0); }
        push @out,$$buf{line};
    }
    return @out;
}

# Buffering filter: when several indels fall within GapWin bases of each other,
# let only one through (the one with the highest allele frequency, see below)
# and flag the rest. An undefined $line flushes the buffer.
sub apply_gapwin_filter
{
    my ($opts,$line) = @_;
    if ( !exists($$opts{GapWin_buffer}) ) { $$opts{GapWin_buffer}=[]; }

    my $vcf = $$opts{vcf};
    my $win = $$opts{filters}{GapWin};
    my $buffer = $$opts{GapWin_buffer};
    my $n = @$buffer;
    my ($indel_chr,$indel_pos,$to);

    if ( defined $line )
    {
        # There may be multiple variants, only indels can be filtered
        my $is_indel = 0;
        my $indel_len = 0;
        for my $alt (split(/,/,$$line[4]))
        {
            my ($type,$len,$ht) = $vcf->event_type($$line[3],$alt);
            if ( $type eq 'i' )
            {
                $is_indel = 1;
                $indel_chr = $$line[0];
                $indel_pos =
$$line[1] + 1;
                $indel_len = abs($len);

                # This may remove valid insertions but also artefacts
                last;
            }
        }
        $to = $$line[1] + $indel_len;

        my $af = 0;
        if ( $is_indel )
        {
            # Collect allele frequency to make an educated guess which of the indels to keep
            $af = $vcf->get_info_field($$line[7],'AF');
            if ( !defined $af )
            {
                $af = $vcf->get_info_field($$line[7],'AF1');
                # assuming that all records have the same set of annotations, otherwise comparing later AC with AF will be wrong
                if ( !defined $af ) { $af = $vcf->get_info_field($$line[7],'AC'); }
            }
            if ( !defined $af ) { $af=0 }
            else { $af = $vcf->get_field($af,0,',') }
        }

        push @$buffer, { line=>$line, chr=>$$line[0], from=>defined $indel_pos ? $indel_pos : $$line[1], to=>$to, is_indel=>$is_indel, AF=>$af, exclude=>0 };
        # printf "%d-%d\t%d-%d\n", $$buffer[0]{from},$$buffer[0]{to},$$buffer[-1]{from},$$buffer[-1]{to};

        # Update the latest gap position and check if the buffer can be flushed
        if ( !exists($$opts{GapWin_chr}) )
        {
            $$opts{GapWin_chr} = $$line[0];
            $$opts{GapWin_to} = $$line[1];
        }
        my $flush = ( $$opts{GapWin_chr} eq $$line[0] && $$line[1]<=$$opts{GapWin_to} ) ? 0 : 1;

        if ( $is_indel )
        {
            # Check distance to previous indels and set the exclude flags
            for (my $i=0; $i<$n; $i++)
            {
                if ( !$$buffer[$i]{is_indel} ) { next; }
                if ( $$buffer[$i]{to}>=$indel_pos-$win )
                {
                    $$buffer[$i]{exclude}=1;
                    $$buffer[-1]{exclude}=1;
                }
            }
            if ( $$opts{GapWin_chr} ne $$line[0] or $to+$win>$$opts{GapWin_to} ) { $$opts{GapWin_to} = $to+$win; }
        }
        $$opts{GapWin_chr} = $$line[0];

        if ( !$flush ) { return (); }
        if ( !$is_indel ) { $$opts{GapWin_to} = 0; }
    }

    # Let one of the gaps go through. It may not be the best one, but as there are more
    #   it is likely that at least one of them is real. Better to have the wrong one
    #   than miss it completely. Base the decision on AF. If not present, let the first
    #   one through.
    my $max_af=-1;
    for (my $i=0; $i<$n; $i++)
    {
        if ( !$$buffer[$i]{exclude} ) { next; }
        if ( $max_af<$$buffer[$i]{AF} ) { $max_af=$$buffer[$i]{AF} }
    }
    for (my $i=0; $i<$n; $i++)
    {
        if ( !$$buffer[$i]{exclude} ) { next; }
        if ( $max_af==$$buffer[$i]{AF} )
        {
            $$buffer[$i]{exclude}=0;
            last;
        }
    }

    my @out;
    for (my $i=0; $i<$n; $i++)
    {
        my $buf = shift(@$buffer);
        if ( $$buf{exclude} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>1); }
        else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'GapWin'=>0); }
        push @out,$$buf{line};
    }
    return @out;
}

# Buffering filter: flag variants that form clusters of SnpCluster_count or more
# non-reference sites within a window of SnpCluster_win bases. An undefined
# $line flushes the buffer.
sub apply_snpcluster_filter
{
    my ($opts,$line) = @_;

    my $buffer = $$opts{SnpCluster_buffer};
    my $n = @$buffer;

    # The buffer is empty and the line contains only reference alleles, print directly
    if ( $n==0 && defined $line && $$line[4] eq '.' )
    {
        $$line[6] = $$opts{vcf}->add_filter($$line[6],'SnpCluster'=>0);
        return $line;
    }

    # Store the line in buffer and check how many lines can be printed
    my $to;     # All lines up to and including this index will be printed
    my $win = $$opts{SnpCluster_win};
    if ( defined $line )
    {
        # Exclude REF (and maybe also other filters?) from SnpCluster
        my $can_be_filtered = $$line[4] eq '.' ? 0 : 1;
        push @$buffer, { line=>$line, chr=>$$line[0], pos=>$$line[1], can_be_filtered=>$can_be_filtered, in_cluster=>0 };
        $n++;

        # Does the buffer hold enough lines now?
my $last_chr = $$buffer[-1]{chr}; my $last_pos = $$buffer[-1]{pos}; for (my $i=$n-1; $i>=0; $i--) { my $buf = $$buffer[$i]; if ( $$buf{chr} ne $last_chr ) { $to=$i; last; } if ( $last_pos - $$buf{pos} >= $win ) { $to=$i; last; } } if ( !defined $to ) { return; } } if ( !defined $to ) { $to=$n-1; } # Calculate the number of variants within the window my $count = 0; my $max_count = $$opts{SnpCluster_count}; my $start_chr = $$buffer[0]{chr}; my $start_pos = $$buffer[0]{pos}; my $idx; for ($idx=0; $idx<$n; $idx++) { my $buf = $$buffer[$idx]; if ( $$buf{chr} ne $start_chr ) { last; } if ( $$buf{pos} - $win >= $start_pos ) { last; } if ( $$buf{can_be_filtered} ) { $count++; } } # If a SNP cluster was found, set the in_cluster flag for all relevant sites. # The buffer will be flushed and the orphans would pass unnoticed. if ( $count>=$max_count ) { for (my $i=0; $i<$idx; $i++) { if ( $$buffer[$i]{can_be_filtered} ) { $$buffer[$i]{in_cluster}=1; } } } # Now output the lines, adding or removing the filter my @out = (); for (my $i=0; $i<=$to; $i++) { my $buf = shift(@$buffer); if ( $$buf{in_cluster} ) { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>1); } else { $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0); } push @out,$$buf{line}; } # Output all non-variant lines at the beggining of the buffer while (@$buffer) { if ( $$buffer[0]{can_be_filtered} ) { last; } my $buf = shift(@$buffer); $$buf{line}[6] = $$opts{vcf}->add_filter($$buf{line}[6],'SnpCluster'=>0); push @out,$$buf{line}; } return @out; } sub print_line { my ($opts, $line) = @_; if ( !defined $line ) { return; } if ( $$opts{hard_filter} && $$line[6] ne '.' && $$line[6] ne 'PASS' ) { return; } print join("\t",@$line) . "\n"; } #--------------------------------- package Reader; use strict; use warnings; use Carp; sub new { my ($class,@args) = @_; my $self = @args ? 
{@args} : {};
    bless $self, ref($class) || $class;
    if ( !$$self{delim} ) { $$self{delim} = qr/\t/; }
    if ( !$$self{chr} ) { $$self{chr} = 0; }    # the index of the chromosome column (indexed from 0)
    if ( !$$self{from} ) { $$self{from} = 1; }  # the index of the from column
    if ( !$$self{to} ) { $$self{to} = 2; }      # the index of the to column
    return $self;
}

# Die with a stack trace
sub throw
{
    my ($self,@msg) = @_;
    confess @msg;
}

# Open a tabix stream over the requested region; no-op when no file was given.
sub open
{
    my ($self,%args) = @_;
    if ( !$$self{file} ) { return; }
    $self->close();
    open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!");
}

# Close the stream and discard any unread buffered lines
sub close
{
    my ($self) = @_;
    if ( !$$self{fh} ) { return; }
    close($$self{fh});
    delete($$self{fh});
    delete($$self{buffer});
}

# Push a parsed line back so the next next_line() call returns it again
sub unread_line
{
    my ($self,$line) = @_;
    unshift @{$$self{buffer}}, $line;
    return;
}

# Return the next data line split into columns (array ref), or undef on EOF.
# Lines beginning with '#' are skipped.
sub next_line
{
    my ($self) = @_;
    if ( !$$self{fh} ) { return undef; }    # Run in dummy mode
    if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); }
    my $line;
    # Skip comments
    while (1)
    {
        $line = readline($$self{fh});
        if ( !defined $line ) { return undef; }
        if ( $line=~/^#/ ) { next; }
        last;
    }
    my @items = split($$self{delim},$line);
    chomp($items[-1]);
    return \@items;
}
vcftools_0.1.11/perl/vcf-isec0000755000000000000000000005126512156354770014613 0ustar rootroot#!/usr/bin/env perl
#
# Author: petr.danecek@sanger
#
use strict;
use warnings;
use Carp;
use Vcf;

my $opts = parse_params();
vcf_isec($opts);
exit;

#--------------------------------

# Print the usage text and die; when called with arguments, die with those instead.
sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { croak @msg; }
    die "About: Create intersections, unions, complements on bgzipped and tabix indexed VCF or tab-delimited files.\n",
        " Note that lines from all files can be intermixed together on the output, which can yield\n",
        " unexpected results.\n",
        "Usage: vcf-isec [OPTIONS] file1.vcf file2.vcf ...\n",
        "Options:\n",
        " -a, --apply-filters Ignore lines where FILTER column is anything else than PASS or '.'\n",
        " -c, --complement Output positions present in the first file but missing from the other files.\n",
        " -d, --debug Debugging information\n",
        " -f, --force Continue even if the script complains about differing columns, VCF versions, etc.\n",
        " -o, --one-file-only Print only entries from the left-most file. Without -o, all unique positions will be printed.\n",
        " -n, --nfiles [+-=] Output positions present in this many (=), this many or more (+), or this many or fewer (-) files.\n",
        " -p, --prefix If present, multiple files will be created with all possible isec combinations. (Suitable for Venn Diagram analysis.)\n",
        " -r, --regions Do only the given regions (comma-separated list or one region per line in a file).\n",
        " -t, --tab Tab-delimited file with indexes of chromosome and position columns. (1-based indexes)\n",
        " -w, --win In repetitive sequences, the same indel can be called at different positions. Consider\n",
        " records this far apart as matching (be it a SNP or an indel).\n",
        " -h, -?, --help This help message.\n",
        "Examples:\n",
        " bgzip file.vcf; tabix -p vcf file.vcf.gz\n",
        " bgzip file.tab; tabix -s 1 -b 2 -e 2 file.tab.gz\n",
        "\n";
}

# Parse the command line into the $opts hash; files may be plain paths (VCF)
# or chr:pos:file triples given via -t, which become Reader objects.
sub parse_params
{
    $0 =~ s{^.+/}{};
    $0 .= "($Vcf::VERSION)";
    my $opts = { positions=>0, args=>[$0, @ARGV], force=>0, split=>0, report_from_all=>1, apply_filters=>0 };
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-p' || $arg eq '--prefix' )
        {
            my $prefix = shift(@ARGV);
            $$opts{prefix} = init_outdir($opts,$prefix);
            $$opts{split} = 1;
            next;
        }
        if ( $arg eq '-f' || $arg eq '--force' ) { $$opts{force}=1; next; }
        if ( $arg eq '-a' || $arg eq '--apply-filters' ) { $$opts{apply_filters}=1; next; }
        if ( $arg eq '-r' || $arg eq '--regions' ) { $$opts{chromosomes}=shift(@ARGV); next; }
        if ( $arg eq '-o' || $arg eq '--one-file-only' ) { $$opts{report_from_all}=0; next; }
        if ( $arg eq '-c' || $arg eq '--complement' ) { $$opts{complement}=1; next; }
        if ( $arg eq '-n' || $arg eq '--nfiles' )
        {
            my $nfiles = shift(@ARGV);
            if ( !($nfiles=~/^([\-+=])(\d+)$/) ) { error("Could not parse: [$nfiles]\n"); }
$$opts{isec_op} = $1; $$opts{isec_nfiles} = $2; next; } if ( $arg eq '-d' || $arg eq '--debug' ) { $$opts{debug}=1; next; } if ( $arg eq '-w' || $arg eq '--win' ) { $$opts{win}=shift(@ARGV); next; } if ( $arg eq '-t' || $arg eq '--tab' ) { my $tab = shift(@ARGV); my ($chr,$pos,$file) = split(/:/,$tab); push @{$$opts{files}}, Reader->new(file=>$file,chr=>$chr-1,pos=>$pos-1); next; } if ( -e $arg ) { push @{$$opts{files}}, $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); } if ( !exists($$opts{files}) ) { error("What files should be intersected?\n") } if ( !$$opts{force} ) { $SIG{__WARN__} = sub { error(@_); } } return $opts; } sub init_outdir { my ($opts,$prefix) = @_; if ( $prefix=~m{/} ) { # A directory should be created. This will populate dir and prefix, for example # prefix -> dir prefix # ---------------------------- # out out.dump # out/ out/ out/out.dump # out/xxx out/ out/xxx.dump # my $dir = ''; if ( $prefix=~m{/[^/]+$} ) { $dir=$`; } elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; } elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; } if ( $dir ) { `mkdir -p $dir`; } } return $prefix; } sub read_chrom_list { my ($fname) = @_; my @chroms; if ( -e $fname ) { open(my $chrms,'<',$fname) or error("$fname: $!"); while (my $line=<$chrms>) { chomp($line); push @chroms, $line; } close($chrms); } else { @chroms = split(/,/,$fname); } return (@chroms); } sub check_columns { my ($opts,$vcfs) = @_; # Do the check for VCF files only for (my $ivcf=0; $ivcf<@$vcfs; $ivcf++) { if ( !exists($$vcfs[$ivcf]{has_column}) ) { next; } for (my $jvcf=0; $jvcf<$ivcf; $jvcf++) { if ( !exists($$vcfs[$jvcf]{has_column}) ) { next; } if ( scalar @{$$vcfs[$ivcf]{columns}} != scalar @{$$vcfs[$jvcf]{columns}} ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The number of sample columns is 
different:\n", (@icols>9 ? scalar @icols - 9 : 0), ": ", join(',',@icols[9..$#icols]),"\n", scalar @jcols - 9, ": ", join(',',@jcols[9..$#jcols]),"\n", ); return; } for my $cname (keys %{$$vcfs[$ivcf]{has_column}}) { if ( !exists($$vcfs[$jvcf]{has_column}{$cname}) or $$vcfs[$ivcf]{has_column}{$cname}!=$$vcfs[$jvcf]{has_column}{$cname} ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The column names do not match (e.g. $cname):\n", join(',',@icols[9..$#icols]),"\n", join(',',@jcols[9..$#jcols]),"\n", ); return; } } for my $cname (keys %{$$vcfs[$jvcf]{has_column}}) { if ( !exists($$vcfs[$ivcf]{has_column}{$cname}) ) { my @icols = @{$$vcfs[$ivcf]{columns}}; my @jcols = @{$$vcfs[$jvcf]{columns}}; warn("Warning: The column names do not match (e.g. $cname):\n", join(',',@icols[9..$#icols]),"\n", join(',',@jcols[9..$#jcols]),"\n", ); return; } } } } } sub vcf_isec { my ($opts) = @_; $$opts{match} = {}; # Open the VCF files and initialize the list of chromosomes my @vcfs; my (@chroms,%has_chrom); if ( exists($$opts{chromosomes}) ) { @chroms = read_chrom_list($$opts{chromosomes}); } my $source; my $vcf_version; my $vcf_version_warned; for (my $ifile=0; $ifile<@{$$opts{files}}; $ifile++) { my $file = $$opts{files}[$ifile]; my ($vcf,$file_name); if ( ref($file) eq '' ) { $vcf = Vcf->new(file=>$file); $file_name = $file; } else { $vcf = $file; $file_name = $$file{file}; } $vcf->parse_header(); $vcf->close(); $$vcf{nread} = 0; push @vcfs, $vcf; # Check if the VCF versions are identical if ( ref($file) eq '' ) { if ( !defined $vcf_version ) { $vcf_version = $$vcf{version} } if ( $vcf_version ne $$vcf{version} && !$vcf_version_warned ) { warn("Warning: Mixed VCF format versions, use vcf-convert to unify.\n"); $vcf_version_warned = 1; } } # Update the list of known chromosomes if ( !exists($$opts{chromosomes}) ) { my $chrms = $vcf->get_chromosomes(); for my $chr (@$chrms) { if ( exists($has_chrom{$chr}) ) { next; } $has_chrom{$chr} = 
1; push @chroms, $chr; } } if ( $ifile ) { # To get the missig fields filled by the default values if ( !$vcfs[0]{delim} ) { for my $hline (@{$$vcf{header_lines}}) { $vcfs[0]->add_header_line($hline,silent=>1); } } $source .= ','; } $source .= "$ifile:$file_name"; $$vcf{vcf_isec_ID} = $ifile; } check_columns($opts,\@vcfs); $$opts{vcfs} = \@vcfs; if ( !$vcfs[0]{delim} && !$$opts{split} ) { $vcfs[0]->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcfs[0]->add_header_line({key=>'sourceFiles',value=>$source},append=>'timestamp'); $vcfs[0]->add_header_line({key=>'INFO',ID=>'SF',Number=>-1,Type=>'String',Description=>'Source File (index to sourceFiles, f when filtered)'},silent=>1); print $vcfs[0]->format_header(); } # Go through all the files simultaneously and get the stats. for my $chr (@chroms) { # Open files for my $vcf (@vcfs) { delete($$vcf{last_line}); $vcf->open(region=>$chr); delete($$vcf{eof}); } do_chrm_isec($opts,\@vcfs); } for my $vcf (@vcfs) { if ( !$$vcf{nread} ) { warn("Warning: Read 0 lines from $$vcf{file}, the tabix index may be broken.\n"); } } } sub do_chrm_isec { my ($opts,$vcfs) = @_; my $debug = $$opts{debug} ? 1 : 0; my $win = $$opts{win} ? $$opts{win} : 0; my $complement = $$opts{complement} ? 1 : 0; my $report_from_all = $$opts{report_from_all} ? 
1 : 0; my $nfiles = scalar @{$$opts{files}}; my $isec_nfiles = $nfiles; my $isec_op = '='; if ( exists($$opts{isec_nfiles}) ) { $isec_nfiles = $$opts{isec_nfiles}; $isec_op = $$opts{isec_op}; } my $split = $$opts{split}; while (1) { my $grp = read_next_group($opts,$vcfs,$win); if ( !$grp || !scalar @$grp ) { last } if ( $debug ) { print "Group:\n"; for my $rec (@$grp) { print "$$rec{chr}\t$$rec{pos}\t$$rec{vcf}{file}\n"; } print "\n"; } my %files; my %srcs; for my $rec (@$grp) { my $vcf = $$rec{vcf}; my $src = $$vcf{vcf_isec_ID}; push @{$files{$src}}, $rec; if ( !$$vcf{delim} ) { # This is a VCF, check filters my $fltr = $$rec{line}[6]; if ( !$split && $fltr ne $$vcf{filter_passed} && $fltr ne $$vcf{defaults}{default} ) { $src .= 'f'; } } $srcs{$$rec{pos}}{$src} = $rec; } if ( $split ) { write_line($opts,$grp,\%srcs); next; } my $nmatches = scalar keys %files; if ( $complement ) { my $src = $$vcfs[0]{vcf_isec_ID}; if ( !exists($files{$src}) ) { next; } if ( $nmatches!=1 ) { next; } } elsif ( $isec_op eq '=' && $isec_nfiles!=$nmatches ) { next; } elsif ( $isec_op eq '+' && $isec_nfiles>$nmatches ) { next; } elsif ( $isec_op eq '-' && $isec_nfiles<$nmatches ) { next; } # The hits are sorted by position in @$grp my ($prev_chr,$prev_pos,$prev_id); for my $rec (@$grp) { if ( !$report_from_all && $$rec{vcf}{vcf_isec_ID}!=0 ) { next; } elsif ( defined $prev_chr && $prev_chr eq $$rec{chr} && $prev_pos eq $$rec{pos} && $prev_id ne $$rec{vcf}{vcf_isec_ID} ) { next; } if ( !$$rec{vcf}{delim} ) { # This is a VCF file, add annotation my @tags = split(/;/,$$rec{line}[7]); my $i; for ($i=0; $i<@tags; $i++) { if ( $tags[$i] eq '.' or $tags[$i]=~/^SF=/ ) { last; } } my $src = join(',',sort keys %{$srcs{$$rec{pos}}}); $tags[$i] = 'SF='.$src; $$rec{line}[7] = join(';',@tags); print join("\t",@{$$rec{line}}) . 
"\n"; } else { print $$rec{line}; } $prev_chr = $$rec{chr}; $prev_pos = $$rec{pos}; $prev_id = $$rec{vcf}{vcf_isec_ID}; } } } sub write_line { my ($opts,$grp,$srcs) = @_; for my $hash (values %$srcs) { my $src = join('_',sort keys %$hash); if ( !exists($$opts{out_files}{$src}) ) { my $id = (sort keys %$hash)[0]; my $vcf = $$opts{vcfs}[$id]; $$opts{out_vcfs}{$src} = $vcf; $$opts{out_recs}{$src} = $id; open($$opts{out_files}{$src},"| bgzip -c > $$opts{prefix}$src.vcf.gz") or error("| bgzip -c > $$opts{prefix}$src.vcf.gz: $!"); if ( !exists($$opts{readme_fh}) ) { open($$opts{readme_fh},'>',"$$opts{prefix}_README") or error("$$opts{prefix}_README: $!"); print {$$opts{readme_fh}} "# This file was produced by vcf-isec. The command line was:\n#\t",join(' ',@{$$opts{args}}),"\n#\n"; } print {$$opts{readme_fh}} "Using file '$$opts{prefix}$src.vcf.gz' for records present in:\n"; for my $rec (sort values %$hash) { print {$$opts{readme_fh}} "\t$$rec{vcf}{file}\n"; } if ( !$$vcf{delim} ) { my $fnames = join(',',sort values %$hash); $vcf->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp'); $vcf->add_header_line({key=>'sourceFiles',value=>$fnames},append=>'timestamp'); print {$$opts{out_files}{$src}} $vcf->format_header(); } } } #use Data::Dumper; print Dumper($srcs); for my $pos (keys %$srcs) { my $src = join('_',sort keys %{$$srcs{$pos}}); my $fh = $$opts{out_files}{$src}; my $irec = $$opts{out_recs}{$src}; my $vcf = $$opts{out_vcfs}{$src}; my $rec = $$srcs{$pos}{$irec}; if ( !$$vcf{delim} ) { print $fh join("\t",@{$$rec{line}}) . "\n"; } else { print $fh $$rec{line}; } } } sub read_next_group { my ($opts,$vcfs,$win) = @_; my @grp; my $prev_vcf; my $start; while (1) { my $min_vcf = get_min_position($opts,$vcfs); # No more lines in the buffer? if ( !$min_vcf ) { last; } # Nothing new has been added? if ( $prev_vcf && $prev_vcf eq $$min_vcf{buf}[0] ) { last; } $prev_vcf = $$min_vcf{buf}[0]; # Read everything what falls in the window. 
The window moves to encompass complete clusters. if ( !$start or $start+$win >= $$min_vcf{buf}[0]{pos} ) { my $rec = shift(@{$$min_vcf{buf}}); push @grp,$rec; $start = $$rec{pos}; next; } } return \@grp; } # Return the minimum position across all opened files. If there is no line in the file's buffer, # advance to the next line. sub get_min_position { my ($opts,$vcfs) = @_; my ($min_pos,$min_vcf); for my $vcf (@$vcfs) { # Check if there is a line in the buffer, if not, read. If still empty, the file reached eof if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { read_line($opts,$vcf); } if ( !$$vcf{buf} or !scalar @{$$vcf{buf}} ) { next; } my $line = $$vcf{buf}[0]; # Designate this position as the minimum of all the files if: # .. is this the first file? if ( !$min_pos ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } # .. has this file lower position? if ( $min_pos>$$line{pos} ) { $min_pos = $$line{pos}; $min_vcf = $vcf; next; } } return $min_vcf; } # Read one line from a VCF or Reader, split it and save it to a buffer. sub read_line { my ($opts,$vcf) = @_; if ( $$vcf{eof} ) { return; } my $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } $$vcf{nread}++; my ($chr,$pos,$ref,$alt); if ( $$vcf{delim} ) { my @items = split($$vcf{delim},$line); # Reader object $chr = $items[$$vcf{chr}]; $pos = $items[$$vcf{pos}]; $ref = ''; $alt = ''; } else { # We are reading VCF, not a tab-delimited file. Apply filters when requested. my @items = split(/\t/,$line); while ( $$opts{apply_filters} && $items[6] ne 'PASS' && $items[6] ne '.' 
) { $line = $vcf->next_line(); if ( !$line ) { $$vcf{eof} = 1; return; } @items = split(/\t/,$line); } chomp($items[-1]); $chr = $items[0]; $pos = $items[1]; $ref = $items[3]; $alt = $items[4]; $line = \@items; } if ( $$vcf{buf} && @{$$vcf{buf}} ) { my $prev = $$vcf{buf}[-1]; if ( $$prev{pos} == $pos ) { warn("Position $chr:$pos appeared twice in $$vcf{file}\n"); } } push @{$$vcf{buf}}, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line, vcf=>$vcf }; return; } #--------------------------------- package Reader; use strict; use warnings; use Carp; sub new { my ($class,@args) = @_; my $self = @args ? {@args} : {}; bless $self, ref($class) || $class; if ( $$self{cmd} ) { $$self{file} = ''; open($$self{fh},$$self{cmd}) or $self->throw("$$self{cmd}: $!"); } if ( !$$self{file} && !$$self{fh} ) { $self->throw("Expected the file or fh option.\n"); } if ( !$$self{delim} ) { $$self{delim} = qr/\t/; } if ( !$$self{chr} ) { $$self{chr} = 0; } # the index of the chromosome column (indexed from 0) if ( !$$self{pos} ) { $$self{pos} = 1; } # the index of the position column return $self; } sub throw { my ($self,@msg) = @_; confess @msg; } sub open { my ($self,%args) = @_; if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); } $self->close(); if ( $$self{file}=~/\.gz$/i ) { if ( exists($args{region}) && defined($args{region}) ) { open($$self{fh},"tabix $$self{file} $args{region} |") or $self->throw("tabix $$self{file}: $!"); } else { open($$self{fh},"gunzip -c $$self{file} |") or $self->throw("gunzip -c $$self{file} |: $!"); } } else { open($$self{fh},'<',$$self{file}) or $self->throw("$$self{file}: $!"); } } sub close { my ($self) = @_; if ( !$$self{fh} ) { return; } close($$self{fh}); delete($$self{fh}); delete($$self{buffer}); } sub _unread_line { my ($self,$line) = @_; unshift @{$$self{buffer}}, $line; return; } sub next_line { my ($self) = @_; my $line; if ( $$self{buffer} && @{$$self{buffer}} ) { return shift(@{$$self{buffer}}); } return 
readline($$self{fh}); } sub parse_header { my ($self) = @_; $self->open(); while (1) { my $line = $self->next_line(); if ( !$line ) { last; } if ( $line=~/^#/ ) { push @{$$self{header}},$line; next; } $self->_unread_line($line); last; } } sub format_header { my ($self) = @_; if ( $$self{header} ) { return join('',@{$$self{header}}); } return ''; } sub get_chromosomes { my ($self) = @_; if ( !$$self{file} ) { $self->throw(qq[The parameter "file" not set.\n]); } my (@out) = `tabix -l $$self{file}`; if ( $? ) { $self->throw(qq[The command "tabix -l $$self{file}" exited with an error. Is the file tabix indexed?\n]); } for (my $i=0; $i<@out; $i++) { chomp($out[$i]); } return \@out; } vcftools_0.1.11/perl/vcf-to-tab0000755000000000000000000000453012156354770015057 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); convert_to_tab($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-to-tab [OPTIONS] < in.vcf > out.tab\n", "Options:\n", " -h, -?, --help This help message.\n", " -i, --iupac Use one-letter IUPAC codes\n", "\n"; } sub parse_params { my $opts = { iupac=>0 }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-i' || $arg eq '--iupac' ) { $$opts{iupac}=1; next; } error("Unknown parameter \"$arg\". Run -h for help.\n"); } if ( $$opts{iupac} ) { $$opts{iupac} = { 'GG' => 'G', 'CC' => 'C', 'TT' => 'T', 'AA' => 'A', 'GT' => 'K', 'TG' => 'K', 'AC' => 'M', 'CA' => 'M', 'CG' => 'S', 'GC' => 'S', 'AG' => 'R', 'GA' => 'R', 'AT' => 'W', 'TA' => 'W', 'CT' => 'Y', 'TC' => 'Y', '..' 
=> '.', }; } return $opts; } sub convert_to_tab { my ($opts) = @_; my $iupac; if ( $$opts{iupac} ) { $iupac=$$opts{iupac}; } my $vcf = Vcf->new(fh=>\*STDIN); $vcf->parse_header(); my $header_printed=0; while (my $x=$vcf->next_data_hash()) { if ( !$header_printed ) { print "#CHROM\tPOS\tREF"; for my $col (sort keys %{$$x{gtypes}}) { print "\t$col"; } print "\n"; $header_printed = 1; } print "$$x{CHROM}\t$$x{POS}\t$$x{REF}"; for my $col (sort keys %{$$x{gtypes}}) { my ($al1,$sep,$al2) = exists($$x{gtypes}{$col}{GT}) ? $vcf->parse_alleles($x,$col) : ('.','/','.'); my $gt = $al1.'/'.$al2; if ( $iupac ) { if ( !exists($$iupac{$gt}) ) { error(qq[Unknown IUPAC code for "$al1$sep$al2" .. $$x{CHROM}:$$x{POS} $col\n]); } $gt = $$iupac{$gt}; } print "\t".$gt; } print "\n"; } } vcftools_0.1.11/perl/vcf-sort0000755000000000000000000000511412156354770014657 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; my $opts = parse_params(); sort_vcf($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-sort > out.vcf\n", " cat file.vcf | vcf-sort > out.vcf\n", "Options:\n", " -c, --chromosomal-order Use natural ordering (1,2,10,MT,X) rather then the default (1,10,2,MT,X). This requires\n", " new version of the unix \"sort\" command which supports the --version-sort option.\n", " -t, --temporary-directory Use a directory other than /tmp as the temporary directory for sorting.\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = {}; while (my $arg=shift(@ARGV)) { if ( $arg eq '-c' || $arg eq '--chromosomal-order' ) { $$opts{chromosomal_order}=1; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( $arg eq '-t' || $arg eq '--temporary-directory' ) { $$opts{temp_dir}=shift(@ARGV); next; } if ( -e $arg ) { $$opts{file}=$arg; next } error("Unknown parameter \"$arg\". 
Run -h for help.\n"); } return $opts; } sub sort_vcf { my ($opts) = @_; my $fh; if ( exists($$opts{file}) ) { if ( $$opts{file}=~/\.gz$/i ) { open($fh,"gunzip -c $$opts{file} |") or error("$$opts{file}: $!"); } else { open($fh,'<',$$opts{file}) or error("$$opts{file}: $!"); } } else { $fh = *STDIN; } my $sort_opts = check_sort_options($opts); my $cmd; if ( exists($$opts{temp_dir}) ) { $cmd = "sort $sort_opts -T $$opts{temp_dir} -k2,2n"; } else { $cmd = "sort $sort_opts -k2,2n"; } open(my $sort_fh,"| $cmd") or error("$cmd: $!"); my $unflushed = select(STDOUT); $| = 1; while (my $line=<$fh>) { if ( $line=~/^#/ ) { print $line; next; } print $sort_fh $line; last; } select($unflushed); while (my $line=<$fh>) { print $sort_fh $line; } } sub check_sort_options { my ($opts) = @_; if ( !$$opts{chromosomal_order} ) { return '-k1,1d' } my @has_version_sort = `sort --help | grep -- --version-sort`; if ( scalar @has_version_sort ) { return '-k1,1V'; } error("Old version of sort command installed, please run without the -c option.\n"); return '-k1,1d'; } vcftools_0.1.11/perl/vcf-stats0000755000000000000000000001141212156354770015024 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use VcfStats; my $opts = parse_params(); vcf_stats($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "Usage: vcf-stats [OPTIONS] file.vcf.gz\n", "Options:\n", " -d, --dump Take an existing dump file and recreate the files (works with -p)\n", " -f, --filters List of filters such as column/field (any value), column/field=bin:max (cluster in bins),column/field=value (exact value)\n", " -p, --prefix Prefix of output files. If slashes are present, directories will be created.\n", " -s, --samples Process only the listed samples, - for none. 
Excluding unwanted samples may increase performance considerably.\n", " -h, -?, --help This help message.\n", "\n", "Examples:\n", " # Calculate stats separately for the filter field, quality and non-indels\n", " vcf-stats file.vcf.gz -f FILTER,QUAL=10:200,INFO/INDEL=False -p out/\n", "\n", " # Calculate stats for all samples\n", " vcf-stats file.vcf.gz -f FORMAT/DP=10:200 -p out/\n", "\n", " # Calculate stats only for the sample NA00001\n", " vcf-stats file.vcf.gz -f SAMPLE/NA00001/DP=1:200 -p out/\n", "\n", " vcf-stats file.vcf.gz > perl.dump\n", "\n"; } sub parse_params { my $opts = { filters=>{}, filter_param=>'' }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-d' || $arg eq '--dump' ) { $$opts{dump}=shift(@ARGV); next; } if ( $arg eq '-f' || $arg eq '--filters' ) { $$opts{filter_param}=shift(@ARGV); next; } if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } if ( $arg eq '-s' || $arg eq '--samples' ) { my $samples = shift(@ARGV); $$opts{samples} = [ split(/,/,$samples) ]; next; } if ( -e $arg ) { $$opts{file} = $arg; next } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter or nonexistent file: \"$arg\". Run -h for help.\n"); } if ( exists($$opts{dump}) && !exists($$opts{prefix}) ) { error("Expected -p option with -d.\n"); } return $opts; } sub init_filters { my ($opts,$vcf) = @_; for my $filter (split(/,/,$$opts{filter_param})) { my ($key,$value) = split(/=/,$filter); my $rec = { value=>$value, exact=>0, any=>0, bin=>0, is_flag=>0 }; if ( $key=~m{^INFO/} ) { my $tag = $'; $$rec{tag} = $tag; if ( exists($$vcf{header}{'INFO'}) && exists($$vcf{header}{'INFO'}{$tag}) && $$vcf{header}{'INFO'}{$tag}{Type} eq 'Flag' ) { $$rec{is_flag} = 1; $$rec{value} = $value eq 'False' ? 0 : 1; $key = "INFO/$tag=". ($$rec{value} ? 
'True':'False'); } } elsif ( $key eq 'INFO' ) { # All INFO flags should be counted for my $tag (keys %{$$vcf{header}{'INFO'}}) { if ( $$vcf{header}{'INFO'}{$tag}{Type} ne 'Flag' ) { next; } $$opts{filters}{"INFO/$tag=True"} = { %$rec, is_flag=>1, value=>1, tag=>$tag }; } next; } if ( ! defined $value ) { $$rec{any} = 1; } elsif ( $value=~/^(.+):(.+)$/ ) { $$rec{bin} = 1; $$rec{bin_size} = $1; $$rec{max} = $2; } else { $$rec{exact} = 1; } $$opts{filters}{$key} = $rec; } } sub vcf_stats { my ($opts) = @_; if ( exists($$opts{dump}) ) { # Use existing dump to recreate the files my $vcf = VcfStats->new(file=>'/dev/null'); $$vcf{stats} = do $$opts{dump}; $vcf->save_stats($$opts{prefix}); return; } # Open the VCF file my $vcf = $$opts{file} ? VcfStats->new(file=>$$opts{file}) : VcfStats->new(fh=>\*STDIN); $vcf->parse_header(); init_filters($opts,$vcf); # Include only requested samples if ( exists $$opts{samples} ) { my @include = (); if ( scalar @{$$opts{samples}}>1 or $$opts{samples}[0] ne '-' ) { for my $sample (@{$$opts{samples}}) { push @include,$sample; } } $vcf->set_samples(include=>\@include); } while (my $rec=$vcf->next_data_hash()) { $vcf->collect_stats($rec,$$opts{filters}); } $vcf->save_stats($$opts{prefix}); } vcftools_0.1.11/perl/ChangeLog0000644000000000000000000001076312156354770014746 0ustar rootroot2012-05-02 15:53 petr.danecek@sanger * vcf-consensus * vcf-indel-stats * vcf-compare: handle spaces in file names 2012-02-23 09:45 petr.danecek@sanger * vcf-merge: redundant ALT alleles are no longer removed by default but only with -t. * vcf-annotate: - set the FILTER column, remove and annotate in one go (e.g. 
ID) - support of genotype columns in user filters - new --fill-type option 2012-01-23 10:41 petr.danecek@sanger * Notable changes since the last release: - fill-fs: new script for annotating VCFs with flanking sequence - fill-ref-md5: new script for annotating VCFs with 'reference' and 'contig' tags recommended by VCFv4.1 - vcf-annotate: now also removes annotations and can apply user-defined filters - vcf-compare: changed output format, more stats reported and plots the results - vcf-fix-newlines: new script for fixing newline representation - vcf-phased-join: new script for joining pre-phased VCFs - vcf-query: significant speed up for some type of queries - vcf-sort: chromosomal ordering (1,2,10,MT,X rather than 1,10,2,MT,X) with new versions of unix sort - Vcf.pm: new set of API methods for faster access - some of the tools now work also with remote files 2011-04-04 14:00 petr.danecek@sanger * VCFtools now support VCFv4.1 * fill-ref-md5: New tool backfilling sequence MD5s into VCF header * Renamed merge-vcf, compare-vcf etc. to consistent naming vcf-merge, vcf-compare * vcf-merge: Now merging also GL and other Number=[AG] tags * vcf-compare: Comparing indel haplotypes 2011-02-21 12:31 petr.danecek@sanger * vcf-stats: new -s option to speed up parsing when stats computed for selected samples only * merge-vcf: allow to merge arbitrary chunks; -c option now deprecated, use -r instead * compare-vcf: change in output format and more detailed comparison 2011-02-17 17:36 petr.danecek@sanger * vcf-stats: allow querying stats of individual samples 2011-02-16 12:07 petr.danecek@sanger * vcf-stats: major revision * vcf-annotate: more filtering options 2011-02-04 14:43 petr * merge-vcf: if possible, calculate AC,AN even for sites without genotypes 2011-02-03 15:04 petr * merge-vcf: fixed a bug introduced by the previous fix. 2011-02-02 21:02 petr * merge-vcf: fixed a bug in merging indel ALTs. Only VCFs without samples were affected. 
2011-01-28 15:38 petr * vcf-subset: new option for printing rows with calls private to the subset group 2011-01-24 13:38 petr * Vcf.pm: uppercase floating point number expressions (such as 1.0382033E-6) now pass validation 2011-01-20 08:28 petr * vcf-concat: print header also for empty VCFs with the -s option 2011-01-04 08:59 petr * vcf-isec, vcf-sort, Vcf.pm: replaced "zcat" by "gunzip -c" 2010-12-22 14:18 petr * vcf-annotate: New --SnpCluster option * Vcf.pm: new sub add_filter() 2010-12-15 13:44 petr * vcf-isec: By default output records from all files with unique positions (duplicate records from the same file still should be printed). With the -o switch, only positions from the left-most file will be printed. 2010-12-09 14:48 petr * query-vcf: Output 'True' for Flag tags when present and . when absent * vcf-annotate: Fix: the command line eats quotes when they are not escaped 2010-12-08 12:06 petr * Vcf.pm: throw an error when tabix fails. * query-vcf: enable streaming of files when region is not specified. 2010-12-02 11:53 petr * Vcf.pm: allow ALT alleles which are not present in samples * vcf-isec: Multiple files can be created simultaneously with all possible isec combinations. Suitable for Venn Diagram analysis. * merge-vcf: Do not remove ALT alleles if no samples are present * merge-vcf: Do FILTER merging more intelligently. * merge-vcf: Join the QUAL column: use average value weighted by the number of samples. 2010-11-28 08:34 petr * vcf-concat: Partial sort * vcf-validator: Added -u option * VcfStats.pm: dump_counts 2010-11-27 13:04 petr * vcf-subset: Filter variants by type 2010-11-26 09:08 petr * vcf-annotate: Added possibility to read header descriptions from a file 2010-11-24 13:25 petr * Fix in Vcf.pm:fill_ref_alt_mapping. VCF files processed with merge-vcf were affected when containing IDs in the ALT column. 2010-11-23 13:12 petr * Major revamp of Vcf.pm to allow better inheritance. Problems likely. 
vcftools_0.1.11/perl/VcfStats.pm0000644000000000000000000004334312156354770015267 0ustar rootroot# # Author: petr.danecek@sanger # =head1 NAME VcfStats.pm. Module for collecting stats from VCF files. =head1 SYNOPSIS use VcfStats; my $vstats = VcfStats->new(file=>'example.vcf.gz'); while (my $x=$vstats->next_data_hash()) { $vstats->collect_stats($x); } $vstats->dump(); =cut package VcfStats; use strict; use warnings; use Carp; use Data::Dumper; use base 'Vcf'; =head2 new About : Creates new VcfStats. Usage : my $vstats = VcfStats->new(file=>'my.vcf'); Args : See Vcf.pm =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); for my $version (@{$$self{versions}}) { if ( $self->isa($version) ) { eval "use base '$version'"; } } bless($self,$class); return $self; } sub parse_header { my ($self,@args) = @_; $self->SUPER::parse_header(@args); } =head2 get_stats_key About : Creates relevant stats hash key, used by select_stats Usage : Args [1]: Hash with filter definition (value to match, range, etc.) [2]: Prefix of the stat [3]: Value of the filter =cut sub get_stats_key { my ($self,$filter,$key,$value) = @_; my $stat_key; if ( $$filter{exact} ) { if ( $value ne $$filter{value} ) { next; } $stat_key = $key.'/'.$value; } elsif ( $value eq '.' 
) { $stat_key = $key.'/.'; } elsif ( $$filter{any} ) { $stat_key = $key.'/'.$value; } elsif ( $$filter{bin} ) { my $bin = int($value/$$filter{bin_size}) * $$filter{bin_size}; if ( $bin>$$filter{max} ) { $bin=">$$filter{max}"; } $stat_key = $key.'/'.$bin; } else { $self->throw("TODO: $key...\n"); } return $stat_key; } =head2 select_stats About : Selects relevant stats hashes Usage : Args [1]: Hash record from next_data_hash [2]: Filters =cut sub select_stats { my ($self,$rec,$filters) = @_; if ( !exists($$self{stats}{all}) ) { $$self{stats}{all}={}; } my @mandatory = ( $$self{stats}{all} ); my %samples; for my $sample (keys %{$$rec{gtypes}}) { if ( !exists($$self{stats}{samples}{$sample}) ) { $$self{stats}{samples}{$sample} = {}; } push @{$samples{$sample}}, $$self{stats}{samples}{$sample}; } if ( !defined $filters ) { return (\@mandatory,\%samples); } while (my ($key,$filter) = each %$filters) { if ( $key eq 'FILTER' ) { for my $value (@{$$rec{FILTER}}) { my $stats_key = $self->get_stats_key($filter,$key,$value); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } } elsif ( $key eq 'QUAL' ) { my $stats_key = $self->get_stats_key($filter,$key,$$rec{QUAL}); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } elsif ( $key=~m{^INFO/} ) { if ( $$filter{is_flag} ) { if ( $$filter{value} && !exists($$rec{INFO}{$$filter{tag}}) ) { next; } elsif ( !$$filter{value} && exists($$rec{INFO}{$$filter{tag}}) ) { next; } if ( !exists($$self{stats}{$key}) ) { $$self{stats}{$key}={}; } push @mandatory, $$self{stats}{$key}; next; } elsif ( exists($$rec{INFO}{$$filter{tag}}) ) { my $stats_key = $self->get_stats_key($filter,$key,$$rec{INFO}{$$filter{tag}}); if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; } push @mandatory, $$self{stats}{$stats_key}; } } elsif ( $key=~m{^FORMAT/([^/]+)$} ) { while (my ($sample,$hash) = each 
%{$$rec{gtypes}}) { if ( !exists($$hash{$1}) ) { next; } my $stats_key = $self->get_stats_key($filter,$1,$$hash{$1}); if ( !exists($$self{stats}{samples}{$sample}{user}{$stats_key}) ) { $$self{stats}{samples}{$sample}{user}{$stats_key}={}; } push @{$samples{$sample}}, $$self{stats}{samples}{$sample}{user}{$stats_key}; } } elsif ( $key=~m{^SAMPLE/([^/]+)/([^/]+)$} ) { if ( !exists($$rec{gtypes}{$1}{$2}) ) { next; } my $stats_key = $self->get_stats_key($filter,$2,$$rec{gtypes}{$1}{$2}); if ( !exists($$self{stats}{samples}{$1}{user}{$stats_key}) ) { $$self{stats}{samples}{$1}{user}{$stats_key}={} } push @{$samples{$1}}, $$self{stats}{samples}{$1}{user}{$stats_key}; } else { $self->throw("The feature currently not recognised: $key.\n"); } } return (\@mandatory,\%samples); } =head2 collect_stats About : Collect stats Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats($x); Args : =cut sub collect_stats { my ($self,$rec,$filters) = @_; # Ts/Tv and custom numbers based on INFO, QUAL etc. for the mandatory columns my ($mandatory_stats,$sample_stats) = $self->select_stats($rec,$filters); $self->collect_stats_mandatory($rec,$mandatory_stats); # Ts/Tv for samples while (my ($sample,$stats) = each %$sample_stats) { $self->collect_stats_sample($rec,$sample,$stats); } my %type_keys = ( r=>'ref', s=>'snp', i=>'indel' ); # Private calls and the number of shared SNPs. 
Check if: # - there is a nonref variant present only in this sample (samples->sample_name->private) # - there is a nonref variant in N samples (samples->all->shared) # - there is a non-empty call (samples->sample_name->count) my $shared = 0; my $sample_name; while (my ($sample,$stats) = each %$sample_stats) { my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample); if ( $is_empty ) { next; } my $is_hom=1; my %types; my $is_ref = 1; for my $al (@$alleles) { if ( $$alleles[0] ne $al ) { $is_hom=0; } my ($type,$len,$ht) = $self->event_type($rec,$al); $types{$type} = 1; if ( $type eq 'r' ) { next; } $is_ref = 0; } for my $stat (@$stats) { $$stat{count}++; } for my $type (keys %types) { my $key = exists($type_keys{$type}) ? $type_keys{$type} : 'other'; $key .= '_count'; for my $stat (@$stats) { $$stat{$key}++; } } my $key; if ( exists($types{r}) ) { if ( $is_hom ) { $key='hom_RR'; } else { $key='het_RA' } } elsif ( $is_hom ) { $key='hom_AA'; } else { $key='het_AA'; } $key .= '_count'; for my $stat (@$stats) { $$stat{$key}++; } $key = $is_phased ? 'phased' : 'unphased'; for my $stat (@$stats) { $$stat{$key}++; } if ( $is_ref ) { next; } $shared++; if ( !defined $sample_name ) { $sample_name = $sample; } } $$self{stats}{all}{shared}{$shared}++; if ( $shared==1 ) { for my $stat (@{$$sample_stats{$sample_name}}) { $$stat{private}++; } } } =head2 collect_stats_mandatory About : Collect stats based on mandatory columns Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats_mandatory($x); Args : =cut sub collect_stats_mandatory { my ($self,$rec,$stats) = @_; # How many mono,bi,tri-allelic etc sites are there my $nalt = 0; if ( !scalar keys %{$$rec{gtypes}} ) { $nalt = scalar @{$$rec{ALT}}; if ( $nalt==1 && $$rec{ALT}[0] eq '.' 
) { $nalt=0 } } elsif ( exists($$rec{INFO}{AC}) ) { for my $ac (split(/,/,$$rec{INFO}{AC})) { if ( $ac ) { $nalt++; } } } else { my ($an,$ac,$acs) = $self->calc_an_ac($$rec{gtypes}); for my $ac (@$acs) { if ( $ac ) { $nalt++; } } } my %types; for my $alt (@{$$rec{ALT}}) { if ( $alt eq '.' ) { $alt=$$rec{REF}; } my $type = $self->add_variant($rec,$alt,$stats); $types{$type} = 1; } # Increment counters for my $stat (@$stats) { $$stat{'nalt_'.$nalt}++; $$stat{count}++; for my $type (keys %types) { $$stat{$type.'_count'}++; } } } =head2 collect_stats_sample About : Collect stats for given sample Usage : my $x=$vstats->next_data_hash(); $vstats->collect_stats_sample($x,'NA0001'); Args [1] hash row from next_data_hash [2] sample name [3] stats to collect =cut sub collect_stats_sample { my ($self,$rec,$sample,$stats) = @_; my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample); if ( @$alleles > 2 ) { $self->throw("FIXME: currently handling diploid data only (easy to fix)\n"); } my $prev; for my $al (@$alleles) { if ( !defined $prev or $prev ne $al ) { # Only heterozygous SNPs will be counted twice $self->add_variant($rec,$al,$stats); } $prev = $al; } } =head2 add_variant About : Register mutation type in the selected pool Usage : $vstats->add_variant('A','AT',$stats); $vstats->add_variant($rec,'AT',$stats); Args [1] Reference haplotype or VCF data line parsed by next_data_hash [2] Variant haplotype [3] Array of hash stats Returns : The event type (snp,indel,ref) =cut sub add_variant { my ($self,$ref,$alt,$stats) = @_; my $key_type = 'other'; my %key_subt; if ( $alt eq '.' ) { $key_type = 'missing'; } else { my ($type,$len,$ht) = $self->event_type($ref,$alt); if ( $type eq 's' ) { $key_type = 'snp'; # The SNP can be encoded for example as GTTTTTTT>CTTTTTTT my $ref_str = ref($ref) eq 'HASH' ? 
$$ref{REF} : $ref; my $ref_len = length($ref_str); if ( $ref_len>1 ) { for (my $i=0; $i<$ref_len; $i++) { my $ref_nt = substr($ref_str,$i,1); my $alt_nt = substr($alt,$i,1); if ( $ref_nt ne $alt_nt ) { $key_subt{$ref_nt.'>'.$alt_nt}++; } } } else { $key_subt{$ref_str.'>'.$alt}++; } } elsif ( $type eq 'i' ) { $key_type = 'indel'; $key_subt{$len}++; } elsif ( $type eq 'r' ) { $key_type = 'ref'; } } for my $stat (@$stats) { if ( %key_subt ) { while (my ($subt,$value)=each %key_subt) { $$stat{$key_type}{$subt}+=$value; } } else { $$stat{$key_type}++; } } return $key_type; } =head2 dump About : Produce Data::Dumper dump of the collected stats Usage : Args : Returns : The dump. =cut sub dump { my ($self) = @_; return Dumper($$self{stats}); } sub _calc_tstv { my ($self,$stat) = @_; my $ts = 0; for my $mut (qw(A>G G>A C>T T>C)) { if ( exists($$stat{$mut}) ) { $ts += $$stat{$mut}; } } my $tv = 0; for my $mut (qw(A>C C>A G>T T>G A>T T>A C>G G>C)) { if ( exists($$stat{$mut}) ) { $tv += $$stat{$mut}; } } my $ratio = $tv ? $ts/$tv : 0; return ($ts,$tv,$ratio); } =head2 dump_tstv About : Calculate transitions/transversions ratio and output string Usage : Args : Returns : Formatted string =cut sub dump_tstv { my ($self,$stats) = @_; my $out = "#Transitions\tTransversions\tts/tv\tSample\n"; for my $key (sort keys %$stats) { if ( !exists($$stats{$key}{snp}) ) { next; } my $stat = $$stats{$key}{snp}; my ($ts,$tv,$ratio) = $self->_calc_tstv($stat); $out .= sprintf "%d\t%d\t%.2f\t%s\n", $ts,$tv,$ratio,$key; } return $out; } =head2 dump_qual_tstv About : Calculate marginal transitions/transversions ratios for QUAL/* stats Usage : Args : Returns : Formatted string =cut sub dump_qual_tstv { my ($self,$file) = @_; my @values; for my $stat (keys %{$$self{stats}}) { if ( !($stat=~m{^QUAL/(.+)}) ) { next; } my $qual = $1; # The quality record can be also of the form ">200". 
Exclude these from numeric comparison if ( !($qual=~/^[0-9.]+$/) ) { $qual = "#$qual"; } my $count = $$self{stats}{$stat}{count}; if ( !exists($$self{stats}{$stat}{snp}) ) { next; } my ($ts,$tv,$ratio) = $self->_calc_tstv($$self{stats}{$stat}{snp}); push @values, [$qual,$count,$ratio]; } my @svalues = sort { if ($$a[0]=~/^#/ or $$b[0]=~/^#/) { return $$a[0] cmp $$b[0]; } return $$a[0] <=> $$b[0]; } @values; my $out = "#Quality\tMarginal count\tMarginal Ts/Tv\n"; for my $val (@svalues) { if ( $$val[0]=~/^#/ ) { $out .= sprintf "%s\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2]; } else { $out .= sprintf "%.2f\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2]; } } return $out; } =head2 dump_counts About : Usage : Args : Returns : Formatted string =cut sub dump_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{count}) ) { next; } $out .= sprintf "%d\t%s\n", $$self{stats}{$key}{count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{count}) ) { next; } $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{count},$key; } return $out; } sub dump_snp_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{snp_count}) ) { next; } $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{snp_count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{snp_count}) ) { next; } $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{snp_count},$key; } return $out; } sub dump_indel_counts { my ($self) = @_; my $out = "#Count\tFilter\n"; for my $key (sort keys %{$$self{stats}}) { if ( !exists($$self{stats}{$key}{indel_count}) ) { next; } $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{indel_count},$key; } for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{indel_count}) ) { next; } $out .= 
sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{indel_count},$key; } return $out; } sub dump_shared_counts { my ($self) = @_; my $out = "#Shared SNPs\tFrequency\n"; for my $key (sort {$a<=>$b} keys %{$$self{stats}{all}{shared}}) { $out .= sprintf "%d\t%s\n", $key,$$self{stats}{all}{shared}{$key}; } return $out; } sub dump_private_counts { my ($self) = @_; my $out = "#Private SNPs\tSample\n"; for my $key (sort keys %{$$self{stats}{samples}}) { if ( !exists($$self{stats}{samples}{$key}{private}) ) { next; } $out .= sprintf "%d\t%s\n", $$self{stats}{samples}{$key}{private},$key; } return $out; } sub _init_path { my ($self,$prefix) = @_; if ( $prefix=~m{/} ) { # A directory should be created. This will populate dir and prefix, for example # prefix -> dir prefix # ---------------------------- # out out.dump # out/ out/ out/out.dump # out/xxx out/ out/xxx.dump # my $dir = ''; if ( $prefix=~m{/[^/]+$} ) { $dir=$`; } elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; } elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; } if ( $dir ) { `mkdir -p $dir`; } } return $prefix; } sub legend { my ($self) = @_; return q[ count Number of positions with known genotype nalt_X Number of monoallelic (X=0), biallelic (X=1), etc. sites ref, ref_count Number of sites containing reference allele shared Number of sites having a non-reference allele in 0,1,2,etc samples snp_count Number of positions with SNPs ]; } =head2 save_stats About : Save all collected stats to files Usage : Args : The prefix of output files. Non-existent directories will be created. 
Returns : N/A =cut sub save_stats { my ($self,$prefix) = @_; if ( !defined $prefix ) { print $self->dump(); return; } my $path = $self->_init_path($prefix); $self->_write_file($path.'.legend', $self->legend()); $self->_write_file($path.'.dump', $self->dump()); $self->_write_file($path.'.tstv', $self->dump_tstv($$self{stats})); $self->_write_file($path.'.counts', $self->dump_counts()); $self->_write_file($path.'.snps', $self->dump_snp_counts()); $self->_write_file($path.'.indels', $self->dump_indel_counts()); $self->_write_file($path.'.qual-tstv',$self->dump_qual_tstv); $self->_write_file($path.'.shared',$self->dump_shared_counts()); $self->_write_file($path.'.private',$self->dump_private_counts()); if ( exists($$self{stats}{samples}) ) { $self->_write_file($path.'.samples-tstv',$self->dump_tstv($$self{stats}{samples})); } } sub _write_file { my ($self,$fname,$text) = @_; open(my $fh,'>',$fname) or $self->throw("$fname: $!"); print $fh $text; close($fh); } 1; vcftools_0.1.11/perl/vcf-query0000755000000000000000000003654312156354770015047 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # use strict; use warnings; use Carp; use Vcf; my $opts = parse_params(); read_data($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { confess @msg; } die "Usage: vcf-query [OPTIONS] file.vcf.gz\n", "Options:\n", " -c, --columns List of comma-separated column names or one column name per line in a file.\n", " -f, --format The default is '%CHROM:%POS\\t%REF[\\t%SAMPLE=%GT]\\n'\n", " -l, --list-columns List columns.\n", " -r, --region chr:from-to Retrieve the region. (Runs tabix.)\n", " --use-old-method Use old version of API, which is slow but more robust.\n", " -h, -?, --help This help message.\n", "Expressions:\n", " %CHROM The CHROM column (similarly also other columns)\n", " %GT Translated genotype (e.g. C/A)\n", " %GTR Raw genotype (e.g. 
0/1)\n", " %INFO/TAG Any tag in the INFO column\n", " %LINE Prints the whole line\n", " %SAMPLE Sample name\n", " [] The brackets loop over all samples\n", " %* All format fields printed as KEYVALUE\n", "Examples:\n", " vcf-query file.vcf.gz 1:1000-2000 -c NA001,NA002,NA003\n", " vcf-query file.vcf.gz -r 1:1000-2000 -f '%CHROM:%POS\\t%REF\\t%ALT[\\t%SAMPLE:%*=,]\\n'\n", " vcf-query file.vcf.gz -f '[%GT\\t]%LINE\\n'\n", " vcf-query file.vcf.gz -f '[%GT\\ ]%LINE\\n'\n", " vcf-query file.vcf.gz -f '%CHROM\\_%POS\\t%INFO/DP\\t%FILTER\\n'\n", "\n"; } sub parse_params { my $opts = { columns=>'', format_string=>"%CHROM:%POS\t%REF[\t%SAMPLE=%GT]\n" }; while (defined(my $arg=shift(@ARGV))) { if ( $arg eq '--use-old-method' ) { $$opts{use_old_method}=1; next } if ( $arg eq '-f' || $arg eq '--format' ) { $$opts{format_string}=shift(@ARGV); next } if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{columns}=shift(@ARGV); next } if ( $arg eq '-l' || $arg eq '--list-columns' ) { $$opts{list_columns}=1; next } if ( $arg eq '-r' || $arg eq '--region' ) { $$opts{region}=shift(@ARGV); next } if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } if ( !exists($$opts{region}) && exists($$opts{file}) && ($arg=~/^[^:]+:[0-9,]+-[0-9,]+$/ or $arg=~/^[^\:]+$/) ) { $$opts{region}=$arg; next; } error("Unknown parameter or non-existent file \"$arg\". 
Run -h for help.\n"); } if ( !exists($$opts{file}) && exists($$opts{region}) ) { error("The region cannot be used when streaming the file.\n"); } if ( exists($$opts{columns}) && -e $$opts{columns} ) { my @cols; open(my $fh,'<',$$opts{columns}) or error("$$opts{columns}: $!"); while (my $line=<$fh>) { if ( $line=~/^\s*$/ ) { next; } $line =~ s/^\s*//; $line =~ s/\s*$//; push @cols, $line; } close($fh); $$opts{columns} = join(',', @cols); } return $opts; } sub parse_format_string { my ($str,$hash) = @_; my (@arr,%idx,$join1,$join2); $str =~ s/\\n/\n/g; $str =~ s/\\t/\t/g; while ($str) { if ( !($str=~/%/) ) { push @arr,$str; last; } my $before = $`; $str = $'; my $match; if ( $str=~/^[*](.)(.)/ ) { $match = '*'; $join1=$1; $join2=$2; } elsif ( $str=~m{([A-Za-z0-9/_]+)} ) { $match = $1; } else { error("FIXME: $str"); } if ( defined $before && $before ne '' ) { push @arr,$before; } push @arr,'.'; # If the tag is not present in the VCF, a missing value ('.') will be printed instead. if ( exists($idx{$match}) ) { warn("The tag \"$match\" given multiple times, only the last occurance will be used\n"); } $idx{$match} = $#arr; $str = $'; } for (my $i=0; $i<@arr; $i++) { $arr[$i] =~ s/\\{1}//g; } $$hash{format} = \@arr; $$hash{idx} = \%idx; $$hash{join1} = $join1; $$hash{join2} = $join2; } sub parse_format { my ($opts,$cols) = @_; $$opts{before} = {}; $$opts{repeat} = {}; $$opts{after} = {}; my ($before,$repeat,$after); my $str = $$opts{format_string}; $before = $str; if ( $str=~/\[([^\]]+)\]/ ) { $before = $`; $repeat = $1; $after = $'; } if ( $before ) { parse_format_string($before,$$opts{before}); } if ( $repeat ) { parse_format_string($repeat,$$opts{repeat}); } if ( $after ) { parse_format_string($after,$$opts{after}); } } sub copy_array { my ($arr) = @_; my @out; for my $item (@$arr) { push @out,$item; } return @out; } sub get_columns { my ($vcf) = @_; my @cols = (); my $ncols = @{$$vcf{columns}}; for (my $i=9; $i<$ncols; $i++) { push @cols, $$vcf{columns}[$i]; } return 
\@cols; } sub get_sample_idxs { my ($vcf,@samples) = @_; my @idxs; for my $sample (@samples) { if ( !exists($$vcf{has_column}{$sample}) ) { error("No such sample: [$sample]\n"); } push @idxs, $$vcf{has_column}{$sample} - 1; } return @idxs; } sub list_columns { my ($opts) = @_; my $cols = get_columns($$opts{vcf}); for my $col (@$cols) { print "$col\n"; } } sub read_data { my ($opts) = @_; if ( exists($$opts{use_old_method}) ) { read_data_slow_hash($opts); return; } my %args = ( print_header=>1 ); if ( $$opts{region} ) { $args{region} = $$opts{region}; } if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } else { $args{fh} = \*STDIN; } my $vcf = Vcf->new(%args); $$opts{vcf} = $vcf; $vcf->parse_header(); if ( $$opts{list_columns} ) { list_columns($opts); exit; } my @cols = split(/,/,$$opts{columns}); if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; } my @sample_idxs = get_sample_idxs($$opts{vcf},@cols); # The hash opts will be filled with the keys 'before','repeat','after' with formatting information parse_format($opts); while (my $line=$vcf->next_line()) { my $x = $vcf->next_data_array($line); # Fill everything what comes before the repeat [] if ( $$opts{before} ) { my (@out) = copy_array($$opts{before}{format}); while (my ($fieldname,$idx) = each %{$$opts{before}{idx}}) { if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } elsif ( substr($fieldname,0,5) eq 'INFO/' ) { $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } # Fill the repeaty stuff (the sample columns) if ( $$opts{repeat} ) { my @repeats; for my $sample_idx (@sample_idxs) { push @repeats, [ copy_array($$opts{repeat}{format}) ]; } my @alt; if ( exists($$opts{repeat}{idx}{GT}) ) { @alt = split(/,/,$$x[4]); } while (my ($fieldname,$idx) = each %{$$opts{repeat}{idx}}) { 
if ( $fieldname eq '*' ) { my $sep1 = $$opts{repeat}{join1}; my $sep2 = $$opts{repeat}{join2}; my @fmt = split(/:/,$$x[8]); for (my $i=0; $i<@sample_idxs; $i++) { my $sample_idx = $sample_idxs[$i]; my @tmp; my $j = 0; for my $value (split(/:/,$$x[$sample_idx])) { push @tmp, $fmt[$j++].$sep1.$value; } $repeats[$i][$idx] = join($sep2,@tmp); } next; } my $fmt_idx = $vcf->get_tag_index($$x[8],$fieldname eq 'GTR' ? 'GT' : $fieldname,':'); for (my $i=0; $i<@sample_idxs; $i++) { my $sample_idx = $sample_idxs[$i]; if ( $fmt_idx!=-1 ) { my $value = $vcf->get_field($$x[$sample_idx],$fmt_idx); if ( $fieldname eq 'GT' ) { $value = $vcf->decode_genotype($$x[3],\@alt,$value); } $repeats[$i][$idx] = $value; } } } if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { my $idx = $$opts{repeat}{idx}{SAMPLE}; for (my $i=0; $i<@cols; $i++) { $repeats[$i][$idx] = $cols[$i] } } for my $repeat (@repeats) { for (my $i=0; $i<@$repeat; $i++) { if (!defined($$repeat[$i])) { $$repeat[$i]='.'; } } print join('',@$repeat); } } # Fill everything what comes after the repeat ([]) if ( $$opts{after} ) { my (@out) = copy_array($$opts{after}{format}); while (my ($fieldname,$idx) = each %{$$opts{after}{idx}}) { if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } elsif ( substr($fieldname,0,5) eq 'INFO/' ) { $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } } sub read_data_slow_hash { my ($opts) = @_; my %args = ( print_header=>1 ); if ( $$opts{region} ) { $args{region} = $$opts{region}; } if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } else { $args{fh} = \*STDIN; } my $vcf = Vcf->new(%args); $$opts{vcf} = $vcf; $vcf->parse_header(); if ( $$opts{list_columns} ) { list_columns($opts); exit; } my @cols = split(/,/,$$opts{columns}); if ( !@cols ) { @cols = 
@{get_columns($$opts{vcf})}; } # The hash opts will be filled with the keys 'before','repeat','after' with formatting information parse_format($opts); while (my $line=$vcf->next_line()) { my $x=$vcf->next_data_hash($line); # Fill everything what comes before the repeat [] # Code repetition and not very nice, should be changed at some point... if ( $$opts{before} ) { my (@out) = copy_array($$opts{before}{format}); while (my ($colname,$idx) = each %{$$opts{before}{idx}}) { if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } if ( $colname=~m{INFO/(.+)} ) { if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) { # It is a flag $out[$idx] = 'True'; } else { $out[$idx] = $$x{INFO}{$1}; } next; } if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } # Fill the repeaty stuff (the sample columns) if ( $$opts{repeat} ) { for my $col (@cols) { my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); my (@out) = copy_array($$opts{repeat}{format}); while (my ($colname,$idx) = each %{$$opts{repeat}{idx}}) { if ( exists($$x{gtypes}{$col}{$colname}) ) { $out[$idx] = $$x{gtypes}{$col}{$colname}; } elsif ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { $out[$$opts{repeat}{idx}{SAMPLE}] = $col; } if ( exists($$opts{repeat}{idx}{GTR}) ) { $out[$$opts{repeat}{idx}{GTR}] = $$x{gtypes}{$col}{GT}; } if ( exists($$opts{repeat}{idx}{GT}) ) { my $tmp = $$alleles[0]; for (my $i=0; $i<@$seps; $i++) { $tmp .= $$seps[$i].$$alleles[$i+1]; } $out[$$opts{repeat}{idx}{GT}] = $tmp; } if ( exists($$opts{repeat}{idx}{'*'}) ) { my $sep1 = $$opts{repeat}{join1}; my $sep2 = $$opts{repeat}{join2}; my @tmp; while (my ($key,$value)=each(%{$$x{gtypes}{$col}})) { if ( 
$key eq 'GT' ) { $value = $$alleles[0]; for (my $i=0; $i<@$seps; $i++) { $value .= $$seps[$i].$$alleles[$i+1]; } } push @tmp, $key.$sep1.$value; } my $idx = $$opts{repeat}{idx}{'*'}; $out[$idx] = join($sep2,@tmp); } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } # Fill everything what comes after the repeat ([]) if ( $$opts{after} ) { my (@out) = copy_array($$opts{after}{format}); while (my ($colname,$idx) = each %{$$opts{after}{idx}}) { if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } if ( $colname=~m{INFO/(.+)} ) { if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) { # It is a flag $out[$idx] = 'True'; } else { $out[$idx] = $$x{INFO}{$1}; } next; } if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } } for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } print join('',@out); } } } vcftools_0.1.11/perl/test.t0000755000000000000000000004360112156354770014340 0ustar rootroot#!/usr/bin/env perl # # Author: petr.danecek@sanger # # Usage: test.t [-d] # use strict; use warnings; use Carp; use IPC::Open2; use FindBin; use lib "$FindBin::Bin"; use Vcf; BEGIN { use Test::Most tests => 75; } my $path = $FindBin::RealBin; my $debug = ($ARGV[0] && $ARGV[0] eq '-d') ? 
1 : 0; test_bgzip_and_tabix("$path/../examples/merge-test-a.vcf"); test_validator($path,"$path/../examples/valid-3.3.vcf"); test_validator($path,"$path/../examples/valid-4.0.vcf"); test_validator($path,"$path/../examples/valid-4.1.vcf"); test_validator($path,"$path/../examples/floats.vcf"); test_format_validation($path,'3.3'); test_format_validation($path,'4.0'); test_format_validation($path,'4.1'); test_parse($path); test_vcf_stats($path,"$path/../examples/valid-4.0.vcf"); test_empty_cols($path,'4.0'); test_merge($path,'merge-test.vcf.out','merge-test-a.vcf','merge-test-b.vcf','merge-test-c.vcf'); test_compare($path,'cmp-test-a.vcf','cmp-test-b.vcf','cmp-test.out'); test_isec($path,'-n +2','isec-n2-test.vcf.out','merge-test-a.vcf','merge-test-b.vcf','merge-test-c.vcf'); test_query_vcf("$path/../examples/",'cmp-test-a.vcf','query-test.out','%CHROM:%POS\tref=%REF\talt=%ALT\tqual=%QUAL\t%INFO/DP[\t%SAMPLE=%GT]\n'); test_shuffle("$path/../examples/",'cmp-test-a.vcf','shuffle-test.vcf'); test_concat("$path/../examples/",'concat.out','concat-a.vcf','concat-b.vcf','concat-c.vcf'); test_annotate("$path/../examples/",'-c FROM,TO,CHROM,-,-,-,INFO/HM2,INFO/GN,INFO/DP -d key=INFO,ID=HM2,Number=0,Type=Flag,Description="HapMap2 membership" -d key=INFO,ID=GN,Number=1,Type=String,Description="Gene Name" -d key=INFO,ID=DP,Number=0,Type=Integer,Description="Depth,etc"','annotate.out','concat-a.vcf','annotate.txt'); test_annotate("$path/../examples/",'-c FROM,TO,CHROM,ID,REF,ALT,INFO/HM2,INFO/GN,INFO/DP -d key=INFO,ID=HM2,Number=0,Type=Flag,Description="HapMap2 membership" -d key=INFO,ID=GN,Number=1,Type=String,Description="Gene Name" -d key=INFO,ID=DP,Number=0,Type=Integer,Description="Depth,etc"','annotate3.out','concat-a.vcf','annotate.txt'); test_annotate("$path/../examples/",'-f +/D=34/c=2,3','annotate2.out','annotate-test.vcf'); test_fill_an_ac("$path/../examples/",'fill-an-ac.out','concat-a.vcf'); 
test_indel_stats("$path/../examples/",'indel-stats.out','indel-stats.vcf','indel-stats.tab'); test_consensus("$path/../examples/",'','consensus.out','consensus.vcf','consensus.fa'); test_consensus("$path/../examples/",'-s NA001','consensus.out2','consensus.vcf','consensus.fa'); test_contrast("$path/../examples/",'-n +D -A,B,C -d 10','contrast.out','contrast.vcf'); test_ploidy("$path/../examples/",'fix-ploidy'); test_api_event_type([qw(A C),'s 1 C'],[qw(A ACGT),'i 3 CGT'],[qw(ACGT A),'i -3 CGT'],[qw(ACGT ACT),'i -1 G'], [qw(ACGT AAA),'o 3 AAA'],[qw(A .),'r 0 A'],[qw(A ),'u 0 '],[qw(ACG AGC),'s 2 AGC'], [qw(A .A),'b'], [qw(A A.),'b']); test_api(); exit; #-------------------------------------- sub test_bgzip_and_tabix { my ($file) = @_; my $cmd; $cmd = "cat $file | bgzip -c > $file.gz"; system($cmd); is($?,0,"Is bgzip OK? .. $cmd"); $cmd = "tabix $file.gz"; system($cmd); is($?,0,"Is tabix OK? .. $cmd"); } sub test_validator { my ($path,$fname) = @_; my $cmd = "perl -I$path -MVcf -e validate $fname"; my @out = `$cmd 2>&1`; my @exp = (); is_deeply(\@out,\@exp,"Testing validator .. 
$cmd"); } sub test_format_validation { my ($path,$version) = @_; my ($chld_in,$chld_out); my $cmd = "perl -I$path -MVcf -e validate 2>&1"; my $pid = open2($chld_out, $chld_in, $cmd); my $vcf = Vcf->new(version=>$version); $vcf->recalc_ac_an(2); $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'}); $vcf->add_header_line({key=>'INFO', ID=>'AN',Number=>1,Type=>'Integer',Description=>'Total number of alleles in called genotypes'}); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); if ( $version >= 4.0 ) { $vcf->add_header_line({key=>'ALT',ID=>'DEL:ME:ALU', Description=>'Deletion of ALU element'}); } if ( $version >= 4.1 ) { $vcf->add_header_line({key=>'reference',value=>'file:/some/file.fa'}); $vcf->add_header_line({key=>'contig',ID=>'1',length=>12345,md5=>'f126cdf8a6e0c7f379d618ff66beb2da',assembly=>'E.T.'}); } $vcf->add_columns('NA0001','NA0002'); print $vcf->format_header() unless !$debug; print $chld_in $vcf->format_header(); my %rec = ( CHROM=>1, POS=>1, REF=>'A', QUAL=>$$vcf{defaults}{QUAL}, FORMAT=>['GT'] ); $rec{gtypes}{NA0001}{GT} = 'A/A'; $rec{gtypes}{NA0002}{GT} = $$vcf{defaults}{GT}; $vcf->format_genotype_strings(\%rec); print $vcf->format_line(\%rec) unless !$debug; print $chld_in $vcf->format_line(\%rec); $rec{POS} = 2; $rec{gtypes}{NA0002}{GT} = 'IA|D1'; if ( $version >= 4.0 ) { $rec{REF} = 'AC'; $rec{gtypes}{NA0002}{GT} = 'ATC|'; } $vcf->format_genotype_strings(\%rec); print $vcf->format_line(\%rec) unless !$debug; print $chld_in $vcf->format_line(\%rec); close($chld_in); my @exp = (); my @out = (); while (my $line=<$chld_out>) { chomp($line); push @out,$line; } close($chld_out); waitpid $pid, 0; if ( !is_deeply(\@out,\@exp,"Testing formatting followed by validation .. 
$cmd") ) { print STDERR @out; } } sub test_parse { my ($path) = @_; my $vcf = Vcf->new(file=>"$path/../examples/parse-test.vcf"); $vcf->parse_header; my $line; $line = $vcf->next_data_array; is_deeply($$line[4],"G","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],"G,,T,","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],",G,,T","Testing next_data_array"); $line = $vcf->next_data_array; is_deeply($$line[4],",G,,T,","Testing next_data_array"); } sub test_vcf_stats { my ($path,$file) = @_; my $cmd = "perl -I$path -MVcf $path/vcf-stats $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$file.stats") or confess("$file.stats: $!"); my @exp = <$fh>; close($fh); is_deeply(\@out,\@exp,"Testing vcf-stats .. $cmd"); } sub test_empty_cols { my ($path,$version) = @_; my ($header,$vcf,@out,$exp); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns(qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); is_deeply($out[-1],'#'.$exp,"Testing add_columns with genotypes full, $version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns('NA0001'); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA0001)); is_deeply($out[-1],'#'.$exp,"Testing add_columns with genotypes brief, $version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns(); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO)); is_deeply($out[-1],'#'.$exp,"Testing add_columns brief, 
$version."); $vcf = Vcf->new(version=>$version); $vcf->add_header_line({key=>'FORMAT', ID=>'GT',Number=>1,Type=>'String',Description=>'Genotype'}); $vcf->add_columns('FORMAT'); $header = $vcf->format_header(); @out = split(/\n/,$header); $exp = join("\t",qw(CHROM POS ID REF ALT QUAL FILTER INFO FORMAT)); is_deeply($out[-1],'#'.$exp,"Testing add_columns no gtypes, $version."); } sub test_compare { my ($path,$a,$b,$expected) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); for my $file ($a,$b) { `cat $file | bgzip -c > $file.gz`; `tabix -p vcf -f $file.gz`; } my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-compare -g $a.gz $b.gz | grep -v '^# The command'"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$expected") or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-compare .. $cmd"); } sub test_merge { my ($path,$expected,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-merge"; for my $file (@files) { `cat $file | bgzip -c > $file.gz; tabix -f -p vcf $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>&1 | grep -v ^##source`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-merge .. $cmd"); } sub test_isec { my ($path,$opts,$expected,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-isec -f $opts"; for my $file (@files) { `cat $file | bgzip -c > $file.gz; tabix -f -p vcf $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>&1 | grep -v ^##source`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-isec .. 
$cmd"); } sub test_query_vcf { my ($path,$file,$expected,$query) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-query -f '$query' $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',$expected) or confess("$expected: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-query .. $cmd"); } sub test_shuffle { my ($path,$template,$file) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-shuffle-cols -t $template $file"; my @out = `$cmd 2>&1`; open(my $fh,'<',$template) or confess("$template: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-shuffle-cols .. $cmd"); } sub test_concat { my ($path,$out,@files) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-concat -s 3"; for my $file (@files) { `cat $file | bgzip -c > $file.gz`; `tabix -p vcf -f $file.gz`; $cmd .= " $file.gz"; } my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-concat .. $cmd"); } sub test_annotate { my ($path,$args,$out,$vcf,$annot) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-annotate $args $vcf"; if ( defined $annot ) { `cat $annot | bgzip -c > $annot.gz`; `tabix -s 3 -b 1 -e 2 -f $annot.gz`; $cmd .= " -a $annot.gz"; } my @out = `$cmd 2>&1 | grep -v ^##source`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-annotate .. 
$cmd"); } sub test_fill_an_ac { my ($path,$out,$vcf) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/fill-an-ac $vcf"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing fill-an-ac .. $cmd"); } sub test_indel_stats { my ($path,$out,$vcf,$tab) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-indel-stats -e $tab < $vcf"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing fill-an-ac .. $cmd"); } sub test_consensus { my ($path,$args,$out,$vcf,$fa) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); `cat $vcf | bgzip -c > $vcf.gz`; `tabix -p vcf -f $vcf.gz`; my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-consensus $args $vcf.gz < $fa"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-consensus .. $cmd"); } sub test_contrast { my ($path,$args,$out,$vcf) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "perl -I../perl/ -MVcf ../perl/vcf-contrast $args $vcf | grep -v ^##source"; my @out = `$cmd 2>&1`; open(my $fh,'<',$out) or confess("$out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-contrast .. $cmd"); } sub test_ploidy { my ($path,$prefix) = @_; my $curdir = `pwd`; chomp($curdir); chdir("$path/../examples"); my $cmd = "cat $prefix.vcf | perl -I../perl/ -MVcf ../perl/vcf-fix-ploidy -s $prefix.samples -p $prefix.txt 2>/dev/null | vcf-query -f '\%POS[\\t\%SAMPLE \%GTR \%PL]\\n'"; my @out = `$cmd 2>&1`; open(my $fh,'<',"$prefix.out") or confess("$prefix.out: $!"); my @exp = <$fh>; close($fh); chdir($curdir); is_deeply(\@out,\@exp,"Testing vcf-fix-ploidy .. 
$cmd"); } sub test_api_event_type { my (@subs) = @_; my $vcf = Vcf->new(); for my $mut (@subs) { my $exp = join(' ', $vcf->event_type($$mut[0],$$mut[1])); is_deeply($$mut[2],$exp,"Testing API event_type($$mut[0],$$mut[1]) .. $exp"); } } sub test_api { my $vcf = Vcf->new(); my $ret; my $fmt = 'GT:GL:PL'; $ret = $vcf->get_tag_index($fmt,'GT',':'); is($ret,0,"Testing get_tag_index($fmt,'GT',':')"); $ret = $vcf->get_tag_index($fmt,'GL',':'); is($ret,1,"Testing get_tag_index($fmt,'GL',':')"); $ret = $vcf->get_tag_index($fmt,'PL',':'); is($ret,2,"Testing get_tag_index($fmt,'PL',':')"); $ret = $vcf->remove_field($fmt,0,':'); is($ret,'GL:PL',"Testing get_tag_index($fmt,0,':')"); $ret = $vcf->remove_field($fmt,1,':'); is($ret,'GT:PL',"Testing get_tag_index($fmt,1,':')"); $ret = $vcf->remove_field($fmt,2,':'); is($ret,'GT:GL',"Testing get_tag_index($fmt,2,':')"); $ret = $vcf->replace_field($fmt,'XX',0,':'); is($ret,'XX:GL:PL',"Testing get_tag_index($fmt,'XX',0,':')"); $ret = $vcf->replace_field($fmt,'XX',1,':'); is($ret,'GT:XX:PL',"Testing get_tag_index($fmt,'XX',1,':')"); $ret = $vcf->replace_field($fmt,'XX',2,':'); is($ret,'GT:GL:XX',"Testing get_tag_index($fmt,'XX',2,':')"); $ret = $vcf->replace_field($fmt,'XX',4,':'); is($ret,'GT:GL:PL::XX',"Testing get_tag_index($fmt,'XX',4,':')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'0/1/2|1/0|1|2'); is($ret,'C/G/T|G/C|G|T',"Testing decode_genotype('C',['G','T'],'0/1/2|1/0|1|2')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'2|1'); is($ret,'T|G',"Testing decode_genotype('C',['G','T'],'2|1')"); $ret = $vcf->decode_genotype('C',[qw(G T)],'2'); is($ret,'T',"Testing decode_genotype('C',['G','T'],'2')"); my $info = 'NS=2;HM;AF=0.333;AFA=T;DB'; $ret = $vcf->get_info_field($info,'NS'); is($ret,'2',"Testing get_info_field($info,'NS')"); $ret = $vcf->get_info_field($info,'AF'); is($ret,'0.333',"Testing get_info_field($info,'AF')"); $ret = $vcf->get_info_field($info,'AFA'); is($ret,'T',"Testing get_info_field($info,'AFA')"); $ret = 
$vcf->get_info_field($info,'HM'); is($ret,'1',"Testing get_info_field($info,'HM')"); $ret = $vcf->get_info_field($info,'DB'); is($ret,'1',"Testing get_info_field($info,'DB')"); $ret = $vcf->get_info_field($info,'DBX'); is($ret,undef,"Testing get_info_field($info,'DBX')"); $ret = $vcf->get_info_field('DB','DB'); is($ret,'1',"Testing get_info_field('DB','DB')"); $ret = $vcf->get_info_field('XDB','DB'); is($ret,undef,"Testing get_info_field('XDB','DB')"); my @ret; @ret = $vcf->split_gt('0/1'); is_deeply(\@ret,[0,1],"Testing split_gt('0/1')"); @ret = $vcf->split_gt('0'); is_deeply(\@ret,[0],"Testing split_gt('0')"); my @als; @als = ("TTGGTAT","TTGGTATCTAGTGGTAT,TGGTATCTAGTGGTAT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TTGGTATCTAG","TGGTATCTAG"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("TT","TCTAGTGGTAAT,TCT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TCTAGTGGTAA","TC"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("TGGGGGG","TGGGGGGG"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["T","TG"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("CAAAAAA","CAAAAA"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["CA","C"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("CA","CT"); @ret = $vcf->normalize_alleles(@als); is_deeply(\@ret,["CA","CT"],"Testing normalize_alleles(".join(',',@als).")"); @als = ("GAACCCACA","GA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[0,"GAACCCAC","G"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("CAGTAAAA","CAGAAAA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[2,"GT","G"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("CAGTAAA","CAGAAAA"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[3,"T","A"],"Testing normalize_alleles_pos(".join(',',@als).")"); @als = ("GA","GACC"); @ret = $vcf->normalize_alleles_pos(@als); is_deeply(\@ret,[1,"A","ACC"],"Testing 
normalize_alleles_pos(".join(',',@als).")"); } vcftools_0.1.11/perl/vcf-convert0000755000000000000000000001352212156354770015352 0ustar rootroot#!/usr/bin/env perl use strict; use warnings; use Carp; use Vcf; use FindBin; use lib "$FindBin::Bin"; use FaSlice; my $opts = parse_params(); convert_file($opts); exit; #-------------------------------- sub error { my (@msg) = @_; if ( scalar @msg ) { croak @msg; } die "About: Convert between VCF versions.\n", "Usage: cat in.vcf | vcf-convert [OPTIONS] > out.vcf\n", "Options:\n", " -r, --refseq The reference sequence in samtools faindexed fasta file. (Not required with SNPs only.)\n", " -v, --version 4.0, 4.1\n", " -h, -?, --help This help message.\n", "\n"; } sub parse_params { my $opts = { version=>'4.1' }; while (my $arg=shift(@ARGV)) { if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } if ( $arg eq '-v' || $arg eq '--version' ) { $$opts{version}=shift(@ARGV); next; } if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } error("Unknown parameter \"$arg\". Run -h for help.\n"); } return $opts; } sub convert_file { my ($opts) = @_; # How to recognise a number my $FLOAT_RE = qr/^\-?\d+\.?\d*(?:[eE][+-]\d+)?$/; my $vcf_in = Vcf->new(fh=>\*STDIN); my $vcf_out = Vcf->new(version=>$$opts{version}); if ( $$opts{version} < $$vcf_in{version} ) { warn("Downgrading of VCF versions is experimental: expect troubles!\n"); } # Convert the header $vcf_in->parse_header(); for (my $i=1; $i<@{$$vcf_in{header_lines}}; $i++) { $vcf_out->add_header_line($$vcf_in{header_lines}[$i]); } $vcf_out->add_columns(@{$$vcf_in{columns}}); print $vcf_out->format_header(); # Convert each data line my $fa; while (my $x=$vcf_in->next_data_hash()) { # Convert missing (default) FORMAT values for my $gt (values %{$$x{gtypes}}) { for my $field (@{$$x{FORMAT}}) { # Skip the GT tag, so that ploidy information is not lost ("./." 
would become ".") if ( $field eq 'GT' ) { next; } if ( $field eq 'FT' && $$gt{$field} eq $$vcf_in{filter_passed} ) { $$gt{$field}=$$vcf_out{filter_passed}; } if ( exists($$vcf_in{defaults}{$field}) && $$vcf_in{defaults}{$field} eq $$gt{$field} ) { $$gt{$field} = $$vcf_out{defaults}{$field}; next; } if ( exists($$vcf_in{header}{FORMAT}{$field}{default}) && $$vcf_in{header}{FORMAT}{$field}{default} eq $$gt{$field} ) { delete($$gt{$field}); next; } } } # Change missing QUAL: In case they are numbers, do numeric comparison, as -1.0 is sometimes used instead of -1 if ( $$x{QUAL} eq $$vcf_in{defaults}{QUAL} or ($$x{QUAL}=~$FLOAT_RE && $$vcf_in{defaults}{QUAL}=~$FLOAT_RE && $$x{QUAL}==$$vcf_in{defaults}{QUAL}) ) { $$x{QUAL} = $$vcf_out{defaults}{QUAL}; } for (my $i=0; $i<@{$$x{FILTER}}; $i++) { if ( $$x{FILTER}[$i] eq $$vcf_in{filter_passed} ) { $$x{FILTER}[$i] = $$vcf_out{filter_passed}; } } # Parse the ALT column and see if there are indels my $has_indel = 0; for my $alt (@{$$x{ALT}}) { my ($type,$len,$ht) = $vcf_in->event_type($x,$alt); if ( $type eq 's' or $type eq 'r' ) { next; } if ( $type ne 'i' ) { error("FIXME: expected indel at $$x{CHROM}:$$x{POS}\n"); } $has_indel = 1; } # If there is an indel, new REF and ALT must be changed if ( $has_indel ) { my $map = {}; my $alt_to_mapref = {}; for my $alt (@{$$x{ALT}}) { my ($type,$len,$ht) = $vcf_in->event_type($x,$alt); if ( $type eq 's' or $type eq 'r' ) { $$alt_to_mapref{$alt} = { ref=>$$x{REF}, alt=>$alt }; $$map{$$x{REF}}{$alt} = 1; next; } if ( $type eq 'i' && $len>0 ) { my $tmp = $$x{REF}.$ht; $$alt_to_mapref{$alt} = { ref=>$$x{REF}, alt=>$tmp }; $$map{$$x{REF}}{$tmp} = 1; next; } elsif ( $type eq 'i' && $len<0 ) { if ( !$fa ) { if ( !$$opts{refseq} ) { error("Indels present, missing the -r option.\n"); } $fa = FaSlice->new(file=>$$opts{refseq},size=>1_000_000); } my $ref = $fa->get_slice($$x{CHROM},$$x{POS},$$x{POS}+abs($len)); my $first = substr($ref,0,1); # Sanity check if ( $$x{REF} ne $first ) { error("Sanity 
check failed: the ref does not agree at $$x{CHROM}:$$x{POS} .. [$$x{REF}] in .fa, [$first] in .vcf\n"); } $$alt_to_mapref{$alt} = { ref=>$ref, alt=>$$x{REF} }; $$map{$ref}{$$x{REF}} = 1; next; } else { error("Uh, FIXME: $$x{CHROM}:$$x{POS} [$type] [$len] [$ht]\n"); } } $$x{REF} = $vcf_out->fill_ref_alt_mapping($map); for (my $i=0; $i<@{$$x{ALT}}; $i++) { my $ori_ref = $$alt_to_mapref{$$x{ALT}[$i]}{ref}; my $ori_alt = $$alt_to_mapref{$$x{ALT}[$i]}{alt}; $$x{ALT}[$i] = $$map{$ori_ref}{$ori_alt}; } } print $vcf_out->format_line($x); } } vcftools_0.1.11/README.txt0000644000000000000000000000240612156354770013723 0ustar rootrootLicense: The program package is released under the GNU Lesser General Public License version 3.0 (LGPLv3). Credits: Adam Auton (cpp executable) Petr Danecek (perl) Compiling: To compile and install VCFtools, you should type 'make' in the vcftools folder. The perl scripts and cpp executable will be installed in the /vcftools_(version_num)/bin/ folder. It is recommended that you add this folder to you PATH. Documentation: The latest version of the documentation and examples of usage can be found in the website subdirectory or go online: http://vcftools.sourceforge.net/docs.html Getting Help: The best way to get help regarding VCFtools is to email the mailing list: vcftools-help@lists.sourceforge.net Citation: If you make use of VCFtools in your research, we would appreciate a citation of the following paper: The Variant Call Format and VCFtools, Petr Danecek, Adam Auton, Goncalo Abecasis, Cornelis A. Albers, Eric Banks, Mark A. DePristo, Robert Handsaker, Gerton Lunter, Gabor Marth, Stephen T. 
Sherry, Gilean McVean, Richard Durbin and 1000 Genomes Project Analysis Group, Bioinformatics, 2011 http://dx.doi.org/10.1093/bioinformatics/btr330 vcftools_0.1.11/Makefile0000644000000000000000000000150012156354770013657 0ustar rootroot# # 1) Compiling # Type make in this directory # # 2) Installation # Edit the BINDIR and MODDIR below as necessary or pass the PREFIX variable # to the make command. When not set, the programs will be placed in "bin" # and "lib" subdirectories in this directory. # PREFIX="/install/to/path/prefix" make install # # Add the MODDIR to your PERL5LIB environment variable: # export PERL5LIB=${PREFIX}/lib:${PERL5LIB} # ifndef PREFIX export PREFIX = $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) endif export BINDIR = ${PREFIX}/bin export MODDIR = ${PREFIX}/lib/perl5/site_perl DIRS = cpp perl install: @mkdir -p $(BINDIR); mkdir -p $(MODDIR); \ for dir in $(DIRS); do cd $$dir && $(MAKE) $(MAKEFLAGS) && cd ..; done clean: @for dir in $(DIRS); do cd $$dir && $(MAKE) clean && cd ..; done vcftools_0.1.11/examples/0000755000000000000000000000000012163074506014033 5ustar rootrootvcftools_0.1.11/examples/cmp-test-a.vcf0000644000000000000000000000117612156354771016521 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 ./.:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 1/1:41:1 0/2:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . 
DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools_0.1.11/examples/consensus.out0000644000000000000000000000201512156354771016611 0ustar rootroot>1:1-500 ATAC*ATAT*TG*T***ATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTT G*AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCAT TAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAA TATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCT CTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAA CTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAA GGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCT GATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAAT CTTTAAAAACAAAAAAAAAGAA >2:1-500 GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG A*ATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAAAT CAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGTGT CGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAGTT ATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGATT TGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGT TACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGAC TCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT*C AGACACAGTTAATCCAGAC vcftools_0.1.11/examples/annotate3.out0000644000000000000000000000256212156354771016474 0ustar rootroot##fileformat=VCFv4.0 ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 id1_100 GTTT G 1806 q10 DP=5;GN=gene1;HM2 GT:GQ:DP 0/1:409:35 1 110 id2_110 C T,G 1792 PASS DP=6 GT:GQ:DP 0/1:245:32 1 110 id1_110 CAAA C 1792 PASS DP=6 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 id1_130 G T 1016 PASS DP=7;HM2 GT:GQ:DP 0/1:212:22 1 130 id2_130 GAA GG 1016 PASS DP=7;HM2 GT:GQ:DP 0/1:212:22 1 140 id1_140 GT G 727 PASS DP=8 GT:GQ:DP 0/1:150:30 1 150 id1_150 TAAAA TA,T 246 PASS DP=9 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . 
GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 id2_110_150 CAAA C 1792 PASS GN=gene2;HM2 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 id2_160 TAAAA TA,TC,T 246 PASS DP=11;GN=gene3 GT:GQ:DP 0/2:12:10 vcftools_0.1.11/examples/indel-stats.out0000644000000000000000000000006012156354771017016 0ustar rootroottotal 20 in-frame 9 frameshift 5 ratio 0.357143 vcftools_0.1.11/examples/concat-c.vcf0000644000000000000000000000171412156354771016234 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 2 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools_0.1.11/examples/subset.indels.out0000644000000000000000000000164312156354771017361 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 1/0:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . 
GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools_0.1.11/examples/invalid-4.0.vcf0000644000000000000000000000404612156354771016473 0ustar rootroot##fileformat=VCFv4.0 ##problem1=The first base of the second ALT allele at 20:1234567 does not match the reference. ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=1000GenomesPilot-NCBI36 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:-1:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 20 1234567 microsat1 G GA,AAC 50 0 NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:-1:4 0/2:17:2 1/1:40:3 20 1235237 . T . -1 . . GT 0\0 0|0 ./. X 10 rsTest AC A,ATG 10 . . GT 0 0/1 0|2 X 11 rsTest2 T A,G 10 q10;s50 . 
GT:DP:GQ 0:3:10 .:5:20 0:3:10 vcftools_0.1.11/examples/fix-ploidy.vcf0000644000000000000000000002145512156354771016635 0ustar rootroot##fileformat=VCFv4.1 ##samtoolsVersion=0.1.16-dev (r969:252) ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##source_20111007.1=/software/vertres/codebase/scripts/vcf-annotate -f +/D=1200 ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##source_20111007.2=/software/vertres/codebase/scripts/vcf-annotate -f +/D=1200 ##source_20120109.1=vcf-subset(r660) -c QTL190284,QTL190301,QTL190321,QTL190576,QTL190627,QTL190628 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT M1 M2 F3 F4 M5 M6 20 61098 . C A,T 999 PASS AC1=41;AF1=0.2104;DP4=209,284,67,76;DP=658;FQ=999;MQ=45;PV4=0.39,4.4e-10,0.0034,0.2 GT:PL:DP:SP:GQ 0/1:0,9,72,5,6,7:3:212:12 0/0:0,15,140,5,6,7:5:458752:18 1:147,0,5:7:384:24 0:0,131,5:5:208:18 0/0:0,9,83,5,6,7:3:392:12 0/0:0,6,56,5,6,7:2:204:9 20 61270 . A T 93 PASS AC1=5;AF1=0.02733;DP4=149,185,5,10;DP=398;FQ=93.1;MQ=43;PV4=0.44,3.5e-05,0.028,1 GT:PL:DP:SP:GQ 0/0:8,14,58:3:0:19 0/0:0,6,52:2:0:19 0/0:0,6,56:2:0:19 0/0:0,15,117:5:0:28 0/0:0,6,45:2:0:19 0/0:0,12,87:4:0:25 20 61275 . T G 14.5 PASS AC1=14;AF1=0.07104;DP4=76,141,1,24;DP=345;FQ=14.8;MQ=43;PV4=0.001,4e-16,2.4e-07,0.39 GT:PL:DP:SP:GQ 0/0:0,3,13:1:16908804:12 0/0:0,3,28:1:0:12 0/0:8,0,41:3:201985031:3 0/0:0,12,97:4:134480904:21 0/0:0,6,49:2:117901063:15 0/0:0,9,67:3:33686020:18 20 61282 . T C 120 PASS AC1=35;AF1=0.1794;DP4=50,118,1,30;DP=333;FQ=121;G3=0.6049,0.3951,2.481e-10;HWE=0.0152;MQ=44;PV4=0.0012,1.1e-08,5.7e-11,1 GT:PL:DP:SP:GQ 0/1:15,3,0:1:17302017:6 0/0:0,6,51:2:0:10 0/1:6,0,31:3:437393682:4 0/0:0,6,57:2:134283524:10 0/1:7,0,19:2:404167697:5 0/1:16,0,20:2:67633410:13 20 61795 . 
G A 999 PASS AC1=63;AF1=0.3239;DP4=193,313,105,127;DP=753;FQ=999;MQ=48;PV4=0.075,4e-23,2.6e-26,1 GT:PL:DP:SP:GQ 0/0:0,27,203:9:-1598497894:27 0/0:0,21,174:7:50331648:21 0/0:0,45,229:15:0:45 0/0:0,27,199:9:1056977028:27 0/0:0,24,182:8:0:24 0/0:0,9,85:3:1678500058:10 20 62731 . C A 999 PASS AC1=24;AF1=0.1207;DP4=326,391,40,53;DP=846;FQ=999;MQ=49;PV4=0.74,6.3e-14,5.2e-07,0.44 GT:PL:DP:SP:GQ 0/0:0,27,194:9:134349316:33 0/0:0,24,194:8:0:30 0/0:0,18,141:6:505290270:24 0/0:0,30,201:10:16908801:36 0/0:0,18,153:6:505290270:24 0/0:0,33,202:11:67175432:39 20 63008 . C A 122 PASS AC1=2;AF1=0.01078;DP4=303,374,3,7;DP=692;FQ=122;MQ=49;PV4=0.52,0.0011,0.093,1 GT:PL:DP:SP:GQ 0/0:0,42,255:14:67371265:59 0/0:0,15,128:5:0:32 0/0:0,15,136:5:505290270:32 0/0:0,39,251:13:67634177:56 0/0:0,15,111:5:505290270:32 0/0:0,27,200:9:33818632:44 20 63231 . T A 999 PASS AC1=5;AF1=0.02753;DP4=289,324,7,18;DP=652;FQ=999;MQ=49;PV4=0.067,8.8e-11,0.004,0.49 GT:PL:DP:SP:GQ 0/0:0,42,246:14:0:54 0/0:0,18,141:6:0:30 0/0:0,27,209:9:0:39 0/0:0,24,186:8:0:36 0/0:0,12,110:4:0:24 0/0:0,21,145:7:0:33 20 63244 . A C 999 PASS AC1=37;AF1=0.1905;DP4=273,269,66,56;DP=670;FQ=999;MQ=49;PV4=0.48,4.1e-21,2.5e-21,1 GT:PL:DP:SP:GQ 0/0:0,36,209:12:858855379:39 0/0:0,21,174:7:7:24 0/0:0,27,198:9:0:30 0/0:0,24,184:8:1055941246:27 0/0:0,15,132:5:0:18 0/0:0,21,159:7:203312940:24 20 63328 . A C 26.4 PASS AC1=1;AF1=0.006786;DP4=439,259,2,3;DP=711;FQ=26.4;MQ=49;PV4=0.37,0.00092,0.2,0.31 GT:PL:DP:SP:GQ 0/0:0,42,242:14:0:61 0/0:0,12,110:4:0:31 0/0:0,36,231:12:0:55 0/0:0,36,226:12:0:55 0/0:0,15,135:5:0:34 0/0:0,18,132:6:0:37 20 63452 . C A 86.5 PASS AC1=4;AF1=0.02128;DP4=399,301,5,6;DP=718;FQ=86.6;MQ=49;PV4=0.54,0.084,0.0093,0.49 GT:PL:DP:SP:GQ 0/0:0,27,200:9:-419321220:41 0/0:0,15,123:5:0:29 0/0:0,33,228:11:0:47 0/0:0,9,88:3:1060858040:23 0/0:0,24,171:8:0:38 0/0:0,15,134:5:1094576049:29 20 63799 . 
C T 999 PASS AC1=68;AF1=0.347;DP4=215,280,110,139;DP=796;FQ=999;MQ=49;PV4=0.88,7.1e-84,0.16,1 GT:PL:DP:SP:GQ 0/0:0,36,211:12:212:36 0/0:0,27,205:9:50331648:27 0/0:0,18,125:6:384:18 0/0:0,15,125:5:208:15 0/1:1,0,150:8:392:4 0/0:0,12,106:5:204:12 20 63967 . A G 999 PASS AC1=5;AF1=0.02598;DP4=384,427,9,10;DP=833;FQ=999;MQ=49;PV4=1,0.0053,1.9e-05,0.043 GT:PL:DP:SP:GQ 0/0:0,30,183:10:0:43 0/0:0,30,206:10:0:43 0/0:0,30,206:10:0:43 0/0:0,33,230:11:0:46 0/0:0,21,160:7:0:34 0/0:0,12,112:4:0:25 20 65288 . G C 999 PASS AC1=23;AF1=0.1172;DP4=217,304,31,52;DP=612;FQ=999;MQ=49;PV4=0.47,1.3e-40,0.001,1 GT:PL:DP:SP:GQ 0/0:0,18,155:6:212:24 0/0:0,12,113:4:0:18 0/0:0,18,144:6:384:24 0/0:0,21,155:7:208:27 0/0:0,21,176:7:392:27 0/0:0,6,63:2:204:12 20 65900 . G A 999 PASS AC1=156;AF1=0.7977;DP4=98,72,334,335;DP=857;FQ=999;MQ=46;PV4=0.086,1,2.3e-21,1 GT:PL:DP:SP:GQ 1/1:162,24,0:8:-1210247533:27 1/1:160,21,0:7:458754:24 1/1:219,30,0:10:0:33 1/1:213,30,0:10:1056438877:33 1/1:248,42,0:14:0:45 1/1:148,21,0:7:-1125025851:24 20 65951 . T A 142 PASS AC1=4;AF1=0.02096;DP4=349,437,7,6;DP=818;FQ=142;MQ=48;PV4=0.58,0.0014,0.18,1 GT:PL:DP:SP:GQ 0/0:0,18,125:6:1040080185:32 0/0:0,15,132:5:0:29 0/0:0,24,183:8:0:38 0/0:0,45,252:15:1063282696:59 0/0:0,30,217:10:0:44 0/0:0,15,101:5:1099561607:29 20 66370 . G A 999 PASS AC1=160;AF1=0.8125;DP4=76,68,292,297;DP=774;FQ=999;MQ=46;PV4=0.52,1,5.3e-34,1 GT:PL:DP:SP:GQ 1/1:255,57,0:19:0:60 1/1:193,24,0:8:131075:27 1/1:97,12,0:4:0:15 1/1:208,30,0:10:0:33 1/1:129,15,0:5:0:18 1/1:72,9,0:3:0:13 20 67184 . C A 110 PASS AC1=2;AF1=0.01039;DP4=366,477,2,10;DP=863;FQ=111;MQ=48;PV4=0.08,1,0.12,0.24 GT:PL:DP:SP:GQ 0/0:0,33,202:11:-2137326772:50 0/0:0,6,57:2:0:23 0/0:0,42,223:14:0:59 0/0:0,21,142:7:1069487242:38 0/0:0,30,181:10:0:47 0/0:0,33,201:11:1190753285:50 20 67760 . 
C A 16.9 PASS AC1=2;AF1=0.008593;DP4=412,444,1,5;DP=870;FQ=16.9;MQ=47;PV4=0.22,1,0.0035,1 GT:PL:DP:SP:GQ 0/0:0,42,224:14:-1076013622:60 0/0:0,9,77:3:0:27 0/0:0,45,243:15:0:63 0/0:0,21,147:7:1070943924:39 0/0:0,33,205:11:0:51 0/0:0,12,102:4:783554555:30 20 68303 . A G 65.3 PASS AC1=1;AF1=0.005274;DP4=480,351,4,5;DP=874;FQ=65.3;MQ=49;PV4=0.51,9.5e-09,0.34,1 GT:PL:DP:SP:GQ 0/0:0,33,205:11:-1062059871:53 0/0:0,33,236:11:0:53 0/0:0,33,214:11:0:53 0/0:0,27,197:9:1072683922:47 0/0:0,18,149:6:0:38 0/0:0,9,72:3:1109840439:29 20 68618 . G C 62.6 PASS AC1=2;AF1=0.01065;DP4=409,382,5,5;DP=833;FQ=62.6;MQ=49;PV4=1,1.9e-07,0.00018,0.4 GT:PL:DP:SP:GQ 0/0:0,24,176:9:1561273746:41 0/0:0,30,214:10:0:47 0/0:0,21,159:7:0:38 0/0:0,24,191:8:1056871757:41 0/0:0,15,133:5:0:32 0/0:0,18,133:6:253656329:35 vcftools_0.1.11/examples/shuffle-test.vcf0000644000000000000000000000117612156354771017160 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A 1 100100 . G C 0 . DP=1 GT:GQ:DP 0/1:40:1 0|1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0/0:40:1 0|1:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP ./.:40:1 1/1:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 0/2:40:1 1/1:41:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 0/0:40:1 1/1:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 0/0:40:1 1/1:40:1 vcftools_0.1.11/examples/fix-ploidy.samples0000644000000000000000000000005212156354771017511 0ustar rootrootM1 M M2 M F3 F F4 F M5 M M6 M vcftools_0.1.11/examples/concat.out0000644000000000000000000000403212156354771016041 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 140 . 
GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 141 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 151 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 161 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 171 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 181 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 191 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 142 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 152 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 160 . TAAAA TA,TC,T 246 PASS DP=10 GT:GQ:DP 0/2:12:10 2 162 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 172 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 182 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 192 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools_0.1.11/examples/query-test.out0000644000000000000000000000037712156354771016724 0ustar rootroot1:100100 ref=G alt=C qual=0 1 A=G|C B=G/C 1:100200 ref=G alt=C qual=0 1 A=G|C B=G/G 1:100300 ref=G alt=C qual=0 1 A=C/C B=./. 
1:100400 ref=C alt=G,T qual=35 1 A=G/G B=C/T 1:100500 ref=A alt=G qual=0 1 A=G/G B=A/A 1:100600 ref=C alt=G qual=0 1 A=G/G B=C/C vcftools_0.1.11/examples/fix-ploidy.txt0000644000000000000000000000033712156354771016672 0ustar rootrootploidy => { 20 => [ { from=>1, to=>61275, M=>1, F=>2 }, { from=>61282, to=>63231, F=>1 }, { from=>63244, to=>63967, M=>1, F=>0 }, { from=>65288, to=>68303, M=>0, F=>1 }, ], } vcftools_0.1.11/examples/merge-test-a.vcf0000644000000000000000000000170512156354771017037 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 3062915 . GTTT G 1806 q10 DP=35;DP4=1,2,3,4 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 1 3106154 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 3157410 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 3162006 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 3177144 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 3184885 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 3199812 . G GTT,GT 481 PASS DP=26 GT:GQ:DP 1/2:322:26 3 3212016 . CTT C,CT 565 PASS DP=26 GT:GQ:DP 1/2:91:26 4 3258448 . TACACACAC T 325 PASS DP=31 GT:GQ:DP 0/1:325:31 vcftools_0.1.11/examples/subset.SNPs.out0000644000000000000000000000101212156354771016714 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . G C 1806 q10 DP=35 GT:GQ:DP 1/1:409:35 1 120 . T G 628 q10 DP=21 GT:GQ:DP 0|1|1:21:21 1 130 . GAA GTA 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 vcftools_0.1.11/examples/cmp-test.out0000644000000000000000000000511112156354771016325 0ustar rootroot# This file was generated by vcf-compare. # #VN 'Venn-Diagram Numbers'. Use `grep ^VN | cut -f 2-` to extract this part. #VN The columns are: #VN 1 .. number of sites unique to this particular combination of files #VN 2- .. 
combination of files and space-separated number, a fraction of sites in the file VN 6 cmp-test-a.vcf.gz (100.0%) cmp-test-b.vcf.gz (100.0%) #SN Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part. SN Number of REF matches: 6 SN Number of ALT matches: 5 SN Number of REF mismatches: 0 SN Number of ALT mismatches: 1 SN Number of samples in GT comparison: 2 #GS Genotype Comparison Summary. Use `grep ^GS | cut -f 2-` to extract this part. #GS The columns are: #GS 1 .. variant type #GS 2 .. number of mismatches #GS 3 .. number of matches #GS 4 .. discordance GS hom_RR 0 3 0.00% GS het_RA 1 3 25.00% GS hom_AA 0 4 0.00% GS het_AA 0 0 0.00% SN Non-reference Discordance Rate (NDR): 12.50 SN Summary: NDR 12.50, RR 0.00, RA 25.00, AA 0.00 #GC Genotype Comparison. Use `grep ^GC | cut -f 2-` to extract this part. #GC The columns are: #GC 1 .. Sample #GC 2-6 .. Gtype mismatches: total hom_RR hom_AA het_RA het_AA #GC 7-9 .. Gtype lost: total het_RA het_AA #GC 10-14 .. Gtype gained: total hom_RR hom_AA het_RA het_AA #GC 15-17 .. Phase lost: total het_RA het_AA #GC 18 .. Phase gained #GC 19-23 .. Matching sites: total hom_RR hom_AA het_RA het_AA #GC 24 .. Phased matches: het_RA #GC 25 .. Misphased matches: het_RA GC A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 4 2 0 2 1 GC B 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 4 3 0 1 0 0 0 #AF Number of matching and mismatching genotypes vs non-ref allele frequency. Use `^AF | cut -f 2-` to extract this part. #AF The columns are: #AF 1 .. Non-ref allele count #AF 2 .. Hom(RR) matches #AF 3 .. Het(RA) matches #AF 4 .. Hom(AA) matches #AF 5 .. Het(AA) matches #AF 6 .. Hom(RR) mismatches #AF 7 .. Het(RA) mismatches #AF 8 .. Hom(AA) mismatches #AF 9 .. Het(AA) mismatches AF 0.25 1 1 0 0 0 0 0 0 AF 0.50 2 2 2 0 0 0 0 0 AF 0.75 0 0 1 0 0 1 0 0 AF 1.00 0 0 1 0 0 0 0 0 #DP Counts by depth. Use `grep ^DP | cut -f 2-` to extract this part. #DP The columns are: #DP 1 .. depth #DP 2 .. RR matches #DP 3 .. RA matches #DP 4 .. AA matches #DP 5 .. 
RR -> RA mismatches #DP 6 .. RR -> AA mismatches #DP 7 .. RA -> RR mismatches #DP 8 .. RA -> AA mismatches #DP 9 .. AA -> RR mismatches #DP 10 .. AA -> RA mismatches DP 1 3 3 4 0 0 1 0 0 0 vcftools_0.1.11/examples/parse-test.vcf0000644000000000000000000000034612156354771016634 0ustar rootroot##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT g 1806 PASS DP=35 1 104 . C g,,t, 1792 PASS DP=32 1 104 . C ,g,,t 1792 PASS DP=32 1 104 . C ,g,,t, 1792 PASS DP=32 vcftools_0.1.11/examples/merge-test-b.vcf0000644000000000000000000000177412156354771017046 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B 1 3062915 . GTTT GT 376 q20 DP=14;DP4=1,2,3,4 GT:GQ:DP:GL 0/1:376:14:-10,0,-10 1 3106154 . CAAAA C 677 PASS DP=15 GT:GQ:DP:GL 0/1:277:15:-10,0,-10 1 3157410 . GA G 249 PASS DP=11 GT:GQ:DP 0/1:49:11 1 3162006 . GAA G 663 PASS DP=19 GT:GQ:DP 0/1:589:19 1 3177144 . GT G 460 PASS DP=24 GT:GQ:DP 0/1:236:24 1 3184885 . TAAA T 598 PASS DP=16 GT:GQ:DP 0/1:435:16 2 3188209 . GA G 162 . DP=15 GT:GQ:DP 0/1:162:15 3 3199812 . G GTT,GT 353 PASS DP=19 GT:GQ:DP 1/2:188:19 3 3199815 . G A 353 PASS DP=19 GT:GQ:DP 0/1:188:19 4 3212016 . CTT C 677 q20 DP=15 GT:GQ:DP 0/1:158:15 vcftools_0.1.11/examples/merge-test.vcf.out0000644000000000000000000000452012156354771017425 0ustar rootrootUsing column name 'A' for merge-test-a.vcf.gz:A Using column name 'B' for merge-test-b.vcf.gz:B Using column name 'C' for merge-test-c.vcf.gz:C ##fileformat=VCFv4.1 ##FILTER= ##FORMAT= ##FILTER= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C 1 3062915 . GTTT G,GT 856.67 q20;q10 AC=2,1;AN=6;DP4=3,6,9,12;DP=59;SF=0f,1f,2 GT:GL:DP:GQ 0/1:-20,-5,-20,.,.,.:35:409 0/2:-10,.,.,0,.,-10:14:376 0/1:.:10:149 1 3106154 . CAAAA CA,C 912.67 PASS AC=3,1;AN=6;DP=56;SF=0,1,2 GT:GL:DP:GQ 0/1:.:32:245 0/2:-10,.,.,0,.,-10:15:277 1/1:.:9:25 1 3157410 . 
GA G 363.00 q10 AC=4;AN=6;DP=42;SF=0f,1,2 GT:DP:GQ 1/1:21:21 0/1:11:49 0/1:10:52 1 3162006 . GAA G 745.67 PASS AC=3;AN=6;DP=58;SF=0,1,2 GT:DP:GQ 0/1:22:212 0/1:19:589 0/1:17:163 1 3177144 . GT G 466.00 PASS AC=3;AN=6;DP=68;SF=0,1,2 GT:DP:GQ 0/1:30:150 0/1:24:236 0/1:14:151 1 3184885 . TAAAA TA,T 422.00 PASS AC=2,1;AN=4;DP=26;SF=0,1 GT:DP:GQ 1/2:10:12 0/1:16:435 . 1 3199812 . G GT . . AC=2;AN=2;SF=2 GT . . 1/1 2 3188209 . GA G 162.00 . AC=1;AN=2;DP=15;SF=1 GT:DP:GQ . 0/1:15:162 . 2 3199812 . G GTT,GT 481.00 PASS AC=1,1;AN=2;DP=26;SF=0 GT:DP:GQ 1/2:26:322 . . 2 3212016 . CTT C 613.00 . AC=1;AN=2;DP=11;SF=2 GT:DP:GQ . . 0/1:11:41 3 3199812 . G GTT,GT 353.00 PASS AC=1,1;AN=2;DP=19;SF=1 GT:DP:GQ . 1/2:19:188 . 3 3199815 . G T,A 353.00 PASS AC=1,1;AN=4;DP=38;SF=1,2 GT:DP:GQ . 0/2:19:188 0/1:19:188 3 3212016 . CTT C,CT 565.00 PASS AC=1,1;AN=2;DP=26;SF=0 GT:DP:GQ 1/2:26:91 . . 3 3242491 . TT T . . AC=2;AN=2;SF=2 GT . . 1/1 4 3212016 . CTT C 677.00 q20 AC=1;AN=2;DP=15;SF=1f GT:DP:GQ . 0/1:15:158 . 4 3258448 . TACACACAC T 325.00 PASS AC=1;AN=2;DP=31;SF=0 GT:DP:GQ 0/1:31:325 . . 4 3291771 . T TAA,TAAA 336.00 . AC=1,1;AN=2;DP=12;SF=2 GT:DP:GQ . . 1/2:12:2 vcftools_0.1.11/examples/annotate.out0000644000000000000000000000254012156354771016405 0ustar rootroot##fileformat=VCFv4.0 ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=5;GN=gene1;HM2 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=6,6 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=6,6 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=7,7;HM2=, GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS DP=7,7;HM2=, GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS DP=8 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=9 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS GN=gene2;HM2 GT:GQ:DP 0/1:245:32 2 120 . 
GA G 628 q10 GN=gene2;HM2 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS GN=gene2;HM2 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS GN=gene2;HM2 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS GN=gene2;HM2 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,TC,T 246 PASS DP=11;GN=gene3 GT:GQ:DP 0/2:12:10 vcftools_0.1.11/examples/concat-b.vcf0000644000000000000000000000124212156354771016227 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 141 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 151 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 161 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 171 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 181 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 191 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 vcftools_0.1.11/examples/perl-api-1.pl0000755000000000000000000000263712156354771016261 0ustar rootroot#!/usr/bin/env perl # # Example code for generating a minimal VCF file using the perl API # # Author: pd3@sanger # use strict; use warnings; use Carp; use Vcf; my $sample = 'Sample1'; my $vcf_out = Vcf->new(); $vcf_out->add_columns($sample); $vcf_out->add_header_line({key=>'FORMAT',ID=>'GT',Number=>'1',Type=>'String',Description=>"Genotype"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL',Description=>"Deletion"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL:ME:ALU',Description=>"Deletion of ALU element"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DEL:ME:L1',Description=>"Deletion of L1 element"}); $vcf_out->add_header_line({key=>'ALT',ID=>'DUP',Description=>"Duplication"}); $vcf_out->add_header_line({key=>'INFO',ID=>'DP',Number=>1,Type=>'Integer',Description=>"Total Depth"}); $vcf_out->add_header_line({key=>'INFO',ID=>'H2',Number=>0,Type=>'Flag',Description=>"HapMap2 membership"}); print $vcf_out->format_header(); my $pos = 1; for my $gt qw(A/A C/C /C / / /) { $pos++; my %out; $out{CHROM} = '1'; $out{POS} = $pos; $out{ID} = '.'; $out{ALT} = []; $out{REF} = 'C'; $out{QUAL} 
= '.'; $out{FILTER} = ['.']; $out{INFO} = { DP=>3, H2=>undef }; $out{FORMAT} = ['GT']; $out{gtypes}{$sample}{GT} = $gt; $vcf_out->format_genotype_strings(\%out); print $vcf_out->format_line(\%out); } vcftools_0.1.11/examples/consensus.fa0000644000000000000000000000201612156354771016371 0ustar rootroot>1:1-500 ATACCATATGTGACTTATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTTG CAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG ATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAATC TTTAAAAACAAAAAAAAAGAA >2:1-500 GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG AGAGATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAA ATCAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGT GTCGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAG TTATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGA TTTGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGT GTTACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTG ACTCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT TCAGACACAGTTAATCCAGAC vcftools_0.1.11/examples/consensus.out20000644000000000000000000000201512156354771016673 0ustar rootroot>1:1-500 ATAC*ATATGTG*T***ATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTT G*AGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCAT TAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAA TATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCT CTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAA CTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAA GGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCT GATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAAT CTTTAAAAACAAAAAAAAAGAA >2:1-500 
GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG A*ATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATAAAT CAGAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGTGT CGGGACAGCCTTTTTATAAAATTTTTCTAAATAATGTTGAGGCTTTGATACGTCAAAGTT ATATTTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGATT TGCTGCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGT TACATGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGAC TCCTCTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATT@C AGACACAGTTAATCCAGAC vcftools_0.1.11/examples/annotate-test.vcf0000644000000000000000000000322512156354771017332 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT G 1806 q10 DP=35 1 104 . C . 1792 PASS DP=32 1 105 . C T 246 PASS DP=10 1 106 . C A 246 PASS DP=10 2 107 . C . 1806 q10 DP=35 2 108 . C . 1792 PASS DP=32 2 109 . C . 628 q10 DP=21 2 110 . C G 1016 PASS DP=22 2 111 . C G 727 PASS DP=30 2 112 . C G 246 PASS DP=10 2 113 . C . 246 PASS DP=10 2 114 . T . 246 PASS DP=10 2 115 . T . 246 PASS DP=10 2 116 . T . 246 PASS DP=10 2 117 . T A 246 PASS DP=10 2 118 . T C 246 PASS DP=10 2 119 . TAAA T 246 PASS DP=10 2 124 . TA T 246 PASS DP=10 2 128 . T TA 246 PASS DP=10 2 130 . C A 246 PASS DP=10 2 131 . T A 246 PASS DP=10 2 132 . T A 246 PASS DP=10 2 133 . T A 246 PASS DP=10 2 134 . T A 246 PASS DP=10 2 135 . T C 246 PASS DP=10 2 136 . TT T 246 PASS DP=10;AF=0.1 2 138 . TT T 246 PASS DP=10;AF=0.2 2 140 . TT T 246 PASS DP=10;AF=0.1 17 12412 . CAGAGAGAGA CAGAGAGAGAGA 74.8 . INDEL;DP=5388;AF1=0.006576;CI95=0.005525,0.01105;DP4=2077,2367,21,22;MQ=47;FQ=74.8;PV4=0.88,1,0.34,0.021 17 12427 . G A 999 . DP=5557;AF1=0.06028;CI95=0.04972,0.07182;DP4=2461,2689,106,74;MQ=47;FQ=999;PV4=0.0038,1,2.6e-12,1 17 69284 . G A 14.6 . DP=3946;AF1=0.003468;CI95=0.002762,0.008287;DP4=1529,2177,7,9;MQ=44;FQ=14.6;PV4=1,0.035,0.098,1 17 69293 . GTTTCATTTC GTTTCTTTTCATTTC 999 . 
INDEL;DP=3568;AF1=0.1295;CI95=0.1077,0.1547;DP4=1014,1238,118,121;MQ=44;FQ=999;PV4=0.22,1,9.4e-54,1 vcftools_0.1.11/examples/consensus.vcf0000644000000000000000000000042312156354771016561 0ustar rootroot##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA001 1 5 . C * . PASS . GT 0/1 1 10 . G * . PASS . GT 0/0 1 12 . GACT G* . PASS . GT 0/1 1 16 . T T*** . PASS . GT 1/1 1 61 . C * . PASS . GT 1/1 2 61 . AGAG A* . PASS . GT 0/1 2 481 . T *,@ . PASS . GT 0/2 vcftools_0.1.11/examples/annotate2.out0000644000000000000000000000604312156354771016471 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO 1 100 . GTTT G 1806 q10;MaxDP DP=35 1 104 . C . 1792 PASS DP=32 1 105 . C T 246 SnpGap;SnpCluster DP=10 1 106 . C A 246 SnpGap;SnpCluster DP=10 2 107 . C . 1806 q10;MaxDP DP=35 2 108 . C . 1792 PASS DP=32 2 109 . C . 628 q10 DP=21 2 110 . C G 1016 SnpGap;SnpCluster DP=22 2 111 . C G 727 SnpGap;SnpCluster DP=30 2 112 . C G 246 SnpGap;SnpCluster DP=10 2 113 . C . 246 PASS DP=10 2 114 . T . 246 PASS DP=10 2 115 . T . 246 PASS DP=10 2 116 . T . 246 PASS DP=10 2 117 . T A 246 SnpGap;SnpCluster DP=10 2 118 . T C 246 SnpGap;SnpCluster DP=10 2 119 . TAAA T 246 SnpCluster DP=10 2 124 . TA T 246 GapWin DP=10 2 128 . T TA 246 SnpCluster DP=10 2 130 . C A 246 SnpGap;SnpCluster DP=10 2 131 . T A 246 SnpGap;SnpCluster DP=10 2 132 . T A 246 SnpGap;SnpCluster DP=10 2 133 . T A 246 SnpGap;SnpCluster DP=10 2 134 . T A 246 SnpGap;SnpCluster DP=10 2 135 . T C 246 SnpGap;SnpCluster DP=10 2 136 . TT T 246 GapWin;SnpCluster DP=10;AF=0.1 2 138 . TT T 246 SnpCluster DP=10;AF=0.2 2 140 . TT T 246 GapWin;SnpCluster DP=10;AF=0.1 17 12412 . 
CAGAGAGAGA CAGAGAGAGAGA 74.8 MaxDP INDEL;DP=5388;AF1=0.006576;CI95=0.005525,0.01105;DP4=2077,2367,21,22;MQ=47;FQ=74.8;PV4=0.88,1,0.34,0.021 17 12427 . G A 999 MaxDP;SnpGap DP=5557;AF1=0.06028;CI95=0.04972,0.07182;DP4=2461,2689,106,74;MQ=47;FQ=999;PV4=0.0038,1,2.6e-12,1 17 69284 . G A 14.6 MaxDP;SnpGap DP=3946;AF1=0.003468;CI95=0.002762,0.008287;DP4=1529,2177,7,9;MQ=44;FQ=14.6;PV4=1,0.035,0.098,1 17 69293 . GTTTCATTTC GTTTCTTTTCATTTC 999 MaxDP INDEL;DP=3568;AF1=0.1295;CI95=0.1077,0.1547;DP4=1014,1238,118,121;MQ=44;FQ=999;PV4=0.22,1,9.4e-54,1 vcftools_0.1.11/examples/concat-a.vcf0000644000000000000000000000223412156354771016230 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 1 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,TC,T 246 PASS DP=10 GT:GQ:DP 0/2:12:10 vcftools_0.1.11/examples/fill-an-ac.out0000644000000000000000000000276412156354771016507 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 100 . GTTT G 1806 q10 AC=1;AN=2;DP=35 GT:GQ:DP 0/1:409:35 1 110 . C T,G 1792 PASS AC=1,0;AN=2;DP=32 GT:GQ:DP 0/1:245:32 1 110 . CAAA C 1792 PASS AC=1;AN=2;DP=32 GT:GQ:DP 0/1:245:32 1 120 . 
GA G 628 q10 AC=2;AN=2;DP=21 GT:GQ:DP 1/1:21:21 1 130 . G T 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 1 130 . GAA GG 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 1 140 . GT G 727 PASS AC=1;AN=2;DP=30 GT:GQ:DP 0/1:150:30 1 150 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 1 160 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 2 100 . GTTT G 1806 q10 AC=1;AN=2;DP=35 GT:GQ:DP 0/1:409:35 2 110 . CAAA C 1792 PASS AC=1;AN=2;DP=32 GT:GQ:DP 0/1:245:32 2 120 . GA G 628 q10 AC=2;AN=2;DP=21 GT:GQ:DP 1/1:21:21 2 130 . GAA G 1016 PASS AC=1;AN=2;DP=22 GT:GQ:DP 0/1:212:22 2 140 . GT G 727 PASS AC=1;AN=2;DP=30 GT:GQ:DP 0/1:150:30 2 150 . TAAAA TA,T 246 PASS AC=1,1;AN=2;DP=10 GT:GQ:DP 1/2:12:10 2 160 . TAAAA TA,TC,T 246 PASS AC=0,1,0;AN=2;DP=10 GT:GQ:DP 0/2:12:10 vcftools_0.1.11/examples/floats.vcf0000644000000000000000000000106412156354771016033 0ustar rootroot##fileformat=VCFv4.1 ##INFO= ##FORMAT= ##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 19 14370 . G A 29 PASS FLOATTAG=0.0001 GT 0|0 19 14371 . G A 29 PASS FLOATTAG=1e-4 GT 0|0 19 14372 . G A 29 PASS FLOATTAG=1E-4 GT 0|0 19 14373 . G A 29 PASS FLOATTAG=1e4 GT 0|0 vcftools_0.1.11/examples/cmp-test-b-3.3.vcf0000644000000000000000000000101012156354771017006 0ustar rootroot##fileformat=VCFv3.3 ##INFO=DP,1,Integer,"Total Depth" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 1/0:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 1|0:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100400 . C G 35 . DP=1 GT:GQ:DP 1/1:41:1 0/0:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . 
DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools_0.1.11/examples/filters.txt0000644000000000000000000001144512156354771016260 0ustar rootroot# Examples of user-defined filters. Edit and run with -f filters.txt. # The examples below are self-explanatory. Notice the use of the predefined # variables ($PASS, $FAIL, $MATCH, $RECORD) and methods (error). # In this example, a minimum value of AF1=0.1 is required { tag => 'INFO/AF1', # The VCF tag to apply this filter on name => 'MinAF', # The filter ID desc => 'Minimum AF1 [0.01]', # Description for the VCF header test => sub { return $MATCH < 0.01 ? $FAIL : $PASS }, }, # Filter all indels (presence of INDEL tag is tested) { tag => 'INFO/INDEL', apply_to => 'indels', # Can be one of SNPs, indels, all. Default: [All] name => 'Indel', desc => 'INDEL tag present', test => sub { return $FAIL }, }, # Only loci with enough reads supporting the variant will pass the filter { tag => 'INFO/DP4', name => 'FewAlts', desc => 'Too few reads supporting the variant', apply_to => 'SNPs', test => sub { if ( !($MATCH =~ /^([^,]+),([^,]+),([^,]+),(.+)$/) ) { error("Could not parse INFO/DP4: $CHROM:$POS [$MATCH]"); } if ( 0.1*($1+$2) > $3+$4 ) { return $PASS; } return $FAIL; }, }, # Example of filtering based on genotype columns and the QUAL column { tag => 'FORMAT/PL', name => 'NoHets', desc => 'Inbred homozygous mouse, no hets expected', apply_to => 'SNPs', test => sub { for my $pl (@$MATCH) { my @pls = split(/,/,$pl); if ( $pls[1]<$pls[0] && $pls[1]<$pls[2] ) { return $FAIL; } } return $PASS; }, }, # This example splits the four PV4 values into four tags names PV0, PV1, PV2 and PV3. # Note the use of the 'header' key, and the $RECORD and $VCF variables. 
{ header => [ qq[key=INFO,ID=PV0,Number=1,Type=Float,Description="P-value for strand bias"], qq[key=INFO,ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias"], qq[key=INFO,ID=PV2,Number=1,Type=Float,Description="P-value for mapQ bias"], qq[key=INFO,ID=PV3,Number=1,Type=Float,Description="P-value for tail distance bias"] ], tag => 'INFO/PV4', name => 'SplitPV4', desc => 'Split PV4', apply_to => 'all', test => sub { my @vals = split(/,/,$MATCH); $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'PV0'=>$vals[0],'PV1'=>$vals[1],'PV2'=>$vals[2],'PV3'=>$vals[3]); return $PASS; }, }, # Do whatever you want with every record and edit it according to your needs. This silly # example removes the tag SILLY in records where ID is set and depth is bigger than 5. { tag => 'Dummy', test => sub { if ( $$RECORD[2] eq '.' ) { return $PASS; } # Modify only lines with ID my $dp = $vcf->get_info_field($$RECORD[7],'DP'); if ( $dp>5 ) { $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'SILLY'=>undef); } return $PASS; }, } # Filter records with the value XY absent or not equal to 42 { tag => 'Dummy', header => [ qq[key=FILTER,ID=XY,Description="XY not OK"], ], test => sub { my $xy = $VCF->get_info_field($$RECORD[7],'XY'); my $is_bad = ( !defined $xy or $xy!=42 ) ? 1 : 0; $$RECORD[6] = $VCF->add_filter($$RECORD[6],'XY'=>$is_bad); return $PASS; }, }, # Annotate INFO field with SINGLETON flag when one and only one sample is different from the reference { header => [ qq[key=INFO,ID=SINGLETON,Number=0,Type=Flag,Description="Only one non-ref sample"], ], tag => 'FORMAT/GT', name => 'Dummy', desc => 'Dummy', test => sub { my $nalt = 0; for my $gt (@$MATCH) { my @gt = $VCF->split_gt($gt); for my $allele (@gt) { if ( $allele ne 0 && $allele ne '.' ) { $nalt++; last; } } if ( $nalt>1 ) { last; } } if ( $nalt==1 ) { $$RECORD[7] = $VCF->add_info_field($$RECORD[7],'SINGLETON'=>''); } return $PASS; }, }, # Set genotypes to unknown ("." or "./." 
depending on ploidy) when coverage is low (by Shane McCarthy). { tag => 'FORMAT/DP', name => 'MinSampleDP', desc => 'Genotypes set to . for samples with DP < 2', apply_to => 'all', test => sub { my $i = 8; for my $dp (@$MATCH) { $i++; next unless ($dp<2); my @format = split(/:/,$$RECORD[$i]); $format[0] = $format[0] =~ /\// ? "./." : "."; $$RECORD[$i] = join(":",@format); } return $PASS; }, }, vcftools_0.1.11/examples/subset.vcf0000644000000000000000000000231512156354771016050 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100 . G C 1806 q10 DP=35 GT:GQ:DP 1/1:409:35 1/1:409:35 1 110 . C . 1792 PASS DP=32 GT:GQ:DP 0/0:245:32 0/0:245:32 1 120 . T G 628 q10 DP=21 GT:GQ:DP 0|1|1:21:21 0|1|1:21:21 1 130 . GAA G,GTA 1016 PASS DP=22 GT:GQ:DP 1/2:212:22 1/2:212:22 1 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 0/1:150:30 1 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 1 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 2 100 . GTTT G 1806 q10 DP=35 GT:GQ:DP 0/1:409:35 0/1:409:35 2 110 . CAAA C 1792 PASS DP=32 GT:GQ:DP 0/1:245:32 0/1:245:32 2 120 . GA G 628 q10 DP=21 GT:GQ:DP 1/1:21:21 1/1:21:21 2 130 . GAA G 1016 PASS DP=22 GT:GQ:DP 0/1:212:22 0/1:212:22 2 140 . GT G 727 PASS DP=30 GT:GQ:DP 0/1:150:30 0/1:150:30 2 150 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 2 160 . TAAAA TA,T 246 PASS DP=10 GT:GQ:DP 1/2:12:10 1/2:12:10 vcftools_0.1.11/examples/annotate.txt0000644000000000000000000000045312156354771016416 0ustar rootroot100 100 1 id1_100 . . HM2 gene1 5 110 110 1 id1_110 CAAA C,CA 0 . 6 110 110 1 id2_110 C T 0 . 6 130 130 1 id1_130 G T HM2 . 7 130 130 1 id2_130 GAA GG HM2 . 7 140 140 1 id1_140 GT G 0 . 8 150 150 1 id1_150 TAAAA T 0 . 9 110 150 2 id2_110_150 CAAA C HM2 gene2 . 
160 160 2 id2_160 TAAAA TC 0 gene3 11 vcftools_0.1.11/examples/valid-4.0.vcf.stats0000644000000000000000000001205112156354771017274 0ustar rootroot$VAR1 = { 'samples' => { 'NA00002' => { 'indel_count' => 2, 'indel' => { '2' => 1, '-1' => 1 }, 'het_RA_count' => 6, 'snp_count' => 5, 'count' => 11, 'hom_RR_count' => 4, 'ref' => 10, 'missing' => 1, 'private' => 1, 'phased' => 7, 'het_AA_count' => 1, 'snp' => { 'A>G' => 1, 'A>T' => 2, 'T>A' => 2, 'G>A' => 1 }, 'unphased' => 4, 'ref_count' => 10 }, 'NA00001' => { 'het_RA_count' => 1, 'indel_count' => 1, 'indel' => { '1' => 1 }, 'snp_count' => 1, 'count' => 11, 'hom_RR_count' => 9, 'ref' => 10, 'missing' => 1, 'phased' => 6, 'het_AA_count' => 1, 'snp' => { 'A>T' => 1, 'A>G' => 1 }, 'unphased' => 5, 'ref_count' => 10 }, 'NA00003' => { 'hom_AA_count' => 4, 'other_count' => 2, 'indel_count' => 1, 'indel' => { '1' => 1 }, 'het_RA_count' => 5, 'snp_count' => 6, 'count' => 11, 'hom_RR_count' => 2, 'ref' => 7, 'missing' => 1, 'private' => 3, 'phased' => 2, 'other' => 2, 'snp' => { 'A>C' => 1, 'A>T' => 2, 'A>G' => 1, 'T>A' => 1, 'G>A' => 1 }, 'ref_count' => 7, 'unphased' => 9 } }, 'all' => { 'other_count' => 2, 'indel_count' => 2, 'indel' => { '1' => 1, '2' => 1, '-1' => 1 }, 'snp_count' => 8, 'count' => 12, 'ref' => 2, 'nalt_0' => 2, 'nalt_1' => 7, 'shared' => { '1' => 4, '0' => 2, '3' => 2, '2' => 4 }, 'other' => 2, 'nalt_2' => 3, 'snp' => { 'A>C' => 1, 'A>T' => 2, 'A>G' => 2, 'T>A' => 3, 'G>A' => 1 }, 'ref_count' => 2 } }; vcftools_0.1.11/examples/isec-n2-test.vcf.out0000644000000000000000000000214012156354771017562 0ustar rootrootWarning: The column names do not match (e.g. B): B A ##fileformat=VCFv4.0 ##FILTER= ##FORMAT= ##FILTER= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 3062915 . GTTT G 1806 q10 DP=35;DP4=1,2,3,4;SF=0f,1f,2 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 1 3106154 . CAAA C 1792 PASS DP=32;SF=0,1,2 GT:GQ:DP 0/1:245:32 1 3157410 . 
GA G 628 q10 DP=21;SF=0f,1,2 GT:GQ:DP 1/1:21:21 1 3162006 . GAA G 1016 PASS DP=22;SF=0,1,2 GT:GQ:DP 0/1:212:22 1 3177144 . GT G 727 PASS DP=30;SF=0,1,2 GT:GQ:DP 0/1:150:30 1 3184885 . TAAAA TA,T 246 PASS DP=10;SF=0,1 GT:GQ:DP 1/2:12:10 3 3199815 . G A 353 PASS DP=19;SF=1,2 GT:GQ:DP 0/1:188:19 vcftools_0.1.11/examples/cmp-test-a-3.3.vcf0000644000000000000000000000101212156354771017007 0ustar rootroot##fileformat=VCFv3.3 ##INFO=DP,1,Integer,"Total Depth" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/1:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 ./.:40:1 1 100400 . C G,T 35 . DP=1 GT:GQ:DP 1/1:41:1 0/2:40:1 1 100500 . A G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools_0.1.11/examples/contrast.vcf0000644000000000000000000006664112156354771016414 0ustar rootroot##fileformat=VCFv4.1 ##samtoolsVersion=0.1.18-r572 ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##source_20120424.1=vcf-annotate(r735) --fill-AC-AN -f + ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##INFO= ##INFO= ##source_20120424.2=vcf-annotate(r735) --fill-AC-AN -f + ##FILTER= ##source_20120710.1=vcf-annotate(r761) -f q=30 mpileup-v1/merged.filt.vcf.gz #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D 1 10177 . A C 37 MinMQ DP=495;VDB=0.0168;AF1=0.1596;AC1=1;DP4=167,82,52,21;MQ=13;FQ=37;PV4=0.57,1,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,0,15:101:2:5 0/0:0,36,89:51:5:40 0/0:0,79,103:85:1:83 0/1:41,0,31:85:16:36 1 10250 . 
A C 61 MinMQ DP=271;VDB=0.0265;AF1=0.125;AC1=1;DP4=87,78,18,9;MQ=17;FQ=61;PV4=0.21,1,1,0.1;AN=8;AC=1 GT:DP:SP:GQ 0/0:60:0:99 0/0:32:5:53 0/0:50:2:83 0/1:50:3:62 1 10257 . A C 31.9 MinMQ DP=400;VDB=0.0245;AF1=0.2404;AC1=2;DP4=93,100,26,10;MQ=16;FQ=31.9;PV4=0.01,1,1,0.013;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,93,197:65:3:95 0/1:13,0,92:41:9:11 0/0:0,91,128:59:0:93 0/1:27,0,70:64:14:25 1 10329 . ACCCC ACCC 26.4 MinMQ INDEL;DP=315;VDB=0.0160;AF1=0.2047;AC1=2;DP4=2,42,9,16;MQ=17;FQ=29.3;PV4=0.0011,1,0.061,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,4,68:22:10:7 0/0:2,0,16:7:0:3 0/0:0,15,61:18:0:17 0/1:46,0,34:22:7:40 1 10352 . TACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC 253 MinMQ INDEL;DP=413;VDB=0.0226;AF1=0.8598;AC1=7;DP4=7,17,13,44;MQ=15;FQ=4.35;PV4=0.58,1,1,0.0055;AN=8;AC=7 GT:PL:DP:SP:GQ 1/1:67,6,0:18:2:11 1/1:14,7,0:12:0:12 1/1:111,22,0:23:0:26 0/1:83,0,22:28:2:18 1 10492 . C T 999 PASS DP=213;VDB=0.0102;AF1=0.375;AC1=3;DP4=84,74,34,19;MQ=32;FQ=999;PV4=0.2,0.11,0.057,0.13;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:85,0,255:57:3:86 0/0:0,123,255:41:0:99 0/1:255,0,255:47:0:99 0/1:114,0,255:66:4:99 1 10583 . G A 20 PASS DP=134;VDB=0.0071;AF1=0.1242;AC1=1;DP4=78,41,6,6;MQ=32;FQ=20;PV4=0.35,0.29,0.052,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:26,0,227:40:8:21 0/0:0,35,255:21:0:40 0/0:0,108,255:36:0:99 0/0:0,21,255:34:0:26 1 10797 . CAGA CAGAGA 90.4 MinMQ INDEL;DP=37;VDB=0.0243;AF1=0.2819;AC1=2;DP4=7,3,0,6;MQ=29;FQ=93.3;PV4=0.011,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,9,104:3:0:10 0/0:0,6,39:2:0:8 0/1:59,0,36:5:0:43 0/1:56,0,32:6:4:39 1 10821 . T A 49.7 MinMQ DP=9;VDB=0.0091;AF1=1;AC1=5;DP4=1,3,0,4;MQ=12;FQ=7.75;PV4=1,1,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:42,9,0:3:0:10 0/1:0,3,4:1:0:2 1/1:12,1,0:2:0:4 0/1:0,6,8:2:0:2 1 14907 . 
A G 999 MinMQ DP=461;VDB=0.0384;AF1=0.5;G3=8.874e-45,1,8.011e-40;HWE=0.0185;AC1=4;DP4=101,122,129,102;MQ=25;FQ=999;PV4=0.031,0.011,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:225,0,255:133:0:99 0/1:213,0,225:91:20:99 0/1:255,0,188:104:0:99 0/1:255,0,208:126:4:99 1 14930 . A G 999 MinMQ DP=502;VDB=0.0393;AF1=0.5;G3=1.282e-48,1,7.866e-46;HWE=0.0185;AC1=4;DP4=117,121,135,111;MQ=28;FQ=999;PV4=0.24,0.02,0.42,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:150:0:99 0/1:232,0,255:84:9:99 0/1:255,0,250:114:0:99 0/1:255,0,218:136:4:99 1 15118 . A G 196 MinMQ DP=408;VDB=0.0389;AF1=0.4995;G3=4.894e-09,1,2.035e-09;HWE=0.0193;AC1=4;DP4=79,107,98,101;MQ=13;FQ=196;PV4=0.19,1,0.16,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:42,0,97:110:1:45 0/1:14,0,34:82:7:17 0/1:54,0,93:90:2:57 0/1:92,0,15:103:1:18 1 15211 . T G 999 MinMQ DP=381;VDB=0.0374;AF1=0.6993;AC1=6;DP4=52,44,122,137;MQ=17;FQ=156;PV4=0.28,0.31,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 0/1:146,0,101:114:5:99 1/1:77,1,0:67:2:4 0/1:121,0,61:78:7:60 1/1:192,89,0:96:11:90 1 15274 . A T 999 MinMQ DP=229;VDB=0.0313;AF1=1;AC1=8;DP4=0,0,99,120;MQ=11;FQ=-92.5;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:83,114,0:54:0:99 1/1:82,108,0:48:0:99 1/1:84,105,0:47:0:99 1/1:112,175,0:70:0:99 1 15820 . G T 90.6 MinMQ DP=149;VDB=0.0252;AF1=0.374;AC1=3;DP4=24,68,15,40;MQ=17;FQ=90.6;PV4=1,1,2e-07,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:40,0,124:48:1:41 0/0:0,27,153:33:2:26 1/1:65,25,0:15:3:20 0/0:0,65,195:51:5:64 1 15903 . GCC GCCC 158 MinMQ INDEL;DP=14;VDB=0.0182;AF1=1;AC1=8;DP4=0,0,0,7;MQ=29;FQ=-18.2;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:29,3,0:1:0:13 1/1:44,6,0:2:0:16 1/1:29,3,0:1:0:13 1/1:72,9,0:3:0:19 1 16103 . T G 24.3 PASS DP=110;VDB=0.0122;AF1=0.2119;AC1=2;DP4=49,26,23,2;MQ=31;FQ=24.3;PV4=0.01,1,6.9e-13,0.33;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,2,228:39:11:5 0/0:0,7,189:15:6:10 0/0:0,0,218:25:0:4 0/1:26,0,192:21:5:24 1 16378 . 
T C 999 MinMQ DP=587;VDB=0.0267;AF1=0.5;G3=9.954e-26,1,3.125e-18;HWE=0.0185;AC1=4;DP4=128,75,245,120;MQ=18;FQ=999;PV4=0.36,0.23,0.024,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:127,0,150:194:9:99 0/1:118,0,139:108:4:99 0/1:156,0,80:130:3:83 0/1:166,0,148:136:1:99 1 16495 . G C 97.7 MinMQ DP=644;VDB=0.0239;AF1=0.2493;AC1=2;DP4=226,252,67,87;MQ=19;FQ=97.7;PV4=0.46,0.14,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,41,126:190:5:43 0/0:0,17,168:115:1:19 0/1:19,0,132:166:1:17 0/1:87,0,176:161:4:85 1 16534 . C T 264 MinMQ DP=516;VDB=0.0397;AF1=0.5;G3=3.737e-14,1,2.067e-30;HWE=0.0185;AC1=4;DP4=129,149,109,113;MQ=14;FQ=264;PV4=0.59,0.34,1,0.0011;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:97,0,110:151:4:99 0/1:38,0,92:115:2:41 0/1:50,0,120:118:5:53 0/1:85,0,158:116:5:88 1 16571 . G A 120 MinMQ DP=435;VDB=0.0388;AF1=0.4998;G3=1.594e-10,1,7.561e-11;HWE=0.0189;AC1=4;DP4=94,134,84,109;MQ=10;FQ=120;PV4=0.69,0.018,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:33,0,20:123:0:23 0/1:42,0,24:95:2:27 0/1:18,0,43:107:0:21 0/1:33,0,70:96:4:36 1 17538 . C A 64 MinMQ DP=393;VDB=0.0314;AF1=0.125;AC1=1;DP4=138,205,17,27;MQ=28;FQ=64;PV4=0.87,0.32,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,152,255:148:1:99 0/0:0,29,227:72:4:34 0/0:0,71,255:86:6:76 0/1:70,0,226:81:4:65 1 20144 . G A 98.2 MinMQ DP=304;VDB=0.0356;AF1=0.4851;G3=4.916e-07,1,8.127e-35;HWE=0.0213;AC1=4;DP4=91,122,40,43;MQ=15;FQ=98.2;PV4=0.44,0.0094,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:6,0,72:94:6:9 0/1:32,0,80:44:13:35 0/1:28,0,112:81:9:31 0/1:38,0,62:77:0:41 1 28558 . C T 164 MinMQ DP=142;VDB=0.0026;AF1=0.4529;G3=1.465e-06,1,4.392e-30;HWE=0.0307;AC1=4;DP4=38,62,18,20;MQ=17;FQ=164;PV4=0.34,1,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:0,0,104:44:9:4 0/1:27,0,28:32:3:27 0/1:77,0,113:35:5:79 0/1:64,0,31:27:0:35 1 28563 . A G 999 MinMQ DP=124;VDB=0.0072;AF1=1;AC1=8;DP4=22,31,27,39;MQ=18;FQ=-3.67;PV4=1,1,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:191,6,0:41:1:14 1/1:90,2,0:24:0:11 1/1:213,20,0:31:4:28 1/1:104,0,1:23:0:8 1 28590 . 
TT TTGGT 116 MinMQ INDEL;DP=112;VDB=0.0233;AF1=0.3933;AC1=3;DP4=5,46,10,16;MQ=19;FQ=54.6;PV4=0.005,1,1,0.00097;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:80,0,2:23:10:8 0/1:9,0,9:15:15:9 0/1:51,0,26:21:2:31 0/0:0,17,39:18:5:16 1 30867 . CCTCTCTCTCTCTCTCTCTCTCTCTC CCTCTCTCTCTCTCTCTCTCTC 999 PASS INDEL;DP=229;VDB=0.0320;AF1=0.5;G3=4.953e-17,1,5e-52;HWE=0.0185;AC1=4;DP4=56,66,27,32;MQ=37;FQ=999;PV4=1,1,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:211,0,255:47:1:99 0/1:74,0,255:20:2:77 0/1:255,0,255:70:3:99 0/1:176,0,255:44:0:99 1 30923 . G T 999 PASS DP=107;VDB=0.0022;AF1=1;AC1=8;DP4=0,0,47,50;MQ=37;FQ=-36;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,75,0:25:0:99 1/1:221,30,0:10:0:72 1/1:255,117,0:39:0:99 1/1:255,69,0:23:0:99 1 40639 . CTTTTTTTTTTTTTTTTTTT CTTTTTTTTTTTTTTTT 118 PASS INDEL;DP=72;VDB=0.0379;AF1=1;AC1=8;DP4=0,0,14,2;MQ=33;FQ=-18.8;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:77,30,0:11:0:40 1/1:16,3,0:1:0:14 1/1:16,3,0:1:0:14 1/1:25,6,0:3:0:16 1 46633 . T A 46.4 MinMQ DP=169;VDB=0.0275;AF1=0.1322;AC1=1;DP4=67,81,9,9;MQ=15;FQ=46.4;PV4=0.8,0.5,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:52,0,60:47:0:47 0/0:0,7,71:30:0:12 0/0:0,114,146:38:0:99 0/0:0,154,179:51:0:99 1 49298 . T C 999 MinMQ DP=124;VDB=0.0376;AF1=1;AC1=8;DP4=17,14,49,36;MQ=24;FQ=-3.76;PV4=0.83,1,1,0.49;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:130,0,4:20:4:6 1/1:127,8,0:12:3:17 1/1:247,27,0:45:26:36 1/1:252,60,0:39:12:69 1 51803 . T C 999 MinMQ DP=88;VDB=0.0284;AF1=1;AC1=8;DP4=9,30,20,25;MQ=15;FQ=-3.64;PV4=0.065,0.21,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:45,2,0:22:11:10 1/1:60,5,0:6:3:13 1/1:105,1,0:30:2:9 1/1:153,6,0:26:2:14 1 51898 . C A 22.2 PASS DP=128;VDB=0.0230;AF1=0.1272;AC1=1;DP4=56,50,14,2;MQ=41;FQ=22.2;PV4=0.013,0.069,9.8e-17,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:28,0,255:23:13:23 0/0:0,35,255:19:3:40 0/0:0,59,255:44:0:64 0/0:0,11,255:36:9:16 1 51928 . 
G A 54.1 PASS DP=149;VDB=0.0311;AF1=0.1269;AC1=1;DP4=67,52,22,5;MQ=41;FQ=54.1;PV4=0.017,0.0073,7.3e-34,0.37;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:60,0,255:29:13:55 0/0:0,27,255:19:0:32 0/0:0,30,255:51:2:35 0/0:0,13,255:47:11:18 1 52058 . G C 17.5 PASS DP=132;VDB=0.0277;AF1=0.2308;AC1=2;DP4=55,57,15,2;MQ=35;FQ=17.5;PV4=0.0031,0.036,0.00091,0.03;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:13,0,178:20:7:11 0/0:0,60,255:20:0:62 0/1:12,0,255:51:8:10 0/0:0,15,255:38:9:17 1 52238 . T G 999 PASS DP=138;VDB=0.0125;AF1=1;AC1=8;DP4=0,0,65,60;MQ=37;FQ=-42;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,63,0:21:0:99 1/1:255,36,0:12:0:84 1/1:255,166,0:55:0:99 1/1:255,111,0:37:0:99 1 54586 . T C 51.1 PASS DP=116;VDB=0.0136;AF1=0.236;AC1=2;DP4=47,45,14,7;MQ=36;FQ=51.1;PV4=0.23,1,6.1e-11,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:48,0,186:24:0:46 0/0:0,26,252:14:0:28 0/1:11,0,255:32:7:9 0/0:0,40,255:43:9:42 1 54676 . C T 999 PASS DP=143;VDB=0.0244;AF1=0.4969;G3=1.224e-08,1,4.851e-96;HWE=0.0191;AC1=4;DP4=54,48,19,21;MQ=40;FQ=999;PV4=0.58,0.47,8.6e-13,0.2;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:82,0,233:23:7:85 0/1:121,0,237:20:2:99 0/1:173,0,255:49:6:99 0/1:13,0,255:50:2:16 1 54712 . TTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTT TTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTT 423 PASS INDEL;DP=161;VDB=0.0367;AF1=0.8839;AC1=7;DP4=1,0,4,12;MQ=45;FQ=-9.11;PV4=0.29,1,1,0.06;AN=8;AC=7 GT:PL:DP:SP:GQ 0/1:0,3,13:1:0:5 1/1:125,9,0:3:0:14 1/1:67,6,0:2:0:11 1/1:255,33,0:11:0:38 1 54753 . T G 61.5 PASS DP=177;VDB=0.0130;AF1=0.2019;AC1=2;DP4=48,82,1,5;MQ=40;FQ=61.5;PV4=0.42,1,0.27,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:5,0,237:20:3:4 0/1:63,0,193:16:0:60 0/0:0,160,255:53:0:99 0/0:0,141,255:47:0:99 1 54844 . G A 999 MinMQ DP=172;VDB=0.0254;AF1=0.4999;G3=4.104e-12,1,1.068e-33;HWE=0.0185;AC1=4;DP4=70,44,38,18;MQ=20;FQ=999;PV4=0.5,0.27,1,0.29;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:88,0,103:45:5:91 0/1:31,0,120:25:0:34 0/1:97,0,124:58:2:99 0/1:49,0,181:42:5:52 1 55085 . 
T A 149 MinMQ DP=190;VDB=0.0199;AF1=0.3891;AC1=3;DP4=73,61,13,39;MQ=25;FQ=149;PV4=0.0003,0.35,0.01,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:79,0,161:48:4:80 0/1:9,0,146:22:13:10 0/1:68,0,250:49:12:69 0/0:0,7,228:67:12:7 1 55164 . C A 999 MinMQ DP=96;VDB=0.0334;AF1=1;AC1=8;DP4=0,0,54,35;MQ=23;FQ=-36;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:163,60,0:20:0:99 1/1:124,30,0:10:0:72 1/1:198,69,0:23:0:99 1/1:203,108,0:36:0:99 1 55926 . T C 999 MinMQ DP=56;VDB=0.0269;AF1=1;AC1=8;DP4=0,0,23,32;MQ=14;FQ=-11.9;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:114,30,0:10:0:48 1/1:8,6,0:2:0:24 1/1:130,63,0:21:0:81 1/1:78,66,0:22:0:84 1 57376 . C T 16 MinMQ DP=143;VDB=0.0237;AF1=0.1883;AC1=2;DP4=70,55,2,13;MQ=28;FQ=16;PV4=0.002,0.034,0.001,0.027;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:4,0,201:26:8:3 0/1:18,0,188:19:10:15 0/0:0,26,246:38:15:29 0/0:0,154,255:57:0:99 1 57856 . T A 999 MinMQ DP=191;VDB=0.0263;AF1=0.5;G3=5.154e-23,1,1.244e-12;HWE=0.0185;AC1=4;DP4=58,51,29,51;MQ=21;FQ=999;PV4=0.027,0.028,1,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:204,0,154:54:13:99 0/1:121,0,52:29:9:55 0/1:142,0,175:58:2:99 0/1:104,0,198:48:0:99 1 57952 . A C 118 MinMQ DP=30;VDB=0.0356;AF1=0.8939;AC1=7;DP4=1,1,8,19;MQ=10;FQ=7.51;PV4=0.53,0.23,0.0064,0.46;AN=8;AC=7 GT:PL:DP:SP:GQ 1/1:36,21,0:7:0:27 1/1:17,6,0:2:0:12 1/1:40,33,0:11:0:39 0/1:29,0,12:9:4:7 1 58176 . G A 94.7 MinMQ DP=93;VDB=0.0330;AF1=0.3746;AC1=3;DP4=51,13,15,9;MQ=17;FQ=94.7;PV4=0.11,0.0027,1,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:30,0,23:22:0:26 0/1:18,0,15:12:7:17 0/1:55,0,102:29:2:56 0/0:0,42,114:25:9:41 1 58211 . A G 999 MinMQ DP=46;VDB=0.0332;AF1=1;AC1=8;DP4=0,0,30,15;MQ=22;FQ=-11.8;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:113,30,0:10:0:47 1/1:7,6,0:2:0:23 1/1:148,60,0:20:0:77 1/1:143,39,0:13:0:56 1 58771 . T C 999 MinMQ DP=263;VDB=0.0270;AF1=0.375;AC1=3;DP4=99,85,29,44;MQ=20;FQ=999;PV4=0.053,0.46,1,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:95,0,124:60:2:96 0/0:0,123,209:41:0:99 0/1:179,0,208:101:6:99 0/1:94,0,169:55:6:95 1 58866 . 
C G 119 MinMQ DP=233;VDB=0.0293;AF1=0.2581;AC1=2;DP4=77,98,12,40;MQ=18;FQ=119;PV4=0.0092,0.058,1,0.19;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:58,0,112:49:1:56 0/0:0,102,152:34:0:99 0/1:69,0,180:70:11:67 0/0:0,10,148:74:16:12 1 60332 . T C 97.6 MinMQ DP=239;VDB=0.0192;AF1=0.3089;AC1=2;DP4=77,104,22,32;MQ=17;FQ=97.6;PV4=0.88,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,0,99:61:9:3 0/0:0,111,185:37:0:99 0/1:70,0,145:68:7:70 0/1:33,0,123:69:0:33 1 61219 . T C 42.6 MinMQ DP=180;VDB=0.0243;AF1=0.1351;AC1=1;DP4=41,105,18,15;MQ=24;FQ=42.6;PV4=0.0069,0.27,0.32,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,6,128:36:5:11 0/0:0,57,168:31:0:62 0/1:48,0,227:53:21:43 0/0:0,16,255:59:6:21 1 61442 . A G 999 PASS DP=96;VDB=0.0348;AF1=1;AC1=8;DP4=0,0,41,47;MQ=30;FQ=-27;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:197,42,0:14:0:75 1/1:108,21,0:7:0:54 1/1:255,99,0:33:0:99 1/1:255,102,0:34:0:99 1 61499 . G A 87.1 PASS DP=140;VDB=0.0120;AF1=0.3006;AC1=2;DP4=54,55,18,12;MQ=35;FQ=87.1;PV4=0.41,0.3,2.4e-12,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:61,0,243:24:2:60 0/0:0,1,231:14:7:4 0/1:32,0,255:43:3:31 0/0:0,48,255:58:6:49 1 61579 . G A 107 MinMQ DP=161;VDB=0.0304;AF1=0.25;AC1=2;DP4=88,29,32,10;MQ=20;FQ=107;PV4=1,0.12,0.015,0.43;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,65,171:34:3:67 0/0:0,32,152:18:3:34 0/1:40,0,174:56:0:38 0/1:75,0,207:51:1:73 1 61987 . A G 999 PASS DP=206;VDB=0.0287;AF1=0.5;G3=1.244e-38,1,7.859e-46;HWE=0.0185;AC1=4;DP4=46,65,42,48;MQ=39;FQ=999;PV4=0.48,0.072,0.00033,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:58:0:99 0/1:182,0,218:22:2:99 0/1:255,0,255:62:3:99 0/1:220,0,255:59:6:99 1 61989 . G C 999 PASS DP=208;VDB=0.0311;AF1=0.5;G3=3.141e-40,1,8.15e-49;HWE=0.0185;AC1=4;DP4=47,65,42,49;MQ=39;FQ=999;PV4=0.57,0.058,8.7e-05,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:59:1:99 0/1:190,0,233:22:2:99 0/1:255,0,255:62:3:99 0/1:216,0,255:60:6:99 1 62203 . 
T C 999 PASS DP=258;VDB=0.0354;AF1=0.5;G3=3.125e-31,1,5e-52;HWE=0.0185;AC1=4;DP4=77,73,47,52;MQ=40;FQ=999;PV4=0.61,1,2.6e-25,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:255,0,255:66:5:99 0/1:145,0,255:38:1:99 0/1:252,0,255:84:1:99 0/1:210,0,255:61:0:99 1 62239 . TACACACACACACACACA TACACACACACACACA 999 PASS INDEL;DP=223;VDB=0.0280;AF1=0.4961;G3=3.069e-08,1,4.923e-103;HWE=0.0192;AC1=4;DP4=83,54,34,25;MQ=41;FQ=999;PV4=0.75,0.056,2.1e-17,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:248,0,255:45:1:99 0/1:12,0,255:23:9:15 0/1:183,0,255:68:2:99 0/1:158,0,255:60:6:99 1 62271 . A G 134 PASS DP=187;VDB=0.0101;AF1=0.498;G3=2.233e-09,1,4.959e-103;HWE=0.0189;AC1=4;DP4=92,56,14,20;MQ=41;FQ=134;PV4=0.033,0.013,1.9e-22,0.0028;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:65,0,255:35:1:68 0/1:19,0,255:28:2:22 0/1:17,0,255:60:11:20 0/1:39,0,255:59:14:42 1 62777 . A T 999 MinMQ DP=251;VDB=0.0308;AF1=0.499;G3=3.104e-08,1,1.551e-35;HWE=0.0187;AC1=4;DP4=80,108,35,26;MQ=21;FQ=999;PV4=0.055,1,1,0.39;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:115,0,137:69:1:99 0/1:124,0,169:34:6:99 0/1:87,0,202:76:5:90 0/1:18,0,109:70:3:21 1 63735 . CCTACTA CCTA 999 MinMQ INDEL;DP=141;VDB=0.0354;AF1=0.5;G3=9.836e-28,1,9.22e-15;HWE=0.0185;AC1=4;DP4=36,24,32,41;MQ=20;FQ=216;PV4=0.082,1.2e-09,1,0.023;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:83,0,86:30:6:84 0/1:130,0,56:21:0:59 0/1:184,0,40:43:5:43 0/1:181,0,55:39:3:58 1 64613 . T A 999 MinMQ DP=328;VDB=0.0091;AF1=0.5;G3=7.862e-12,1,3.13e-37;HWE=0.0185;AC1=4;DP4=134,130,23,36;MQ=27;FQ=999;PV4=0.11,1,1,0.36;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:79,0,175:95:0:82 0/1:48,0,208:47:2:51 0/1:193,0,248:95:13:99 0/1:107,0,211:86:0:99 1 66162 . A T 999 PASS DP=215;VDB=0.0231;AF1=0.4998;G3=1.238e-10,1,1.58e-77;HWE=0.0186;AC1=4;DP4=62,67,26,36;MQ=39;FQ=999;PV4=0.44,1,3.3e-21,1;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:170,0,255:45:3:99 0/1:77,0,255:30:0:80 0/1:183,0,255:69:3:99 0/1:26,0,255:47:0:29 1 66442 . 
TATATAATATA TATATAATATAATATA 132 PASS INDEL;DP=233;VDB=0.0328;AF1=0.3333;AC1=3;DP4=64,69,21,10;MQ=42;FQ=135;PV4=0.071,1,3.5e-27,0.11;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:62,0,255:30:2:62 0/0:0,50,255:37:0:50 0/1:91,0,255:65:8:91 0/1:3,0,255:32:2:5 1 66507 . T A 999 PASS DP=202;VDB=0.0385;AF1=0.626;AC1=5;DP4=25,14,63,82;MQ=42;FQ=999;PV4=0.03,0.023,1,0.0014;AN=8;AC=5 GT:PL:DP:SP:GQ 0/1:255,0,205:42:7:99 0/1:255,0,20:37:12:21 0/1:255,0,155:57:4:99 1/1:255,72,0:48:0:71 1 66521 . TATATAATATA TATATAATATAATATA 999 PASS INDEL;DP=200;VDB=0.0384;AF1=0.3747;AC1=3;DP4=61,75,25,12;MQ=43;FQ=999;PV4=0.016,1,3.8e-20,0.38;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:233,0,255:40:7:99 0/1:25,0,255:32:16:26 0/1:178,0,255:56:3:99 0/0:0,75,255:45:3:74 1 69511 . A G 999 MinMQ DP=79;VDB=0.0355;AF1=1;AC1=8;DP4=1,0,44,31;MQ=18;FQ=-30.9;PV4=1,1,1,1;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:133,42,0:14:0:79 1/1:95,25,0:12:0:62 1/1:170,57,0:19:0:94 1/1:192,93,0:31:0:99 1 70300 . C T 14.2 MinMQ DP=147;VDB=0.0063;AF1=0.1206;AC1=1;DP4=63,68,7,5;MQ=19;FQ=14.2;PV4=0.56,1,0.057,0.22;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:20,0,181:35:0:15 0/0:0,39,144:25:7:45 0/0:0,122,201:46:0:99 0/0:0,111,207:37:0:99 1 73822 . A G 161 MinMQ DP=175;VDB=0.0290;AF1=0.2494;AC1=2;DP4=67,87,9,10;MQ=25;FQ=161;PV4=0.81,0.5,1,0.45;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:25,0,203:41:1:23 0/1:144,0,170:29:6:99 0/0:0,178,255:59:0:99 0/0:0,132,255:44:0:99 1 73841 . C T 999 PASS DP=182;VDB=0.0366;AF1=0.3748;AC1=3;DP4=50,64,12,26;MQ=30;FQ=999;PV4=0.25,1.6e-10,0.084,1;AN=8;AC=3 GT:PL:DP:SP:GQ 0/1:95,0,255:33:3:96 0/1:174,0,204:27:9:99 0/1:28,0,255:53:17:29 0/0:0,64,255:39:6:63 1 74092 . G A 26.8 MinMQ DP=158;VDB=0.0267;AF1=0.2721;G3=0.7501,7.846e-07,0.2499;HWE=0.0437;AC1=2;DP4=91,48,4,10;MQ=11;FQ=26.8;PV4=0.0093,0.39,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,5,79:32:10:7 1/1:40,26,0:33:7:19 0/0:0,105,85:35:0:93 0/0:0,160,38:53:0:46 1 79033 . 
A G 217 MinMQ DP=19;VDB=0.0139;AF1=1;AC1=8;DP4=0,0,12,7;MQ=18;FQ=-12.7;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:77,18,0:6:0:36 1/1:56,12,0:4:0:30 1/1:28,18,0:6:0:36 1/1:56,9,0:3:0:27 1 79050 . G T 258 MinMQ DP=29;VDB=0.0203;AF1=1;AC1=8;DP4=0,0,15,11;MQ=16;FQ=-18.3;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:87,21,0:7:0:45 1/1:52,15,0:5:0:39 1/1:45,24,0:8:0:48 1/1:74,18,0:6:0:42 1 79418 . G C 28.5 PASS DP=99;VDB=0.0139;AF1=0.2016;AC1=2;DP4=31,59,1,5;MQ=39;FQ=28.5;PV4=0.66,0.015,0.045,0.00068;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:5,0,236:21:3:4 0/1:30,0,229:18:0:27 0/0:0,78,255:26:0:81 0/0:0,93,255:31:0:96 1 79772 . C G 999 PASS DP=138;VDB=0.0342;AF1=0.25;AC1=2;DP4=68,47,11,9;MQ=30;FQ=999;PV4=0.81,1,0.41,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:132,0,127:29:6:99 0/1:84,0,219:27:2:82 0/0:0,117,255:39:0:99 0/0:0,120,255:40:0:99 1 82115 . A G 51.1 MinMQ DP=137;VDB=0.0291;AF1=0.1264;AC1=1;DP4=71,55,3,7;MQ=27;FQ=51.1;PV4=0.19,0.085,1,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:57,0,204:36:6:52 0/0:0,14,206:20:0:19 0/0:0,123,255:41:0:99 0/0:0,117,255:39:0:99 1 82133 . CAAAAAAAAAAAAAAAAAAAAA CAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAA 81.9 PASS INDEL;DP=107;VDB=0.0354;AF1=1;AC1=8;DP4=0,0,5,17;MQ=37;FQ=-22.7;AN=8;AC=7,1 GT:PL:DP:SP:GQ 1/2:83,73,58,31,0,18:9:0:29 1/1:10,3,0,10,3,10:1:0:17 1/1:32,15,0,32,15,32:7:0:29 1/1:32,15,0,32,15,32:5:0:29 1 82303 . T C 21 PASS DP=111;VDB=0.0241;AF1=0.1243;AC1=1;DP4=47,50,3,8;MQ=38;FQ=21;PV4=0.22,1,1.5e-14,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:27,0,255:20:5:22 0/0:0,24,255:21:0:29 0/0:0,96,255:32:0:99 0/0:0,75,255:35:0:80 1 82456 . A G 49.6 MinMQ DP=151;VDB=0.0367;AF1=0.2495;AC1=2;DP4=77,55,15,0;MQ=20;FQ=49.6;PV4=0.0011,0.29,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:31,0,124:31:8:29 0/1:27,0,107:23:9:25 0/0:0,151,251:50:0:99 0/0:0,129,255:43:0:99 1 82676 . T G 999 PASS DP=152;VDB=0.0213;AF1=0.25;AC1=2;DP4=70,59,9,11;MQ=34;FQ=999;PV4=0.48,0.37,0.0004,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:124,0,255:34:0:99 0/1:89,0,255:31:6:87 0/0:0,138,255:46:0:99 0/0:0,114,255:38:0:99 1 83084 . 
T A 999 PASS DP=84;VDB=0.0128;AF1=1;AC1=8;DP4=0,0,38,37;MQ=37;FQ=-33;AN=8;AC=8 GT:PL:DP:SP:GQ 1/1:255,48,0:16:0:87 1/1:203,27,0:9:0:66 1/1:255,72,0:24:0:99 1/1:255,78,0:26:0:99 1 83514 . C T 88.2 MinMQ DP=139;VDB=0.0336;AF1=0.4441;AC1=4;DP4=54,53,22,8;MQ=14;FQ=87.9;PV4=0.037,0.05,1,0.35;AN=8;AC=4 GT:PL:DP:SP:GQ 1/1:30,5,0:38:2:4 0/0:0,30,24:20:6:24 0/1:17,0,23:36:4:18 0/1:51,0,62:43:8:53 1 83786 . TAAAAAAAA TAAAAAAAAAAA 134 PASS INDEL;DP=144;VDB=0.0396;AF1=0.2505;AC1=2;DP4=43,54,4,9;MQ=40;FQ=137;PV4=0.39,1,2.7e-06,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:47,0,67:25:0:45 0/1:113,0,16:11:0:24 0/0:0,84,98:29:0:86 0/0:0,135,116:45:0:99 1 83829 . GAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA GAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA 999 PASS INDEL;DP=131;VDB=0.0388;AF1=0.5;AC1=4;DP4=20,14,15,17;MQ=39;FQ=999;PV4=0.46,1,7e-05,1;AN=8;AC=4 GT:PL:DP:SP:GQ 1/1:255,39,0:13:0:36 0/1:36,0,124:6:0:39 0/1:255,0,203:27:2:99 0/0:0,60,255:20:0:57 1 83895 . GAGAAAGAAAGAAAGAAAGA GAGAAAGAAAGAAAGA 999 PASS INDEL;DP=151;VDB=0.0325;AF1=0.75;AC1=6;DP4=18,19,38,34;MQ=44;FQ=999;PV4=0.69,0.00045,0.12,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:255,60,0:20:0:62 1/1:255,48,0:16:0:50 0/1:255,0,255:37:1:99 0/1:255,0,255:36:3:99 1 84010 . G A 37 PASS DP=190;VDB=0.0033;AF1=0.125;AC1=1;DP4=85,71,6,3;MQ=38;FQ=37;PV4=0.73,0.2,1,0.14;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,126,255:42:0:99 0/0:0,81,255:27:0:86 0/1:43,0,255:38:2:38 0/0:0,64,255:58:2:69 1 84014 . G A 38 PASS DP=192;VDB=0.0055;AF1=0.125;AC1=1;DP4=89,67,3,4;MQ=39;FQ=38;PV4=0.47,0.21,1,0.021;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:44,0,255:42:0:39 0/0:0,84,255:28:0:89 0/0:0,72,255:36:0:77 0/0:0,172,255:57:0:99 1 84018 . G A 79.6 PASS DP=188;VDB=0.0107;AF1=0.2497;AC1=2;DP4=77,64,8,3;MQ=41;FQ=79.6;PV4=0.35,0.17,1,0.0032;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,120,255:40:0:99 0/1:28,0,255:24:0:26 0/0:0,108,255:36:0:99 0/1:60,0,255:52:6:58 1 84244 . 
A C 999 PASS DP=213;VDB=0.0276;AF1=0.25;AC1=2;DP4=83,93,14,22;MQ=41;FQ=999;PV4=0.46,0.29,0.019,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:47:6:99 0/1:208,0,255:49:1:99 0/0:0,178,255:59:0:99 0/0:0,172,255:57:0:99 1 85597 . A C 999 PASS DP=139;VDB=0.0342;AF1=0.25;AC1=2;DP4=47,60,16,14;MQ=30;FQ=999;PV4=0.41,0.057,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:198,0,135:26:0:99 0/1:184,0,221:35:5:99 0/0:0,114,255:38:0:99 0/0:0,114,255:38:0:99 1 86018 . C G 999 PASS DP=181;VDB=0.0399;AF1=0.25;AC1=2;DP4=70,69,13,26;MQ=42;FQ=999;PV4=0.07,1,7e-14,0.12;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:200,0,255:36:11:99 0/1:240,0,255:46:2:99 0/0:0,129,255:43:0:99 0/0:0,160,255:53:0:99 1 86303 . G T 999 PASS DP=182;VDB=0.0329;AF1=0.25;AC1=2;DP4=76,66,17,22;MQ=40;FQ=999;PV4=0.28,1,3.7e-11,0.31;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:44:3:99 0/1:207,0,255:31:1:99 0/0:0,138,255:46:0:99 0/0:0,181,255:60:0:99 1 86331 . A G 999 PASS DP=187;VDB=0.0331;AF1=0.25;AC1=2;DP4=69,74,18,23;MQ=40;FQ=999;PV4=0.72,1,2.4e-05,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/1:255,0,255:51:1:99 0/1:216,0,255:34:0:99 0/0:0,120,255:40:0:99 0/0:0,178,255:59:0:99 1 86656 . G T 36 MinMQ DP=148;VDB=0.0132;AF1=0.125;AC1=1;DP4=76,63,0,9;MQ=28;FQ=36;PV4=0.0012,1,0.028,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:42,0,220:30:8:37 0/0:0,39,255:24:6:44 0/0:0,126,255:42:0:99 0/0:0,157,255:52:0:99 1 89677 . A G 195 MinMQ DP=136;VDB=0.0371;AF1=0.25;AC1=2;DP4=46,51,14,17;MQ=23;FQ=195;PV4=0.84,0.28,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,72,186:24:0:74 0/0:0,63,181:21:0:65 0/1:130,0,145:39:1:99 0/1:73,0,155:44:1:71 1 91072 . A G 200 MinMQ DP=88;VDB=0.0383;AF1=0.25;AC1=2;DP4=44,22,11,10;MQ=25;FQ=200;PV4=0.3,1,1,0.41;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,42,166:14:0:44 0/0:0,33,123:11:0:35 0/1:114,0,203:38:10:99 0/1:94,0,104:24:2:92 1 91075 . T C 187 MinMQ DP=85;VDB=0.0399;AF1=0.25;AC1=2;DP4=41,21,11,9;MQ=26;FQ=187;PV4=0.43,1,1,1;AN=8;AC=2 GT:PL:DP:SP:GQ 0/0:0,42,148:14:0:44 0/0:0,33,136:11:0:35 0/1:100,0,186:36:8:98 0/1:95,0,112:21:5:93 1 91336 . 
A T 999 MinMQ DP=69;VDB=0.0304;AF1=0.7419;AC1=6;DP4=11,19,1,37;MQ=20;FQ=216;PV4=0.0003,1,1,1;AN=8;AC=6 GT:PL:DP:SP:GQ 1/1:51,10,0:11:3:12 1/1:104,39,0:13:0:41 0/1:54,0,110:17:6:62 0/1:61,0,114:27:13:69 1 98929 . A G 28 MinMQ DP=172;VDB=0.0198;AF1=0.1249;AC1=1;DP4=63,78,12,9;MQ=21;FQ=28;PV4=0.35,0.23,0.15,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/0:0,90,234:30:0:95 0/1:34,0,191:43:3:29 0/0:0,114,245:38:0:99 0/0:0,28,150:51:6:33 1 98999 . TTTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTT TTTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTT 999 PASS INDEL;DP=236;VDB=0.0395;AF1=0.5;G3=1.982e-19,1,1.257e-17;HWE=0.0185;AC1=4;DP4=33,36,32,46;MQ=35;FQ=999;PV4=0.51,1,0.0067,0.38;AN=8;AC=4 GT:PL:DP:SP:GQ 0/1:112,0,192:30:0:99 0/1:255,0,77:31:4:80 0/1:86,0,191:32:2:89 0/1:255,0,100:54:12:99 1 101686 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP:SP:GQ 0/1:95,0,255:58:0:90 0/0:0,72,255:24:0:77 0/0:0,101,255:102:0:99 0/0:0,255,255:111:0:99 X 1 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 0:0,255:11 X 2 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 X 3 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1 GT:PL:DP 0/0:0,95,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 vcftools_0.1.11/examples/cmp-test-b.vcf0000644000000000000000000000117412156354771016520 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 1 100100 . G C 0 . DP=1 GT:GQ:DP 0|1:40:1 1/0:40:1 1 100200 . G C 0 . DP=1 GT:GQ:DP 1|0:40:1 0/0:40:1 1 100300 . G C 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100400 . C G 35 . DP=1 GT:GQ:DP 1/1:41:1 0/0:40:1 1 100500 . A G 0 . 
DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 1 100600 . C G 0 . DP=1 GT:GQ:DP 1/1:40:1 0/0:40:1 vcftools_0.1.11/examples/fix-ploidy.out0000644000000000000000000000360412156354771016662 0ustar rootroot61098 M1 0 0,9,72,5,6,7 M2 0 0,15,140,5,6,7 F3 1/1 147,0,5 F4 0/0 0,131,5 M5 0 0,9,83,5,6,7 M6 0 0,6,56,5,6,7 61270 M1 0 8,14,58 M2 0 0,6,52 F3 0/0 0,6,56 F4 0/0 0,15,117 M5 0 0,6,45 M6 0 0,12,87 61275 M1 0 0,3,13 M2 0 0,3,28 F3 0/0 8,0,41 F4 0/0 0,12,97 M5 0 0,6,49 M6 0 0,9,67 61282 M1 0/1 15,3,0 M2 0/0 0,6,51 F3 0 6,0,31 F4 0 0,6,57 M5 0/1 7,0,19 M6 0/1 16,0,20 61795 M1 0/0 0,27,203 M2 0/0 0,21,174 F3 0 0,45,229 F4 0 0,27,199 M5 0/0 0,24,182 M6 0/0 0,9,85 62731 M1 0/0 0,27,194 M2 0/0 0,24,194 F3 0 0,18,141 F4 0 0,30,201 M5 0/0 0,18,153 M6 0/0 0,33,202 63008 M1 0/0 0,42,255 M2 0/0 0,15,128 F3 0 0,15,136 F4 0 0,39,251 M5 0/0 0,15,111 M6 0/0 0,27,200 63231 M1 0/0 0,42,246 M2 0/0 0,18,141 F3 0 0,27,209 F4 0 0,24,186 M5 0/0 0,12,110 M6 0/0 0,21,145 63244 M1 0 0,36,209 M2 0 0,21,174 F3 . 0,27,198 F4 . 0,24,184 M5 0 0,15,132 M6 0 0,21,159 63328 M1 0 0,42,242 M2 0 0,12,110 F3 . 0,36,231 F4 . 0,36,226 M5 0 0,15,135 M6 0 0,18,132 63452 M1 0 0,27,200 M2 0 0,15,123 F3 . 0,33,228 F4 . 0,9,88 M5 0 0,24,171 M6 0 0,15,134 63799 M1 0 0,36,211 M2 0 0,27,205 F3 . 0,18,125 F4 . 0,15,125 M5 0 1,0,150 M6 0 0,12,106 63967 M1 0 0,30,183 M2 0 0,30,206 F3 . 0,30,206 F4 . 0,33,230 M5 0 0,21,160 M6 0 0,12,112 65288 M1 . 0,18,155 M2 . 0,12,113 F3 0 0,18,144 F4 0 0,21,155 M5 . 0,21,176 M6 . 0,6,63 65900 M1 . 162,24,0 M2 . 160,21,0 F3 1 219,30,0 F4 1 213,30,0 M5 . 248,42,0 M6 . 148,21,0 65951 M1 . 0,18,125 M2 . 0,15,132 F3 0 0,24,183 F4 0 0,45,252 M5 . 0,30,217 M6 . 0,15,101 66370 M1 . 255,57,0 M2 . 193,24,0 F3 1 97,12,0 F4 1 208,30,0 M5 . 129,15,0 M6 . 72,9,0 67184 M1 . 0,33,202 M2 . 0,6,57 F3 0 0,42,223 F4 0 0,21,142 M5 . 0,30,181 M6 . 0,33,201 67760 M1 . 0,42,224 M2 . 0,9,77 F3 0 0,45,243 F4 0 0,21,147 M5 . 0,33,205 M6 . 0,12,102 68303 M1 . 0,33,205 M2 . 0,33,236 F3 0 0,33,214 F4 0 0,27,197 M5 . 
0,18,149 M6 . 0,9,72 68618 M1 0/0 0,24,176 M2 0/0 0,30,214 F3 0/0 0,21,159 F4 0/0 0,24,191 M5 0/0 0,15,133 M6 0/0 0,18,133 vcftools_0.1.11/examples/valid-3.3.vcf0000644000000000000000000000311412156354771016141 0ustar rootroot##fileformat=VCFv3.3 ##fileDate=20090805 ##phasing=partial ##test meta ##INFO=NS,1,Integer,"Number of Samples With Data" ##INFO=DP,1,Integer,"Total Depth" ##INFO=AN,1,Integer,"Total number of alleles in called genotypes" ##INFO=AC,-1,Integer,"Allele count in genotypes, for each ALT allele, in the same order as listed" ##INFO=AF,-1,Float,"Allele Frequency" ##INFO=AA,1,String,"Ancestral Allele" ##INFO=DB,0,Flag,"dbSNP membership, build 129" ##INFO=H2,0,Flag,"HapMap2 membership" ##FILTER=q10,"Quality below 10" ##FILTER=s50,"Less than 50% of samples have data" ##FORMAT=GT,1,String,"Genotype" ##FORMAT=GQ,1,Integer,"Genotype Quality" ##FORMAT=DP,1,Integer,"Read Depth" ##FORMAT=HQ,2,Integer,"Haplotype Quality" ##FORMAT=DS,1,Float,"Alternative Allele Dosage" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0\1:3,3 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:-1:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:-1:4 0/2:17:2 1/1:40:3 20 1235237 . T . -1 . . GT 0\0 0|0 ./. X 10 rsTest A T 10 . . GT:DS 0:0.1 0/1:0.5 0|1:0.5 X 11 rsTest2 T A,G 10 q10;s50 . 
GT:DP:GQ 0:3:10 .:5:20 0:3:10 vcftools_0.1.11/examples/contrast.out0000644000000000000000000001544712156354771016443 0ustar rootroot##fileformat=VCFv4.1 ##samtoolsVersion=0.1.18-r572 ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##FILTER= ##INFO= ##INFO= ##FILTER= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D 1 10250 . A C 61 MinMQ DP=271;VDB=0.0265;AF1=0.125;AC1=1;DP4=87,78,18,9;MQ=17;FQ=61;PV4=0.21,1,1,0.1;AN=8;AC=1;NOVELAL=D;NOVELTY=255 GT:DP:SP:GQ 0/0:60:0:99 0/0:32:5:53 0/0:50:2:83 0/1:50:3:62 1 10352 . TACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC 253 MinMQ INDEL;DP=413;VDB=0.0226;AF1=0.8598;AC1=7;DP4=7,17,13,44;MQ=15;FQ=4.35;PV4=0.58,1,1,0.0055;AN=8;AC=7;NOVELAL=D;NOVELTY=6 GT:PL:DP:SP:GQ 1/1:67,6,0:18:2:11 1/1:14,7,0:12:0:12 1/1:111,22,0:23:0:26 0/1:83,0,22:28:2:18 1 17538 . C A 64 MinMQ DP=393;VDB=0.0314;AF1=0.125;AC1=1;DP4=138,205,17,27;MQ=28;FQ=64;PV4=0.87,0.32,1,1;AN=8;AC=1;NOVELAL=D;NOVELTY=29 GT:PL:DP:SP:GQ 0/0:0,152,255:148:1:99 0/0:0,29,227:72:4:34 0/0:0,71,255:86:6:76 0/1:70,0,226:81:4:65 1 28563 . A G 999 MinMQ DP=124;VDB=0.0072;AF1=1;AC1=8;DP4=22,31,27,39;MQ=18;FQ=-3.67;PV4=1,1,1,1;AN=8;AC=8;NOVELAL=D;NOVELTY=1 GT:PL:DP:SP:GQ 1/1:191,6,0:41:1:14 1/1:90,2,0:24:0:11 1/1:213,20,0:31:4:28 1/1:104,0,1:23:0:8 1 28590 . TT TTGGT 116 MinMQ INDEL;DP=112;VDB=0.0233;AF1=0.3933;AC1=3;DP4=5,46,10,16;MQ=19;FQ=54.6;PV4=0.005,1,1,0.00097;AN=8;AC=3;NOVELTY=9;NOVELGT=D GT:PL:DP:SP:GQ 0/1:80,0,2:23:10:8 0/1:9,0,9:15:15:9 0/1:51,0,26:21:2:31 0/0:0,17,39:18:5:16 1 55085 . 
T A 149 MinMQ DP=190;VDB=0.0199;AF1=0.3891;AC1=3;DP4=73,61,13,39;MQ=25;FQ=149;PV4=0.0003,0.35,0.01,1;AN=8;AC=3;NOVELTY=7;NOVELGT=D GT:PL:DP:SP:GQ 0/1:79,0,161:48:4:80 0/1:9,0,146:22:13:10 0/1:68,0,250:49:12:69 0/0:0,7,228:67:12:7 1 58176 . G A 94.7 MinMQ DP=93;VDB=0.0330;AF1=0.3746;AC1=3;DP4=51,13,15,9;MQ=17;FQ=94.7;PV4=0.11,0.0027,1,1;AN=8;AC=3;NOVELTY=18;NOVELGT=D GT:PL:DP:SP:GQ 0/1:30,0,23:22:0:26 0/1:18,0,15:12:7:17 0/1:55,0,102:29:2:56 0/0:0,42,114:25:9:41 1 66507 . T A 999 PASS DP=202;VDB=0.0385;AF1=0.626;AC1=5;DP4=25,14,63,82;MQ=42;FQ=999;PV4=0.03,0.023,1,0.0014;AN=8;AC=5;NOVELTY=20;NOVELGT=D GT:PL:DP:SP:GQ 0/1:255,0,205:42:7:99 0/1:255,0,20:37:12:21 0/1:255,0,155:57:4:99 1/1:255,72,0:48:0:71 1 66521 . TATATAATATA TATATAATATAATATA 999 PASS INDEL;DP=200;VDB=0.0384;AF1=0.3747;AC1=3;DP4=61,75,25,12;MQ=43;FQ=999;PV4=0.016,1,3.8e-20,0.38;AN=8;AC=3;NOVELTY=25;NOVELGT=D GT:PL:DP:SP:GQ 0/1:233,0,255:40:7:99 0/1:25,0,255:32:16:26 0/1:178,0,255:56:3:99 0/0:0,75,255:45:3:74 1 73841 . C T 999 PASS DP=182;VDB=0.0366;AF1=0.3748;AC1=3;DP4=50,64,12,26;MQ=30;FQ=999;PV4=0.25,1.6e-10,0.084,1;AN=8;AC=3;NOVELTY=28;NOVELGT=D GT:PL:DP:SP:GQ 0/1:95,0,255:33:3:96 0/1:174,0,204:27:9:99 0/1:28,0,255:53:17:29 0/0:0,64,255:39:6:63 X 2 . A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1;NOVELTY=255;NOVELGT=D GT:PL:DP 0/1:95,0,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 X 3 . 
A G 89 PASS DP=304;VDB=0.0327;AF1=0.125;AC1=1;DP4=99,171,10,15;MQ=30;FQ=89;PV4=0.83,0.3,0.035,1;AN=8;AC=1;NOVELAL=D;NOVELTY=255 GT:PL:DP 0/0:0,95,255:11 0/0:0,72,255:11 0/0:0,101,255:11 1:255,0:11 vcftools_0.1.11/examples/indel-stats.tab0000644000000000000000000000003012156354771016752 0ustar rootroot1 20 30 1 40 50 1 60 80 vcftools_0.1.11/examples/valid-4.1.vcf0000644000000000000000000000566712156354771016157 0ustar rootroot##fileformat=VCFv4.1 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta ##contig= ##contig= ##contig= ##SAMPLE= ##SAMPLE= ##PEDIGREE= ##PEDIGREE= ##pedigreeDB=url #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 20 1234567 microsat1 GTC G,GTCTC 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 20 2234567 . C [13:123457[ACGC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 2234568 . C .TC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 2234569 . C CT. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 3234569 . C 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 20 4234569 . N .[13:123457[ 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 ./.:40:3 20 5234569 . N [13:123457[. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 Y 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GL 0:0,49 0:0,3 1:41,0 vcftools_0.1.11/examples/indel-stats.vcf0000644000000000000000000000101312156354771016764 0ustar rootroot##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO 1 15 . ACGT A . PASS . 1 15 . A ACGT . PASS . 1 18 . ACGT A . PASS . 1 18 . A ACGT . PASS . 1 25 . ACGT A . PASS . 1 25 . A ACGT . PASS . 1 27 . ACGTA A . PASS . 1 27 . A ACGT . PASS . 1 29 . ACGT A . PASS . 1 29 . A ACGT . PASS . 1 35 . ACGT A . PASS . 1 35 . A ACGT . PASS . 1 38 . ACGT A . PASS . 1 38 . A ACGT . PASS . 1 45 . ACGT A . PASS . 1 45 . A ACGT . PASS . 1 47 . ACGTA A . PASS . 1 47 . A AACGT . PASS . 1 49 . ACGT A . PASS . 1 49 . A ACGT . PASS . vcftools_0.1.11/examples/valid-4.0.vcf0000644000000000000000000000420612156354771016142 0ustar rootroot##fileformat=VCFv4.0 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=1000GenomesPilot-NCBI36 ##phasing=partial ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##FILTER= ##FILTER= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##ALT= ##ALT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 20 1235237 . T . . . . GT 0/0 0|0 ./. X 9 . A T 12.1 . . GT 0 0/1 1/0 X 10 rsTest AC A,ATG 10 PASS . GT 0 0/1 0|2 X 11 rsTest2 T A, 10 q10;s50 . GT:DP:GQ .:3:10 ./. 0|2:3 X 12 . T A 13 . . 
GT 0 1/0 1/1 vcftools_0.1.11/examples/merge-test-c.vcf0000644000000000000000000000143612156354771017042 0ustar rootroot##fileformat=VCFv4.0 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C 1 3062915 . GTTT G 388 . DP=10;DP4=1,2,3,4 GT:GQ:DP 0/1:149:10 1 3106154 . CAAA C 269 . DP=9 GT:GQ:DP 1/1:25:9 1 3157410 . GA G 212 . DP=10 GT:GQ:DP 0/1:52:10 1 3162006 . GAA G 558 . DP=17 GT:GQ:DP 0/1:163:17 1 3177144 . GT G 211 . DP=14 GT:GQ:DP 0/1:151:14 1 3199812 . G GT . . . GT 1/1 2 3212016 . CTT C 613 . DP=11 GT:GQ:DP 0/1:41:11 3 3199815 . G T 353 PASS DP=19 GT:GQ:DP 0/1:188:19 3 3242491 . TT T . . . GT 1/1 4 3291771 . T TAA,TAAA 336 . DP=12 GT:GQ:DP 1/2:2:12 vcftools_0.1.11/website/0000755000000000000000000000000012163074506013657 5ustar rootrootvcftools_0.1.11/website/default.css0000644000000000000000000001117412156354770016027 0ustar rootroot/*############################################################# Name: Transparentia Date: 2006-08-20 Description: Simple, lightweight and slightly blue. Author: Viktor Persson URL: http://templates.arcsin.se Feel free to use and modify but please provide credits. 
#############################################################*/ /* standard elements */ * { margin: 0; padding: 0; } a { color: #36C; } a:hover { color: #06F; } body { background: #EEE url(img/bg.gif); color: #444; /*font: normal 62.5% "Lucida Sans Unicode",sans-serif;*/ font: normal 75% "Lucida Sans Unicode",sans-serif; margin: 0; } input { color: #555; font: normal 1.1em "Lucida Sans Unicode",sans-serif; } p,cite,code { /*font-size: 1.2em;*/ padding-bottom: 1.2em; } h1 { font-size: 1.4em; margin-bottom: 6px; } li { /*font-size: 1.2em;*/ } code { background: url(img/bgcode.gif); border: 1px solid #F0F0F0; border-left: 6px solid #39F; color: #555; display: block; font: normal 1.1em "Lucida Sans Unicode",serif; margin-bottom: 12px; padding: 8px 10px; white-space: pre; } cite { background: url(img/quote.gif) no-repeat; color: #666; display: block; font: normal 1.3em "Lucida Sans Unicode",serif; padding-left: 28px; } h1,h2,h3 { color: #367EA6; padding-top: 6px; } .codebox { background: #EEEEEE; border: 1px solid #DDD; padding: 1px; padding-left: 18px; /*font-size: 1.1em;*/ /*font-size: 12px;*/ margin-bottom: 1.2em; /* Not such a good idea after all: want to have links inside white-space: pre; /* To allow multiple lines without
*/ } .codebox A { text-decoration: none; } /* misc */ .clearer { clear: both; } /* structure */ .container { background: url(img/bgcontainer.gif) repeat-y; margin: 0 auto; padding-bottom: 6px; width: 800px; } .header { background: url(img/header.gif) repeat-x; } /* title */ .title { text-align: center; margin-bottom: 1em; } .title a { color: #FFF; font: normal 3em Verdana,sans-serif; height: 150px; line-height: 150px; text-decoration: none; } /* navigation */ .navigation { margin-bottom: 4px; } .navigation a { background-color: #4A91C3; color: #FFF; float: left; font: bold 1.2em "Trebuchet MS",sans-serif; padding: 8px 0; width: 19%; border-right: 1px solid #FFF; text-align: center; text-decoration: none; } .navigation a:hover { background-color: #000; color: #FFF; } /* main */ .main { clear: both; padding: 8px 18px; } /* main left */ .sidenav h1,.sidenav ul { padding-left: 12px; } .sidenav { background: #EEE; border: 1px solid #E5E5E5; float: left; width: 200px; } .sidenav h1 { color: #666; font-size: 1.2em; height: 20px; margin-top: 1.2em; } .sidenav ul { border-top: 1px solid #FAFAFA; background: url(img/bgul.gif) repeat-x; margin: 0; padding: 0; } .sidenav li { border: 1px solid #FAFAFA; border-top: none; list-style: none; margin: 0; } .sidenav li a { color: #777; display: block; font-size: 0.9em; padding: 3px 6px 3px 14px; text-decoration: none; } .sidenav li a:hover { color: #111; } /* content */ .content { float: right; width: 553px; } .content .item { padding: 6px 12px; border: 1px solid #EEE; background: #FFF; margin-bottom: 8px; } .content .descr { color: #333; margin-bottom: 6px; } .content li { list-style: url(img/li.gif); margin-left: 18px; } /* search form */ form { padding: 0 0 6px 8px; } .styled { border: 1px solid #DDD; padding: 4px; } .button { background: url(img/search.gif) no-repeat left bottom; border: none; height: 27px; width: 27px; } /* footer */ .footer { background: #FFF; border: 1px solid #EEE; color: #666; font-size: 0.5em; margin: 0 
auto; text-align: center; padding: 6px; width: 753px; } .footer a { color: #36C; text-decoration: none; } .footer a:hover { color: #06F; text-decoration: underline; } /* Custom styles */ ul.faq { list-style-type: none; margin: 0px; padding: 0px; } ul.faq li { list-style: none; } .cmd { text-align: center; color: #367EA6; font-style: italic; } .Q { color: #367EA6; font-weight: bold; } .usageBox { position: relative; text-align: right; } .usageToggle { cursor: pointer; color: #367EA6; font-size: smaller; } .usageText { position: absolute; right: 0; background-color: #ddd; padding: 1em; display: none; text-align: left; border: solid #aaa 1px; z-index: 2; font-family: monospace; white-space: pre; } vcftools_0.1.11/website/Makefile0000644000000000000000000000027112156354770015325 0ustar rootroot FILES=docs index license options perl_module specs links htslib install: @for i in $(FILES); do echo "php src/index.php $$i > $$i.html"; php src/index.php $$i > $$i.html; done; vcftools_0.1.11/website/favicon.png0000644000000000000000000000750212156354770016024 0ustar rootroot‰PNG  IHDR‘h6_iCCPICC ProfilexÚ­—i4Õí×Ç÷ïópdž§ŒqÌSÆÌŽy:È”á˜Çc2$2¥D’¤D*!%")"B¦2ER2DB…â ò€ÐéÏH›¢”ù{y$ÒÏ iïzàI!*<úŸ~!H?À:ÿ~ó?B#(D•^"—¢H |AÍEãC{Ž„±b(=°Ä¤ÌœÁ2Ì&ÀÀÑÌEÅmÃSÎû…_C M𥭰šˆ£h X¸xø¡ ?Iw,^ÊHZU+Ë+‡‘#ɯ+Ì*ö+=T¾ª’y8DÕFMQU}KcH³V+çH ¶¹Ž¼.‡¢·¤?hÐhxÑ(çd¬bÂf²aÚoVm~ÒÂÅRÅŠÍjÇzÖ¦Û¶_lwÂÞÇÁ쨌#£ã†Ó°sƒKž+ñ˜…›¬;›â±áù‰0ã5æýÒ§Ã÷¾_•ÿÅ€ŒÀ˜ ¯`›Py¢h˜`¸@ÄÁH±(l´\ŒRìá8µxõãê j‰‡“”’åOH§:y0•;ùM:’þ=c9s6k8»óôƒœª3Eg3r£ÏyçáóõÎËðÒî\X¸8TÔZ\U’w)®Ôã²q™Ü®ròò¯Wç*†¯u]oª¬¹q©*§:ñfHë-óÛw$j9ïRßýU÷µ~¹á㽹ƙûSÆ›F4¿hyö¨­õAÛÝÇUíeO :²ž&> ëô겎ëVí‘xÁÝKÛû³oùåx×@ã`å«â¡sÃ9#Y£™c™¯ÓßœOH™LžJ˜Ž;3÷.y.û}ÉüÝ]§6©—ø—•VL?{®Æ­|¹·>±A½©³•ü­ýjÛp'gwôßžß~#‰ôÏü‡É%()ë¨ÆŒ¶€n#Âr … ÍŒc)`bçå|Àâ1å-à›ô=xE¨OxYd[tOlW|ãТĴä¶MªFú¢Lªl¨œ‹”>»¾+~Sþ½Í7‚ ‚SBBC݈ø0ópÓóHË(›h»‡XÇ8§x§ãN މIødëf)F'µSUÓdO‰¦óf0eRfîf­f¿;=”óô̽³¹ùçRòˆùÇΛ¨Š_`¹WŠF‹[K®_:]vÙ¡LãŠP9]ù« ¯¯õ\TyçFYUnõ‰›Ä÷[–·µîHÕòÜ¥»KªÛª_kX¹·Ô¸xÿÓƒ…¦ù‡sͳ-SÞ´Ž¶ >îmï|ÒÖÑø´æYygaWöó¤îðï½Æ}j/%û¹èö7_}Z^ù2º5¶óz5A>I5E;Mÿ–a†q–íÿœì{£y‡´U 
/>}^¢_–Y±ú¾Z¸Ööeõ«ôFÒæä·#ßolÓîøï>ýźçµß@" Ò(ô² NJUõcš]:,½7¦Œa”‘‘I9‘å>ë*»‡ç®AJ^M¾þ£‚ß…è„ÙD8EYÅ0âhñCËã’ϱõR—¤Ód‚d­åTä¹å÷Þ)¶+]VŽW±;,£J¡:­Ö ž®á¨)¡¹«Õw¤L;LÇP—W÷›^Ÿ~…AŒ¡©ŸÑ:î‰ñ97S¬é¶Y§y®…³¥¸åw«ëR›p[¼ þ‡]¿}…CôQcGÇU§6ç3..®®ÛǺÝJÜC=L=%ô„¯^£ÞM>%¾ÇýœýÕ¸¶_Ý Î µ$*†ñ†Ó„ÿŒX‹œ‹~Ó[WŸy<&Á;Ñ&éH²ä ¶HY>9’Ú’Vq*+=4Ã.S-‹/•ýñtONÍ™œ³Á¹–çdó˜ò¶ò_Ÿo.¸T˜xÁõ¢V_Ñ~ñLIë¥ÒÒ¸ËeJWX®|+¿Ú^Qu-ÿzb¥ï ë*µj¡›t77k&ouÜ®ºs¦6êî±:Ózù{ØÆC÷Elâ{ÈÕÌÚràM+ºõWÛ·Çkí Of;ÆŸ=ëíììj}~¿»¶§òEYoa_ÎËÔþøˆÁÐWÄ¡˜áô‘’ц±¾×ÞìOpLÊO™Mû¼M™)Ÿ}þnû½ê|ú‡©•O…‹[Ë+W•Öº×Ïm\ÞZÿ‘»kµ'L"üÞ}Ê•ŽÂÆkå ZÀœ`E€W¤»–9@r]þì@%`€„@´Á¼ r¡žÂ[ø‰p"jˆ ’Œ\GúM/ ‡ŠDU †Ñ(´Ú}=JFOfDvЬ›CîD~‹¡p¦xHÉA™@9OeNÕL-N]LCO“J³GOû“î=}FÓÄ`ɰ| “QŒ±›É—™Š¹†Å’e›µ’Íž†ýG§ÂÕÃÇãÂ+»Ê×ÈŸ `(È"øñ`³P¾pˆˆ¹¨¬›I|ùШÄ#É2l’”‹´† Ÿ,ZvEî|·B³b­R•r…ÊåÃEªçÔÒÔ£4|4íµpG4´t°ºâzbúâXC# œ‘±½‰¯éq³|óÛ=–‹Ö462¶öø»Zû™£ŒŽFNÉÎÍ.ß)ºE¹7{8¯ïy_E¿,ÿ¹@ âàíPbc8C„_dG4OLlìx¼Úñ²Dò¤Ðä©““-iØS¥Ô™Y“§urªÏ2æ&œ[Îw>ß_¨{¡¥H±¸î’Li}™Ê•ö«¦“×+IUnÊÔôÝö¯¥¹[[o×@j¬~`Õ´Ùœ÷H¢õécÇö޳Ï$;{Ÿõ0¼¸ßçܨze<ôq$~Œæõ…qÁ‰;SêÓ½3γ«s)ó¬j´> ,9.O}Ưv}‘_/Ý Û Úù®ö£|‡|×ÿçÀžÂ~‰ôwþ r ¶àÇ!nÁsxÂh#$¹ƒŒ!û(quUšC3£ ÑÇÑõè%2!2w²r²äÒäIäÃ’™‹”¦” TFÁOÑEÉVÙ\Åø0N§†S××ÐÔTÐ?§ͪƒÑ¥Ñ£Ô§4 6ıâø%MÔLÍÌÜÍc-ò-ë¬^YoØràµíí‹zî9)8¹T¹~pv÷ñ¨ñüê¥îé3î‡õO ˜ Ò ¾Š&ú‡ DÈEæE}‰1‹­‰§=’0’¤ž|#…ùäÉÔ­S~éo3­²žœÆæœù•K8÷2_å|e!ë…ô‹»Åa%K¥^—ß_ñ,_¨¹¶S™QÅ^]S£}küNÄ]ƺú»{?ïW4™<\k9Ó*ÚÖÒnúdúiH'º«¸[¾§¿7à%MÿíAËWëÃ9£c·ÞÈŽ×MJMÝz+1Só;W7¯ôáÑ‚î§çKË­Ÿ…V×ÖÙ¿:m\Þœû&þ=òG×÷nôϱ=•ý’æO€ Ä@pà DÈ„«Ð“°ƒp!šˆ’‰Ô!(r”<Ê•‡z†úÆ¢ èRôk2f2k²ódäÂä‘ä=‚‰o)PÞ :@•HµFíI=Aƒ§¦µ¥}CçF·BÀ©bÐc˜?Á(Ã8͔ìżÁRÅêÊÆÊ6Ä~ŽÏÉÍù‰«‰û4W›O€ŸŒQ _°îày¡ác"8Qe1qq¾C쬒ìX)QiyMYC9cycœ"NÉLÙFÅåp€j¼ÚYõJvÍ)­]mvY]=}_ƒXÃÓFå¸fãQ“¯fLæJΖiVuÖ³¶ŒxC»$û‡mGU§$ç.W†c.nw<ÀÓPçMçàÛë/p6p3Ø)¤“(VÁ™ŽIŠÝ‹OJ@S“Q'’R~¤†¦-¦{eÌgyg/å„ù•›•Ç•»@«pàâÑ¢©’£—†/›•õ”\íº†»Þ_5s3°fçvv-Ïݺzý†ÉÆð´MUÍú-ó­§‹·tD?èì{ÙÃñâ^ŸáËWvƒo†‡§GÝÆ>¼ ÿ:?~{z–á]öÜμӇúä“ùbÁÒÔŠàgßÕ[kkëÒ_½7r6«·n~+ýÿÃbûÀöÈNê®Â鿤~ ïEísì·\I$€ßÿ% ÷7÷ ÀYÁAQ<™€.ÄÓ¨`‘i…ÿÃѶØÇßÐø{yè›þáx?=‹¿9á†68ÀÃÄê{‡ØÙþõ²2û÷.Ý¿ùÞ¶ÿzâþpx”Ý 5ý›ïå­ÿ·¶ ‹¿žþ‘Æë0 üþgP(½ð¬pûÄÿîK¤wl$€^(1.Üß×/R@‡H òÆ ‡¤±ò²²ªð?éÞ,ªpêæW pHYs  šœtIMEÚ  2HÖä vIDAT(Ïcüÿÿ?)€‰DÀg™×/Ç£îdc$Š øU#+`"F5²F³ºe${î>döÉÆH46„˂ˋÀè~€è†Kcú•€ B n 
fp!{ªS®Óˆ“‘Œ´DLT@ŒfÂe5.·1Ò<µ-ÞGFÎ)áåIEND®B`‚vcftools_0.1.11/website/src/0000755000000000000000000000000012163074506014446 5ustar rootrootvcftools_0.1.11/website/src/index.php0000644000000000000000000000465212156354770016303 0ustar rootroot 'VCFtools', 'perl_module' => 'VCFtools: Perl tools and API', 'htslib' => 'VCFtools: htslib VCF commands', 'docs' => 'VCFtools Documentation', 'license' => 'VCFtools License', 'specs' => 'VCF Specification', 'links' => 'VCF Links', 'options' => 'vcftools Options', ); if (isset($argc)) { $_GET['pg']=$argv[1]; } $path = array_key_exists('pg',$_GET) ? validate_path($_GET['pg']) : 'index'; $title = array_key_exists($path,$titles) ? $titles[$path] : $titles['index']; ?> <?php echo $title; ?>
vcftools_0.1.11/website/src/perl_module.inc0000644000000000000000000004413612156354770017466 0ustar rootroot

The Perl modules and scripts

VCFtools contains a Perl API (Vcf.pm) and a number of Perl scripts that can be used to perform common tasks with VCF files such as file validation, file merging, intersecting, complements, etc. The Perl tools support all versions of the VCF specification (3.2, 3.3, 4.0 and 4.1), nevertheless, the users are encouraged to use the latest versions VCFv4.0 or VCFv4.1. The VCFtools in general have been used mainly with diploid data, but the Perl tools aim to support polyploid data as well.

Run any of the Perl scripts with the --help switch to obtain more help. Note that the PERL5LIB environment variable must contain the path to your VCFtools installation in order for the scripts to work.

export PERL5LIB=/path/to/your/vcftools-directory/

Many of the Perl scripts require that the VCF files are compressed by bgzip and indexed by tabix (both tools are part of the tabix package, available for download here). The VCF files can be compressed and indexed using the following commands

bgzip my_file.vcf
tabix -p vcf my_file.vcf.gz

The tools

fill-aa

Fill in ancestral alleles.

zcat file.vcf.gz | fill-aa -a ancestral-alleles.fa.gz | bgzip -c > out.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
fill-an-ac

Fill or recalculate AN and AC INFO fields.

zcat file.vcf.gz | fill-an-ac | bgzip -c > out.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
fill-fs

Annotates the VCF file with flanking sequence (INFO/FS tag) masking known variants with N's. Useful for designing primers.

fill-fs -r /path/to/refseq.fa | vcf-query '%CHROM\t%POS\t%INFO/FS\n' > out.tab

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
fill-ref-md5

Fill missing reference info and sequence MD5s into VCF header.

fill-ref-md5 -i "SP:Homo\ Sapiens" -r ref.fasta in.vcf.gz -d ref.dict out.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
fill-rsIDs

Fill missing rsIDs. This script has been discontinued, please use vcf-annotate instead.

vcf-annotate

The script adds or removes filters and custom annotations to VCF files. To add custom annotations to VCF files, create TAB delimited file with annotations such as

#CHR FROM TO ANNOTATION 1 12345 22345 gene1 1 67890 77890 gene2

Compress the file (using bgzip annotations), index (using tabix -s 1 -b 2 -e 3 annotations.gz) and run

cat in.vcf | vcf-annotate -a annotations.gz \
   -d key=INFO,ID=ANN,Number=1,Type=Integer,Description='My custom annotation' \
   -c CHROM,FROM,TO,INFO/ANN > out.vcf

The script is also routinely used to apply filters. There are a number of predefined filters and custom filters can be easily added, see vcf-annotate -h for examples. Some of the predefined filters take advantage of tags added by bcftools, the descriptions of the most frequently asked ones follow:

Strand Bias .. Tests if variant bases tend to come from one strand. Fisher's exact test for 2x2 contingency table where the row variable is being the reference allele or not and the column variable is strand. Two-tail P-value is used.
End Distance Bias .. Tests if variant bases tend to occur at a fixed distance from the end of reads, which is usually an indication of misalignment. (T-test)
Base Quality Bias .. Tests if variant bases tend to occur with a quality bias (T-test). This filter is by default effectively disabled as it is set to 0.
(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
(Read even more)
vcf-compare

Compares positions in two or more VCF files and outputs the numbers of positions contained in one but not the other files; two but not the other files, etc, which comes handy when generating Venn diagrams. The script also computes numbers such as nonreference discordance rates (including multiallelic sites), compares actual sequence (useful when comparing indels), etc.

vcf-compare -H A.vcf.gz B.vcf.gz C.vcf.gz


Note: a fast htslib C version of this tool is now available (see vcf check).
(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-concat

Concatenates VCF files (for example split by chromosome). Note that the input and output VCFs will have the same number of columns, the script does not merge VCFs by position (see also vcf-merge).

In the basic mode it does not do anything fancy except for a sanity check that all files have the same columns. When run with the -s option, it will perform a partial merge sort, looking at limited number of open files simultaneously.

vcf-concat A.vcf.gz B.vcf.gz C.vcf.gz | gzip -c > out.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-consensus

Apply VCF variants to a fasta file to create consensus sequence.

cat ref.fa | vcf-consensus file.vcf.gz > out.fa

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-convert

Convert between VCF versions, currently from VCFv3.3 to VCFv4.0.

zcat file.vcf.gz | vcf-convert -r reference.fa > out.vcf

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-contrast

A tool for finding differences between groups of samples, useful in trio analysises, cancer genomes etc.

In the example below variants with average mapping quality of 30 (-f MinMQ=30) and minimum depth of 10 (-d 10) are considered. Only novel alleles are reported (-n). Then vcf-query is used to extract the INFO/NOVEL* annotations into a table. Finally the sites are sorted by confidence of the site being different in the child (-k5,5nr).

vcf-annotate -f MinMQ=30 file.vcf | vcf-contrast -n +Child -Mother,Father -d 10 -f | vcf-query -f '%CHROM %POS\t%INFO/NOVELTY\t%INFO/NOVELAL\t%INFO/NOVELGT[\t%SAMPLE %GTR %PL]\n' | sort -k3,3nr | head

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-filter

Please take a look at vcf-annotate which does what you are looking for. Apologies for the non-intuitive naming. Note: a fast htslib C version of a filtering tool with somewhat different capabilities is now available (see vcf filter).

vcf-fix-ploidy

Fixes diploid vs haploid genotypes on sex chromosomes, including the pseudoautosomal regions.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-indel-stats

Calculate in-frame ratio.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-isec

Creates intersections and complements of two or more VCF files. Given multiple VCF files, it can output the list of positions which are shared by at least N files, at most N files, exactly N files, etc. The first example below outputs positions shared by at least two files and the second outputs positions present in the files A but absent from files B and C.

vcf-isec -n +2 A.vcf.gz B.vcf.gz | bgzip -c > out.vcf.gz
vcf-isec -c A.vcf.gz B.vcf.gz C.vcf.gz | bgzip -c > out.vcf.gz


Note: a fast htslib C version of this tool is now available.
(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-merge

Merges two or more VCF files into one so that, for example, if two source files had one column each, on output will be printed a file with two columns. See also vcf-concat for concatenating VCFs split by chromosome.

vcf-merge A.vcf.gz B.vcf.gz C.vcf.gz | bgzip -c > out.vcf.gz

Note that this script is not intended for concatenating VCF files. For this, use vcf-concat instead.
Note: a fast htslib C version of this tool is now available.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-phased-join

Concatenates multiple overlapping VCFs preserving phasing.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-query

Powerful tool for converting VCF files into format defined by the user. Supports retrieval of subsets of positions, columns and fields.

vcf-query file.vcf.gz 1:10327-10330
vcf-query file.vcf -f '%CHROM:%POS %REF %ALT [ %DP]\n'

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
Note: a fast htslib C version of this tool is now available (see vcf query).
vcf-shuffle-cols

Reorder columns

vcf-shuffle-cols -t template.vcf.gz file.vcf.gz > out.vcf

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-sort

Sort a VCF file.

vcf-sort file.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-stats

Outputs some basic statistics: the number of SNPs, indels, etc.

vcf-stats file.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-subset

Remove some columns from the VCF file.

vcf-subset -c NA0001,NA0002 file.vcf.gz | bgzip -c > out.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-tstv

A lightweight script for quick calculation of Ts/Tv ratio.

cat file.vcf | vcf-tstv

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-to-tab

A simple script which converts the VCF file into a tab-delimited text file listing the actual variants instead of ALT indexes.

zcat file.vcf.gz | vcf-to-tab > out.tab

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf-validator

vcf-validator file.vcf.gz

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
Vcf.pm

For examples how to use the Perl API, it is best to look at some of the simpler scripts, for example vcf-to-tab. The detailed documentation can be obtained by running

perldoc Vcf.pm

(Read more)
&1",$out); 
    array_splice($out,0,4);array_splice($out,-1,1); echo implode("\n",$out); ?>
vcftools_0.1.11/website/src/htslib.inc0000644000000000000000000001005512156354770016435 0ustar rootroot

The htslib VCF commands

HTSlib is a C library for high-throughput sequencing data formats. It is designed for speed and works with both VCF and BCFv2.

Download and installation

The library is hosted on github. It can be downloaded and compiled the usual way:

# Download. The first command is run only once. The second
# is run whenever the latest snapshot from github is needed.
git clone git://github.com/samtools/htslib.git htslib
git pull

# Compile
cd htslib
make git-stamp

# Run
htscmd vcfcheck file.vcf.gz

# Symlinking the executable as 'vcf' allows terse execution of VCF commands:
ln -s htscmd vcf
vcf check file.vcf.gz

The tools

vcf check

Extract stats from a VCF/BCF file or compare two VCF/BCF files. The resulting text file can be plotted using plot-vcfcheck

vcf check file.vcf.gz > file.vchk
plot-vcfcheck file.vchk -p plots/

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf filter

Filtering by both supervised and unsupervised learning.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf gtcheck

A tool for detecting sample swaps and contaminations

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf isec

Fast alternative to vcf-isec

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf merge

Fast alternative to vcf-merge

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf norm

Left-align and normalize indels to the shortest possible representation.

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcf query

Fast alternative to vcf-query

(Read more)
&1",$out); echo htmlspecialchars(implode("\n",$out)); ?>
vcftools_0.1.11/website/src/index.inc0000644000000000000000000000421612156354770016261 0ustar rootroot

Welcome to VCFtools

Welcome to VCFtools - a program package designed for working with VCF files, such as those generated by the 1000 Genomes Project. The aim of VCFtools is to provide methods for working with VCF files: validating, merging, comparing and calculate some basic population genetic statistics.

Supported VCF versions

VCFtools supports the VCF format v4.0. The vcf-validator, Perl API and scripts now support also VCF format v4.1 and maintain backward compatibility with older versions.
For details, please go to the Documentation page.

Mailing List

Anything VCF or VCFtools related may be discussed on the project's mailing list.

Download

The latest stable release can be downloaded from here:

https://sourceforge.net/projects/vcftools/files/

The latest development version can be retrieved by running the following command:

svn checkout https://svn.code.sf.net/p/vcftools/code vcftools

The above command is required to be run only once, for any subsequent updates run this command from the vcftools directory:

svn update

How to use

The VCFtools package includes a set of tools for
  • validating
  • comparing
  • merging
  • annotating
  • creating intersections and subsets
  • ...
For details, please go to the Documentation page.
vcftools_0.1.11/website/src/license.inc0000644000000000000000000000161612156354770016575 0ustar rootroot

License

The program package is released under the GNU Lesser General Public License version 3.0 (LGPLv3).

Citing VCFtools

If you make use of VCFtools in your research, we would appreciate a citation of the following paper:

The Variant Call Format and VCFtools, Petr Danecek, Adam Auton, Goncalo Abecasis, Cornelis A. Albers, Eric Banks, Mark A. DePristo, Robert Handsaker, Gerton Lunter, Gabor Marth, Stephen T. Sherry, Gilean McVean, Richard Durbin and 1000 Genomes Project Analysis Group, Bioinformatics, 2011

Authors

The program package is maintained and developed by

  • Adam Auton (C++ Module)
  • Petr Danecek (Perl Module, HTSlib)
vcftools_0.1.11/website/src/docs.inc0000644000000000000000000001753712156354770016114 0ustar rootroot

Documentation

The VCFtools package is broadly split into two sections:

  • The vcftools binary program, generally used to analyse VCF files.
  • The Vcf.pm perl module, which is a general Perl API containing a core of the utilities vcf-convert, vcf-merge, vcf-compare, vcf-isec, and others.

Documentation

Examples of usage by topic

Installation

The VCFtools package can be decompressed by the command

tar -xzf vcftools_version_number_source.tar.gz

To build the vcftools executable, type "make" in the vcftools folder.

The Perl scripts require that VCF files are compressed by bgzip and indexed by tabix (both tools are part of the tabix package, available for download here). Both tools must be in directories that are listed in the PATH environment variable. For running the Perl scripts, the PERL5LIB environment variable must be set to include the Vcf.pm module

export PERL5LIB=/path/to/your/vcftools-directory/perl

The tools can be tested by running the script

/path/to/your/vcftools-directory/perl/test.t

If the command complains about missing Test::Most perl module, do not worry, it is needed only for testing, not for running VCFtools.
Annotating

# Add custom annotations
cat in.vcf | vcf-annotate -a annotations.gz \
   -d key=INFO,ID=ANN,Number=1,Type=Integer,Description='My custom annotation' \
   -c CHROM,FROM,TO,INFO/ANN > out.vcf

# Apply SnpCluster filter
cat in.vcf | vcf-annotate --filter SnpCluster=3,10 > out.vcf

Comparing

vcf-compare A.vcf.gz B.vcf.gz C.vcf.gz
vcf check A.vcf.gz B.vcf.gz

Concatenating

vcf-concat A.vcf.gz B.vcf.gz C.vcf.gz | bgzip -c > out.vcf.gz

Converting

# Convert between VCF versions
zcat file.vcf.gz | vcf-convert -r reference.fa | bgzip -c > out.vcf.gz

# Convert from VCF format to tab-delimited text file
zcat file.vcf.gz | vcf-to-tab > out.tab

Filtering

# Filter by QUAL and minimum depth
vcf-annotate --filter Qual=10/MinDP=20

Intersections, complements

# Include positions which appear in at least two files
vcf-isec -o -n +2 A.vcf.gz B.vcf.gz C.vcf.gz | bgzip -c > out.vcf.gz

# Exclude from A positions which appear in B and/or C
vcf-isec -c A.vcf.gz B.vcf.gz C.vcf.gz | bgzip -c > out.vcf.gz

# Fast htslib implementation: vcf isec -n =2 A.vcf.gz B.vcf.gz

Merging

vcf-merge A.vcf.gz B.vcf.gz | bgzip -c > C.vcf.gz
vcf merge A.vcf.gz B.vcf.gz

Querying

vcf-query file.vcf.gz 1:10327-10330 -c NA0001

Reordering columns

vcf-shuffle-cols -t template.vcf.gz file.vcf.gz > out.vcf

Stats

vcf-stats file.vcf.gz
vcf check file.vcf.gz > file.vchk && plot-vcfcheck file.vchk -p plot/

Stripping columns

vcf-subset -c NA0001,NA0002 file.vcf.gz | bgzip -c > out.vcf.gz

Useful shell one-liners

This section lists some useful one-line commands. Note that there are also dedicated convenience scripts vcf-sort and vcf-concat which do the same but also perform some basic sanity checks. All examples are in BASH.

# Replace VCF header. The file must be compressed by bgzip.
tabix -r header.txt in.vcf.gz > out.vcf.gz

# Sort VCF file keeping the header. The head command is for performance.
(zcat file.vcf.gz | head -100 | grep ^#;
zcat file.vcf.gz | grep -v ^# | sort -k1,1d -k2,2n;) \
| bgzip -c > out.vcf.gz

# Merge (that is, concatenate) two VCF files into one, keeping the header
# from first one only.
(zcat A.vcf.gz | head -100 | grep ^#; \
zcat A.vcf.gz | grep -v ^#; \
zcat B.vcf.gz | grep -v ^#; ) \
| bgzip -c > out.vcf.gz

VCF validation

Both vcftools and Vcf.pm can be used for validation. The first validates VCFv4.0, the latter is able to validate the older versions as well.

perl -MVcf -e validate example.vcf
perl -I/path/to/the/module/ -MVcf -e validate example.vcf
vcf-validator example.vcf

...and more

This page gives just a list of basic capabilities. For more, please go to the vcftools's options page and the Perl API and scripts page.

vcftools_0.1.11/website/src/specs.inc0000644000000000000000000000276112156354770016272 0ustar rootroot

VCF (Variant Call Format) specification


  • VCF paper
  • VCF poster
  • VCFtools-spec - Low traffic mailing list intended for VCF format related discussions, such as clarifications of the current format version or proposals of changes to the specification.

  • Binary Call Format (BCF) - Binary format designed for efficient storing and parsing of VCF records. BCF has been recently standardized and implemented in tools such as htslib (C library for high-throughput sequencing data formats) and GATK.
vcftools_0.1.11/website/src/options.inc0000644000000000000000000007312112156354770016646 0ustar rootroot

vcftools: Usage and Options

The vcftools program is intended for analysis of diploid SNP data in VCF format. The program is run from the command line, and the interface is inspired by PLINK, and so should be largely familiar to users of that package. Commands take the following form:

vcftools --vcf file1.vcf --chr 20 --freq

The above command tells vcftools to read in the file file1.vcf, extract sites on chromosome 20, and calculate the allele frequency at each site. The resulting allele frequency estimates are stored in the output file, out.freq. As in the above example, output from vcftools is mainly sent to output files, as opposed to being shown on the screen.

If you want to output a new VCF file, you need the --recode option. For example, the command may look something like:

vcftools --vcf file1.vcf --chr 20 --out chr20 --recode

This command tells vcftools to extract chromosome 20 from file1.vcf and generate a new VCF file, which will be called chr20.recode.vcf.

A description of each of the available options follows. Note that some commands may only be available in the latest version of vcftools. To obtain the latest version, you should use SVN to checkout the latest code, as described on the home page.

Basic Options
Site Filter Options
Individual Filters
Genotype Filters
Output Statistics
Output in Other Formats
Miscellaneous
File Comparison
Options still in development

Basic Options

  • --vcf <filename>

  • This option defines the VCF file to be processed. The files need to be decompressed prior to use with vcftools. vcftools expects files in VCF format v4.0, a specification of which can be found here. Version v4.1 is also supported, with some small limitations.

  • --gzvcf <filename>

  • This option can be used in place of the --vcf option to read compressed (gzipped) VCF files directly.

  • --bcf <filename>

  • This option can be used in place of the --vcf option to read BCF files directly. You do not need to specify if this file is compressed with BGZF encoding. Due to differences in current BCF formatting, you may need to also include the --gatk tag to signify that your BCF file came from that program.

  • --out <prefix>

    This option defines the output filename prefix for all files generated by vcftools. For example, if <prefix> is set to output_filename, then all output files will be of the form output_filename.*** . If this option is omitted, all output files will have the prefix 'out.'.

Site Filter Options

  • --chr <chromosome>
  • --not-chr <chromosome>

    Include or exclude chromosomes with identifiers matching <chromosome>
    . These options can be used more than once to include or exclude multiple chromosomes.
  • --from-bp <integer>
  • --to-bp <integer>

    These options define the physical range of sites that will be processed. Sites outside of this range will be excluded. These options can only be used in conjunction with --chr.

  • --snp <string>

    Include SNP(s) with matching ID (e.g. a dbSNP rsID). This command can be used multiple times in order to include more than one SNP.

  • --snps <filename>

    Include a list of SNPs given in a file. The file should contain a list of SNP IDs (e.g. dbSNP rsIDs), with one ID per line.

  • --exclude <filename>

    Exclude a list of SNPs given in a file. The file should contain a list of SNP IDs, with one ID per line.

  • --positions <filename>
  • --exclude-positions <filename>

    Include/exclude a set of sites on the basis of a list of positions in a file. Each line of the input file should contain a (tab-separated) chromosome and position. The file can have comment lines that start with a '#'.

  • --keep-only-indels
  • --remove-indels

    Include or exclude sites that contain an indel. For this option 'indel' means any variant that alters the length of the REF allele.

  • --bed <filename>
  • --exclude-bed <filename>

    Include or exclude a set of sites on the basis of a BED file. Only the first three columns (chrom, chromStart and chromEnd) are required. The BED file should have a header line.

  • --remove-filtered-all
  • --remove-filtered <string>
  • --keep-filtered <string>

    These options are used to filter sites on the basis of their FILTER flag. The first option removes all sites with a FILTER flag. The second option can be used to exclude sites with a specific filter flag. The third option can be used to select sites on the basis of specific filter flags. The second and third options can be used multiple times to specify multiple FILTERs. The --keep-filtered option is applied before the --remove-filtered option. NOTE: The PASS tag does not count as a filter, it is just the absence of a filter. If you would like to keep only the sites that pass all filters use the --remove-filtered-all option.

  • --remove-INFO <string>
  • --keep-INFO <string>

    These options are used to filter sites on the basis of INFO field flags. The first option can be used to exclude sites with a specific INFO flag. The second option can be used to select sites on the basis of specific INFO flags. These options can be used multiple times to specify multiple INFO flags. The --keep-INFO option is applied before the --remove-INFO option. Note that only INFO flags can currently be used as filters (i.e. there is currently no support for filtering by INFO field values).

  • --minQ <float>

    Include only sites with Quality above this threshold.

  • --min-meanDP <float>
  • --max-meanDP <float>

    Include sites with mean Depth within the thresholds defined by these options.

  • --maf <float>
  • --max-maf <float>

    Include only sites with Minor Allele Frequency within the specified range.

  • --non-ref-af <float>
  • --max-non-ref-af <float>

    Include only sites with all Non-Reference Allele Frequencies within the specified range.

  • --mac <int>
  • --max-mac <int>

    Include only sites with Minor Allele Count within the specified range.

  • --non-ref-ac <float>
  • --max-non-ref-ac <float>

    Include only sites with all Non-Reference Allele Counts within the specified range.

  • --hwe <float>

    Assesses sites for Hardy-Weinberg Equilibrium using an exact test, as defined by Wigginton, Cutler and Abecasis (2005). Sites with a p-value below the threshold defined by this option are taken to be out of HWE, and therefore excluded.

  • --geno <float>

    Exclude sites on the basis of the proportion of missing data (defined to be between 0 and 1, where 1 indicates no missing data allowed).

  • --max-missing-count <int>

    Exclude sites with more than this number of missing chromosomes.

  • --min-alleles <int>
  • --max-alleles <int>

    Include only sites with a number of alleles within the specified range. For example, to include only bi-allelic sites, one could use:

    vcftools --vcf file1.vcf --min-alleles 2 --max-alleles 2



  • --thin <int>

    Thin sites so that no two sites are within the specified distance.

  • --mask <filename>
  • --invert-mask <filename>
  • --mask-min <int>

    Include sites on the basis of a FASTA-like file. The provided file contains a sequence of integer digits (between 0 and 9) for each position on a chromosome that specify if a site at that position should be filtered or not. An example mask file would look like:

    >1
    0000011111222...

    In this example, sites in the VCF file located within the first 5 bases of the start of chromosome 1 would be kept, whereas sites at position 6 onwards would be filtered out. The threshold integer that determines if sites are filtered or not is set using the --mask-min option, which defaults to 0. The chromosomes contained in the mask file must be sorted in the same order as the VCF file. The --mask option is used to specify the mask file to be used, whereas the --invert-mask option can be used to specify a mask file that will be inverted before being applied.

Individual Filters

  • --indv <string>

    Specify an individual to be kept in the analysis. This option can be used multiple times to specify multiple individuals.

  • --keep <filename>

    Provide a file containing a list of individuals to include in subsequent analysis. Each individual ID (as defined in the VCF headerline) should be included on a separate line.

  • --remove-indv <string>

    Specify an individual to be removed from the analysis. This option can be used multiple times to specify multiple individuals. If the --indv option is also specified, then the --indv option is executed before the --remove-indv option.

  • --remove <filename>

    Provide a file containing a list of individuals to exclude in subsequent analysis. Each individual ID (as defined in the VCF headerline) should be included on a separate line. If both the --keep and the --remove options are used, then the --keep option is execute before the --remove option.

  • --min-indv-meanDP <float>
  • --max-indv-meanDP <float>

    Calculate the mean coverage on a per-individual basis. Only individuals with coverage within the range specified by these options are included in subsequent analyses.

  • --mind <float>

    Specify the minimum call rate threshold for each individual.

  • --phased

    First excludes all individuals having all genotypes unphased, and subsequently excludes all sites with unphased genotypes. The remaining data therefore consists of phased data only.

  • --max-indv <int>

    Randomly thins individuals so that only the specified number are retained.

Genotype Filters

  • --remove-filtered-geno-all
  • --remove-filtered-geno <string>

    The first option removes all genotypes with a FILTER flag not equal to PASS or '.'. The second option can be used to exclude genotypes with a specific filter flag.

  • --minGQ <float>

    Exclude all genotypes with a quality below the threshold specified by this option (GQ).

  • --minDP <float>
  • --maxDP <float>

    Exclude all genotypes with a sequencing depth (DP) outside the range defined by these options.

Output Statistics

  • --freq
  • --counts
  • --freq2
  • --counts2

    Output per-site frequency information. The --freq outputs the allele frequency in a file with the suffix '.frq'. The --counts option outputs a similar file with the suffix '.frq.count', that contains the raw allele counts at each site.
    The --freq2 and --counts2 options are used to suppress allele information in the output file. In this case, the order of the freqs/counts depends on the numbering in the VCF file.
    With any of these options, adding the --derived command will re-order the output file columns so that the ancestral allele appears first. The --derived option relies on the ancestral allele being specified in the VCF file using the AA tag in the INFO field.

  • --depth

    Generates a file containing the mean depth per individual. This file has the suffix '.idepth'.

  • --site-depth
  • --site-mean-depth

    Generates a file containing the depth per site. The --site-depth option outputs the depth for each site summed across individuals. This file has the suffix '.ldepth'. Likewise, the --site-mean-depth outputs the mean depth for each site, and the output file has the suffix '.ldepth.mean'.

  • --geno-depth

    Generates a (possibly very large) file containing the depth for each genotype in the VCF file. Missing entries are given the value -1. The file has the suffix '.gdepth'.

  • --site-quality

    Generates a file containing the per-site SNP quality, as found in the QUAL column of the VCF file. This file has the suffix '.lqual'.

  • --het

    Calculates a measure of heterozygosity on a per-individual basis. Specifically, the inbreeding coefficient, F, is estimated for each individual using a method of moments. The resulting file has the suffix '.het'.

  • --hardy

    Reports a p-value for each site from a Hardy-Weinberg Equilibrium test (as defined by Wigginton, Cutler and Abecasis (2005)). The resulting file (with suffix '.hwe') also contains the Observed numbers of Homozygotes and Heterozygotes and the corresponding Expected numbers under HWE. 

  • --missing

    Generates two files reporting the missingness on a per-individual and per-site basis. The two files have suffixes '.imiss' and '.lmiss' respectively.

  • --hap-r2
  • --geno-r2
  • --geno-chisq
  • --ld-window <int>
  • --ld-window-bp <int>
  • --min-r2 <float>

    These options are used to report Linkage Disequilibrium (LD) statistics as summarised by the r2, D, D', or Chi-squared statistics. The --hap-r2 option informs vcftools to output a file reporting the r2, D, and D' statistics using phased haplotypes. These are the traditional measures of LD often reported in the population genetics literature. If phased haplotypes are unavailable then the --geno-r2 option may be used, which calculates the squared correlation coefficient between genotypes encoded as 0, 1 and 2 to represent the number of non-reference alleles in each individual. This is the same as the LD measure reported by PLINK. The D and D' statistics are only available for phased genotypes. If your data contains sites with more than two alleles, then the --geno-chisq option can be used to test for genotype independence via the chi-squared statistic. The haplotype version outputs a file with the suffix '.hap.ld', whereas the genotype version outputs a file with the suffix '.geno.ld'. The haplotype version implies the option --phased.

    The --ld-window option defines the maximum number of SNPs between the SNPs being tested for LD. Likewise, the --ld-window-bp option can be used to define the maximum physical separation (in base-pairs) of SNPs included in the LD calculation. Finally, the --min-r2 sets a minimum value for r2 below which the LD statistic is not reported.

  • --SNPdensity <int>

    Calculates the number and density of SNPs in bins of size defined by this option. The resulting output file has the suffix '.snpden'.

  • --TsTv <int>

    Calculates the Transition / Transversion ratio in bins of size defined by this option. Only uses bi-allelic SNPs. The resulting output file has the suffix '.TsTv'. A summary is also supplied in a file with the suffix '.TsTv.summary'.

  • --TsTv-by-count

    Calculates the Transition / Transversion ratio as a function of alternative allele count. Only uses bi-allelic SNPs. The resulting output file has the suffix '.TsTv.count'.

  • --TsTv-by-qual

    Calculates the Transition / Transversion ratio as a function of SNP quality threshold. Only uses bi-allelic SNPs. The resulting output file has the suffix '.TsTv.qual'.

  • --FILTER-summary

    Generates a summary of the number of SNPs and Ts/Tv ratio for each FILTER category. The output file has the suffix '.FILTER.summary'.

  • --filtered-sites

    Creates two files listing sites that have been kept or removed after filtering. The first file, with suffix '.kept.sites', lists sites kept by vcftools after filters have been applied. The second file, with the suffix '.removed.sites', list sites removed by the applied filters.

  • --singletons

    This option will generate a file detailing the location of singletons, and the individual they occur in. The file reports both true singletons, and private doubletons (i.e. SNPs where the minor allele only occurs in a single individual and that individual is homozygous for that allele). The output file has the suffix '.singletons'.

  • --site-pi
  • --window-pi <int> (--window-pi-step <int>)

    These options are used to estimate levels of nucleotide diversity. The first option does this on a per-site basis, and the output file has the suffix '.sites.pi'. The second option calculates the nucleotide diversity in windows, with the window size defined in the option argument, and an optional window step.

  • --hist-indel-len

    This option will generate a histogram file of the length of all indels (including SNPs). It shows both the count and the percentage of all indels for indel lengths that occur at least once in the input file. SNPs are considered indels with length zero. The output file has the suffix '.indel.hist'.

  • --TajimaD <int>

    Output Tajima's D statistic in bins of size <int>.

  • --hapmap-fst-pop <filename>
  • --weir-fst-pop <filename>
  • --fst-window-size <int>
  • --fst-window-step <int>

    These options are used to calculate two estimates of FST. The --hapmap-fst-pop option is used to calculate the estimator described in the Phase I HapMap paper. The second (preferred) estimator is from Weir and Cockerham's 1984 paper. These options are used to provide a file which lists the individuals in a given population (one individual per line), and can be used multiple times to specify multiple populations. These options can be used to calculate FST for more than two populations.

    The --fst-window-size and --fst-window-step options are optional and can be used to calculate FST in windows. If they are not specified, FST is calculated on a per-site basis.

Output in Other Formats

  • --012

    This option outputs the genotypes as a large matrix. Three files are produced. The first, with suffix '.012', contains the genotypes of each individual on a separate line. Genotypes are represented as 0, 1 and 2, where the number represent that number of non-reference alleles. Missing genotypes are represented by -1. The second file, with suffix '.012.indv' details the individuals included in the main file. The third file, with suffix '.012.pos' details the site locations included in the main file.

  • --IMPUTE

    This option outputs phased haplotypes in IMPUTE reference-panel format. As IMPUTE requires phased data, using this option also implies --phased. Unphased individuals and genotypes are therefore excluded. Only bi-allelic sites are included in the output. Using this option generates three files. The IMPUTE haplotype file has the suffix '.impute.hap', and the IMPUTE legend file has the suffix '.impute.hap.legend'. The third file, with suffix '.impute.hap.indv', details the individuals included in the haplotype file, although this file is not needed by IMPUTE.

  • --ldhat
  • --ldhat-geno

    These options output data in LDhat format. Use of these options also requires the --chr option to be used. The --ldhat option outputs phased data only, and therefore also implies --phased, leading to unphased individuals and genotypes being excluded. Alternatively, the --ldhat-geno option treats all of the data as unphased, and therefore outputs LDhat files in genotype/unphased format. In either case, two files are generated with the suffixes '.ldhat.sites' and '.ldhat.locs', which correspond to the LDhat 'sites' and 'locs' input files respectively.

  • --BEAGLE-GL
  • --BEAGLE-PL

    These options output genotype likelihood information for input into the BEAGLE program. These options require the VCF file to contain FORMAT fields with GL or PL tags, which can generally be output by SNP callers such as the GATK. Use of this option requires a chromosome to be specified via the --chr option. The resulting output file (with the suffix '.BEAGLE.GL' or '.BEAGLE.PL') contains genotype likelihoods for biallelic sites, and is suitable for input into BEAGLE via the 'like=' argument.

  • --plink

    This option outputs the genotype data in PLINK PED format. Two files are generated, with suffixes '.ped' and '.map'. Note that only bi-allelic loci will be output. Further details of these files can be found in the PLINK documentation.

    Note: This option can be very slow on large datasets. Using the --chr option to divide up the dataset is advised, or alternatively use the --plink-tped option below.

  • --plink-tped

    The --plink option above can be extremely slow on large datasets. An alternative that might be considerably quicker is to output in the PLINK transposed format. This can be achieved using the --plink-tped option, which produces two files with suffixes '.tped' and '.tfam'.

  • --recode

    The --recode option is used to generate a VCF file from the input VCF or BCF file having applied the options specified by the user. The output file has the suffix '.recode.vcf'.

    By default, the INFO fields are removed from the output file, as the INFO values may be invalidated by the recoding (e.g. the total depth may need to be recalculated if individuals are removed). This default functionality can be overridden by using the --recode-INFO <string> option, where <string> defines the INFO key to keep in the output file. The --recode-INFO flag can be used multiple times. Alternatively, the option --recode-INFO-all can be used to retain all INFO fields.

  • --recode-bcf

    The --recode-bcf option is used to generate a BCF file from the input VCF or BCF file having applied the options specified by the user. The output file has the suffix '.recode.bcf'.

    Similar to the --recode option, the INFO fields are removed from the output file but can be restored in the same manner.

  • --recode-to-stream

    The --recode-to-stream option works in the same manner as the --recode option, but the resulting VCF file is printed directly to the screen, rather than to a file. Logging information normally displayed on the screen is suppressed (but still written to the log file). The advantage of this option is that the resulting VCF file can be piped to other programs for processing. For example, to gzip the VCF file, one could use:

    vcftools --vcf file1.vcf --recode-to-stream | gzip -c - > output_file.vcf.gz



  • --recode-bcf-to-stream

    The --recode-bcf-to-stream option works in the same manner as the --recode-to-stream option, printing the output BCF file to the screen. By default, all output BCF files are compressed in BGZF. This stream can be piped through gzip or bgzip to decompress.

Miscellaneous

  • --extract-FORMAT-info <string>

    Extract information from the genotype fields in the VCF file relating to a specified FORMAT identifier. For example, using the option '--extract-FORMAT-info GT' would extract all of the GT (i.e. Genotype) entries. The resulting output file has the suffix '.<FORMAT_ID>.FORMAT'.

  • --get-INFO <string>

    This option is used to extract information from the INFO field in the VCF file. The <string> argument specifies the INFO tag to be extracted, and the option can be used multiple times in order to extract multiple INFO entries. The resulting file, with suffix '.INFO', contains the required INFO information in a tab-separated table. For example, to extract the NS and DB flags, one would use the command:

    vcftools --vcf file1.vcf --get-INFO NS --get-INFO DB


  • --force-index-write

    When vcftools first reads a VCF file, it creates an index of the file for faster access next time. In some cases, the index file can get out of sync with the VCF file. This option can be used to force vcftools to regenerate the index file.

VCF File Comparison Options

  • --diff <filename>
  • --gzdiff <filename>
  • --diff-bcf <filename>

    Select a VCF file for comparison with the file specified by the --vcf option. Outputs two files describing the sites and individuals common / unique to each file. These files have the suffixes '.diff.sites_in_files' and '.diff.indv_in_files' respectively. The --gzdiff version can be used to read compressed VCF files and the --diff-bcf version can be used to read BCF files.

  • --diff-site-discordance

    Used in conjunction with the --diff option to calculate discordance on a site-by-site basis. The resulting output file has the suffix '.diff.sites'.

  • --diff-indv-discordance
  • --diff-indv-map <filename>

    Used in conjunction with the --diff option to calculate discordance on a per-individual basis. The resulting output file has the suffix '.diff.indv'. The --diff-indv-map option can be used to specify a mapping of individual IDs in the second file to those in the first file.

  • --diff-discordance-matrix

    Used in conjunction with the --diff option to calculate a discordance matrix. This option only works with bi-allelic loci with matching alleles that are present in both files. The resulting output file has the suffix '.diff.discordance.matrix'.

  • --diff-switch-error

    Used in conjunction with the --diff option to calculate phasing errors (specifically 'switch errors'). This option generates two output files describing switch errors found between sites, and the average switch error per individual. These two files have the suffixes '.diff.switch' and '.diff.indv.switch' respectively.

Options still in development

The following options are yet to be finalised, are likely to contain bugs, and are likely to change in the future.

  • --LROH

    Identify Long Runs of Homozygosity. This option is currently very slow.

  • --relatedness

    Output Individual Relatedness Statistics.

vcftools_0.1.11/website/src/links.inc0000644000000000000000000000167012156354770016273 0ustar rootroot

VCF related links

If you know of any other VCF-related resources, please let us know.

  • tabix - Tabix is an indexing tool that is required for certain features in VCFtools.
  • vcflib - A simple C++ library for parsing and manipulating VCF files from Erik Garrison.
  • VcfPythonUtils - Some Python utilities to work with VCF from Amit Indap.
  • The Genome Analysis Toolkit - The GATK Java package from the Broad includes a number of routines for generating and manipulating VCF files.
  • vcfCTools - C++ implementation of vcfPytools from Alistair Ward.
  • vcftools_0.1.11/website/favicon.ico0000644000000000000000000000157612156354770016017 0ustar rootrooth(   ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6¦~6¦~6ÿÿÿÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6¦~6ÿÿÿ¦~6¦~6¦~6ÿÿÿ¦~6¦~6ÿÿÿ¦~6ÿÿÿ¦~6¦~6¦~6¦~6¦~6ÿÿÿ¦~6ÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6¦~6¦~6ÿÿÿ¦~6¦~6¦~6¦~6¦~6ÿÿÿ¦~6ÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿ¦~6¦~6ÿÿÿ¦~6¦~6¦~6ÿÿÿ¦~6ÿÿÿ¦~6¦~6ÿÿÿ¦~6ÿÿÿ¦~6¦~6¦~6¦~6ÿÿÿ¦~6¦~6¦~6ÿÿÿ¦~6¦~6ÿÿÿÿÿÿ¦~6¦~6ÿÿÿÿÿÿÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6¦~6ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿvcftools_0.1.11/website/README0000644000000000000000000000014112156354770014541 0ustar rootrootDo not edit *html files. Edit src/index.php and src/*inc files and run `make` in this directory. 
vcftools_0.1.11/website/img/0000755000000000000000000000000012163074506014433 5ustar rootrootvcftools_0.1.11/website/img/bgul.gif0000644000000000000000000000046012156354770016061 0ustar rootrootGIF89a<ÄîîîÝÝÝíííìììÞÞÞáááßßßêêêåååçççéééãããàààèèèäääâââëëëæææ!ù,<­` Ždižhª®lëp,Ïtm߸¡ï|ïÿÀ pÈ(ȤrYh:ŸÐ¨tJ­ZجvËízà°xL.›è´zÍn»ßˆ¸|N¯Û~Ïïûÿ€ ‚ƒ„…†‡ˆ Š‹ŒŽ‘ “”•–—˜™›œžŸ ¡¢£¤¦§¨©ª«¬®¯°±²³´µ¶¸¹º»¼½¾¿ÀÁÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÊ!;vcftools_0.1.11/website/img/header.gif0000644000000000000000000000247212156354770016365 0ustar rootrootGIF89a ÷1r–1s—1s—2u™2uš3wœ3wœ3x4yŸ4yŸ5|¢5|¢6~¥6~¥6~¦F—ÃI™ÄJ™ÄLšÅM›ÅPœÆSžÇUŸÈX¡ÉY¡É[£Ê^¥Ë_¥Ëb§Ìd¨ÍgªÎhªÎk¬Ïm­Ðn­Ðq¯Ñs°Òv²Ów³Óz´Ô|¶Õ·Ö¸Ö‚¸Ö„º×‡»Ø, ÿ[H° Áƒ*\Ȱ¡Ã‡#dA±¢Å‹3®ØÈ±£Ç CŠIR…É“(Sª\ɲ¥Ë0cÊœI³¦M8sêÜɳ§ÏŸ@ƒžJ´¨Ñ£H“šXÊ´©Ó§P£JJµ„Õ«X³jÝÊ•„ׯ`ÊK¶¬ÙhÓª]˶­Û·p㊘K·®Ý»xó†ØË·¯ß¿€ L„áÈ+^ÌøƒãÇ#KžL¹²åË2kÞ̹³çÏ CwMº´éÓ¨SsXͺµë×°cËžM{ƒíÛ¸sëÞÍ[ƒïßÀƒ N¼¸ñã’+_μ¹óç¢KŸN½ºõëØ³_ØÎ½»÷ïàËÿOÞ‚ùóèÓ«_Ͼ‚û÷ðãËŸO¿¾ýûòëßÏ¿¿ÿÿ(à€hà&¨  6èàƒF(á„F`á…f¨á†Bàᇠ†(âˆ$–h≤¨âŠ,¶è"‹Ä(ãŒ4Öhã8æ¨ãŽ<öèã@)äDiä‘H&©ä’L6éä“PF)å”T6`å•Xf©å–\véå—`. 
æ˜d–iæ™h¦©æšl¶éfpÆ)çœtÖiçxæ©çž|à矀*è „j¨ˆ&ªè¢Œ6êè£F*餔`饘fªé¦œvêé †*ꨤ–jª¨¦ªêª¬¶êê«°Æ*무`ë­¸æªë®¼öêë¯À+ì°Á:@챺;vcftools_0.1.11/website/img/search.gif0000644000000000000000000000125012156354770016373 0ustar rootrootGIF89aæÿÿÿþþþýýýüüüúúúùùù÷÷÷öööôôôóóóòòòñññðððîîîíííìììëëëêêêéééèèèåååâââßßßÞÞÞÝÝÝÜÜÜÛÛÛÚÚÚØØØ×××ÓÓÓÒÒÒÐÐÐÎÎÎÍÍÍËËËÊÊÊÉÉÉÆÆÆÃÃý½½»»»¹¹¹···µµµ±±±¯¯¯®®®­­­¬¬¬©©©¨¨¨¦¦¦¢¢¢žžž›››ššš’’’‘‘‘‹‹‹ˆˆˆ†††………„„„ÿÿÿ!ùB,ÿ€‚ƒ„…†…‰Š‹Œ‰‚Ž‘‘‚•–—˜™•‚žŸ ¡œ¢¥¢‚©ª«¬­©‚±²³± ,,)´°´´!4(0 ²‚ËÌÍ,ÍË"Ì‚רÙ&Ù×'Ø‚ãäå$5ä #;$ä‚ ñòò'1=>8ù=?2 ò‚  TÐAÇ/J¸X¸‚Ã@‚H”È@‹70LܸQƒ (Ø$(S2Ô ¥AfxhI³¦Í‚èüÐâ‚Ο@ƒþô ¨Ñ£H“¡©Ó§P£6DA‚Õ«X³j•@u‚ׯ`ÊÐu¬Y±T)¨]˶­[ ‡âÊ;vcftools_0.1.11/website/img/li.gif0000644000000000000000000000031712156354770015535 0ustar rootrootGIF89a ÄhÍøøø lÍ nÎñññqÏûûûwÐgÌ¸á¨Æä®ÉåBŒÕ8†ÔöööæéíbžÚ~®Þ(~Òòòòùùùôôôîîîüüüðððýýýõõõúúú÷÷÷ïïïóóófÌ!ù, L`&^ä5[ª¢TÀ½\@QœVy^¥½ÑYŒGÓ{@ ‡H'×|‚Obi) $@ƒz` KùŸÆ1é$ ŲæÁt:BM;vcftools_0.1.11/website/img/bgcode.gif0000644000000000000000000000007012156354770016350 0ustar rootrootGIF89a€òòòÿÿÿ!ù,Œ—¶¨{³F‰›¾¬;vcftools_0.1.11/website/img/quote.gif0000644000000000000000000000034412156354770016266 0ustar rootrootGIF89aÄ¿ÈÑúûûÁÊÓ­¹Å™§·ÔÚáæéíÓÙ਴Áäèììïò²½ÈÙßäðòô¹ÂÍ«¹ýþþëîñ¦²Àõöøçêîèìïûüü•¤´Ð×ÞÆÎ××ÜâáåêØÝ㩵”£³ÿÿÿ!ù,aà'Ždiž(:u,Ë}FËFeäÝ·ðø­•†žð9@!ÑØC’‚½ÌNÈ(AÀ­ð  ‹Åó(HZ‚åx$¤ÀÃã8)—Ä(ð& 7#7)‹ŒŽ(!;vcftools_0.1.11/website/img/bgcontainer.gif0000644000000000000000000001053212156354770017424 0ustar rootroot‰PNG  IHDR a€Œ_iCCPICC Profilex­—y8U_Ç÷½×xÍó§C¼áããŧ8Š/€Å@c)nAÌsô] j¼œî†bÞûÇfÎ @e 4¾ÿØø³ »À3ä?¶-Ó_ß F?è*)ñ+ŒT üéÃÃ->hmÙìgþ(=<Ü¿ b €çÞØà€_¾Ð¤°>þ×ýïwþ3‰ “…'!–ñ… ¢»‰™‘®$IÉ) )—©eh’h‡è9<˜‰XLYKØ>s(r&p½â&á‘çµâóâ8é+è.ä€Âk‹È‰¢ÄØÄÉÅ%6$g¤ú¤ŸÈÜM>å+g*/¥@§°­8¨T¥œ~ÚKÅ@UBQ¦¾¬1 Y§uU;m­#«K¯»©×§_apÞÐÖHÖ˜Þx×dÆ´Ó¬“o~ÎÂÕRÿŒ¨•Õ¦õM­m–½˜½#ÌqÓi;í<êòʵÕí‘{¹ÇUÏ$¯PogS_U? 
Ÿ?Wgà‰ þ`Tˆx¨tØ©pù…H…(ùèS1Ò±çDâNž?Ï’@s™Kü–´’<“2”Ú–ö8½übÞ¥¤ŒL—,L¶úeñŽ\ÒÜÝ+ WóšòË ²®…:^×)/f.Á/ùrc¶tèfû­ú²ÊÛ×ÊÓ+¢ïøVÚÝ5¸§x_°ŠéñƒŸÕ_jVj?>œ­›~4ùx¬~øICwã˧ÍM›<+o)zžÓšò"ú¥›s»EºS®K°›¥‡¤çGïÊ«±¾öþº²×ùƒ™CéÃ)#É£Éoß^‹›ˆŒšŠx6:þ>v6õCÁ܃ùöS ›KÄË+Ò«zŸœÖÂ×s>?Üß$ÞRÝŽýÚò¾£µ›¾7ò“}ßý îððþCø‚^„ÕÄ©O’C:AÎKáKÙH AÓæÐM20ú1=f³ê±å°Osòq¹(æîåYáÝáÛçߨ<¹$8%Ôj®¹*/æ'n+–””b•FH/ËôËVŸÊ’ÃÉ)*â+N+=VÎ8í¡¢©Ê£†¯¶¨Þ­Q©™ªå¡Fs£uÆukõRõ d É çšóMÂMmÍT1¼æÄæk–Õg²¬ü­m„m‰m?Úµž-¶qprÔwRÀŠ8Ÿp¡s%tÝs[uŸôèñlðºíãçëçgÃøè›†˜‡Z†Y…[GXGZGYE[Æ`bMÎéÇiŸW‰—K»À—È–DL˜¼—²–ú>m0ýÅŇ—J3²3ã²pÙg/ä(ä \¡½ ®®æä7ܺ–VèݲH±˜»„´äû…Ò77»n=-»»¨<£âÜ\¥Ã]£{Ê÷…«X>8¬Þ®Y¯]}¸\·ôhññBýܓن™ÆÉ§o›Fšžõ´´=on­{Qù²¤-·=µ#¦3 Ë¥Û²G§Wþ•PK?iÿÁÀÖëOƒ+C«ÃŸG¶GwߌÁÇñ'ˆ&I¦ÈÞQLSÍпç˜û =‡OøX¾Ð½øi™lEtÕøSÀZîzóçµ/"›1[_O»½C²ë±÷â'ݾóAí/ýEà–ˆb¼i&B4Qñ3ä)ŠÌ…¼ˆb„ŠŠZ&šöÝ/£=ÓæVB6%ö@ŽÛœ#\߸Iyèy™øèøÉ»'WÇ„:P5Â×DD½ÅLÄe%X$$ßKµH_—‰5?%*G 7%_«¨h¥$¨´§Ü{ºHÅ_UKMí«z¯F©f¨–ž6»öú¹N¦®½JoG¿Í ÃÐÆHÀè›q—I¡i€™.† óݼϢÔ2䌎«Õšu³ÍE[[;A»³ö~ŽzN‚X2ìç—z×·HwOfϯ7Þ}² ½`„“òg @ü\š  é­«Ï‹HŽ r‰69+tŽ>Ä­œŽoL(½’è—dž,ŸÂž Oý˜Ö•^y1ý’O†Q¦XuÖvö›Ë 9×r£¯Ø]UÎcÏ;ÈŸ.hºVXíébÚâ¯%c7ZJËofߊ.s»mR._Á}‡ôÎVåÄÝÖ{å÷/V?8[­W£X+ñUwòïãõìO˜è)Ÿ"›M?›¿>[oYx>Ó:öbðeO[[{SǣΪ®²î¢žÜÞôWñ}ý~¯qƒ¡C‰Ã#µ£½oæßŒ3NHLêO¹¾‹›.™éx¿óAn.q~rAv1wi{ÅrõÙšôzçFææõíï{ÆûžhàÎæñå5àã§ç?X99"øT¨#l+¢(Ê.†[+Ñ)Ù U%].S*{ýTž\¦|‚B°¢«’…2ú´¢Š¤*JM@_C@¥%©­ˆÖÖ±ÐuÓ‹ÔÏ6¸gØe´d‚45³ÀÄ™WYLŸ¡²Ò¶Žµi°ývVÊ>Ø¡Áñ‹vÎq™s“rOñ˜õRôÎ÷Ùñ³ÄÕPºµ†°††…EÈGEãÇøÅNÆéžoL@](L"NL™HSM¯¸D••¹’ms¹/WíJcžT~õ5Ñš"Ùâ–z¥·¼Ê˯Ü­ì½çQ…|PUc^{XWñظ~«!ë©`Ó‹gV-›­—^ µõtxwQt?êµéƒ÷—¿Öü81Š|seŒküþ¤ÂTÏ´ÍÌÚlÜÝ|å‚òbÿ²ÕÊä'ÌZûg‰ÂM¼-ïíáoòßKvñ÷<~ôïKäíŸ?ú3Bú‹U`\A$ÈwAø0˜ K†Ý‡Âàp ü<¼>‹ Ah!"5ˆeÄÖÈQ3’QRkÒ929Œ<—B„¢‡Ò‹ŠœªŽÚšFSAkB»OwŸÞža1ƒÉ„™y–¥Š5†Íˆ“ý3G3g:—Ý qn"î9žVÞR¾$~œ€íI=AE!a‹0žðŠÈ€è#±"ñ‰PIw)[i3YSh9´Å~œ‡ xPVðçPý°Ê’Hߨá…ØÛq4çÏÇo_pO|—lœò< •žsñg6óU¶ìå²\º+‰W÷òý – ¯(v*Y(õ½¹[–TÎPQY©rwì~àªêšZó‡?•Öë>Yo¼ØÄ×ÜØ¢÷|ê…o¢=¿S¢«¯Çó²ïÞ€Ñë¡ôÎÑ»oÅÆª'„'ワ®|š­ž“žº ¶Ø±¬¹Òô‰{-z½ƒá‹õæõ­Ù¯ß‚¾·ï²ì…üÝ—=(ø£? 
Ì€È4°8 n€f0vaÌ0%˜#¤~5lŽ—€;³à/áß(QˆxƒGƒg‚woŸ?¿‹€‹ šàáiÂÛD”DÑDëÄNÄãH rÒþ-©=é*Y$9y9…:Åe•(Õu:2Í&m9=ý C&#†‰…i‘¹ž%˦ÂÎÉDZÄÙÇU}â2w(ÏY^4Ÿ ¿€ûIA:!«0Ÿˆ„¨’˜–¸Ž„Ž$Z -­/c*k{ÊS.Bþ’B™b‹Ò¤òž ƒª˜š¦º¥†›f˜Všv ºAgD÷‹>µ4”™Œ«Mf̨0Zæ1–;VrÖ16ívgmíï;'Klµ ©«§[‡ç%¯-kß6œ„i ]Pj"4&l?"&  ?÷=Þ/a)Ñ9i.Å%u9ÝR;%‹9û^ŽrnÿÕ3y“g® ]×/ê*ѼÑ~}«ï6¦|úŽWåî½TèdQ]£Q;Q𘤾¼A£q®éÂ3–þÖ—œm½A]ŒÝ{µ^½î7x;h545b?:ÿÖsìËDÄâ]Ú ÅûÔÙÝ9ëùšØ¢ÁRÎòä*×'·µ»ëë"_\6Ó·*¶ï|-üñÝp‡rgx7~OrïýÄŸÂ?‡öƒíŽôÿ]/Aÿ¨Xñ0p„j§ÿoóñ>ŽI E&õu24‚z"èZÂcŽ90ÄLó˜]=´tŽÙÙQCï˜#ÜÕ Ù5@Ëô˜=uÙÅ×Üì˜qÞ¿êÛ?s©ýõw ÔüëᎱ<ö65?f/?½¿þÎ.׿ëm¨ìã¤ówýÀG€ý]gB€€€Â+Gô2wçÜQÿß-È% ªAP÷Ã…x¸¹qªBU¶ ŠSÇ+‚┓ÿéÞ,ª¼z=¶IDATxíÝKnÂ0FáÆ¤¼'¬…ý¯„0­„Wéq.²¬¤…ôd€l'ŽÄ7úuí$Í~¿O)Ýn·Óét¹\î÷{×ÛíöÃCP@PàoÝn·\.Û¶ý¬Žétš&“ÉwÔs¹®îÚV@P@ £(Nq<Ú³Ù,®IT­¨`]¯WBU„-MÓŒoሠ( € ( @-@d"G‘æó9ëx &include_indv, const vector &include_genotype) const { for (unsigned int ui=0; ui= ALT.size())) out = "."; else out = ALT[allele_num-1]; } string entry::get_allele(int allele_num) const { assert(parsed_ALT == true); if (allele_num == 0) return REF; else if ((allele_num < 0) || (unsigned(allele_num - 1) >= ALT.size())) return "."; else return ALT[allele_num-1]; } string entry::get_ALT_allele(int allele_num) const { assert(parsed_ALT == true); if ((allele_num < 0) || (unsigned(allele_num) >= ALT.size())) return "."; return ALT[allele_num]; } void entry::get_alleles_vector(vector &out) const { assert(parsed_ALT == true); out.resize(ALT.size()+1); out[0] = REF; copy(ALT.begin(), ALT.end(), out.begin()+1); } double entry::get_QUAL() const { return QUAL; } string entry::get_FILTER() const { assert(parsed_FILTER == true); ostringstream out; if ((passed_filters == false) && (FILTER.empty())) out << "."; else if (passed_filters == true) out << "PASS"; else { out << FILTER[0]; for (unsigned int ui=1; ui &out) const { assert(parsed_FILTER == true); out = FILTER; } string entry::get_INFO(const set &INFO_to_keep, bool keep_all_INFO) const { assert(parsed_INFO == true); ostringstream sout; sout.str(""); sout.clear(); bool first=true; if ( ( (!INFO.empty()) && 
(!INFO_to_keep.empty()) ) || keep_all_INFO ) { string key; for (unsigned int ui=0; ui > entry::get_INFO_vector(const set &INFO_to_keep, bool keep_all_INFO) const { assert(parsed_INFO == true); vector > out_vector; if (keep_all_INFO == true) return INFO; if ( (!INFO.empty()) && (!INFO_to_keep.empty()) ) { string key; for (unsigned int ui=0; ui &out) const { assert(parsed_FORMAT_binary == true); out = FORMAT_binary; } // Return the alleles of a genotype as a pair of strings. void entry::get_indv_GENOTYPE_strings(unsigned int indv, pair &out) const { assert(parsed_GT[indv] == true); static string out_allele1, out_allele2; get_allele(GENOTYPE[indv].first, out_allele1); get_allele(GENOTYPE[indv].second, out_allele2); out = make_pair(out_allele1, out_allele2); } void entry::get_indv_GENOTYPE_ids(unsigned int indv, pair &out) const { assert(parsed_GT[indv] == true); out = GENOTYPE[indv]; } char entry::get_indv_PHASE(unsigned int indv) const { assert(parsed_GT[indv] == true); return PHASE[indv]; } int entry::get_indv_DEPTH(unsigned int indv) const { assert(parsed_DP[indv] == true); if (DEPTH.empty()) return -1; return DEPTH[indv]; } double entry::get_indv_GQUALITY(unsigned int indv) const { assert(parsed_GQ[indv] == true); if (GQUALITY.empty()) return -1; return GQUALITY[indv]; } void entry::get_indv_GFILTER_vector(unsigned int indv, vector &out) const { assert(parsed_FT[indv] == true); if (!GFILTER.empty()) out = GFILTER[indv]; else out.resize(0); } void entry::get_indv_GFILTER(unsigned int indv, string &out) const { assert(parsed_FT[indv] == true); if ((!GFILTER.empty()) && (GFILTER[indv].size()>0)) { out=""; for (unsigned int ui=0; ui &include_indv, const vector &include_genotype) const { unsigned int out=0; for (unsigned int ui=0; ui &out, unsigned int &N_non_missing_chr_out, const vector &include_indv, const vector &include_genotype) const { pair genotype; vector allele_counts(get_N_alleles(), 0); N_non_missing_chr_out = 0; for (unsigned int ui=0; ui &include_indv, 
const vector &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const { out_N_hom1 = 0; out_N_hom2 = 0; out_N_het = 0; pair genotype; if (ALT.size() > 1) LOG.error("Tried to return the genotype counts of a non-biallelic SNP", 99); for (unsigned int ui=0; ui &out) const { out.resize(sizeof(uint32_t)); uint32_t pos = POS - 1; memcpy(&out[0], &pos, sizeof(pos)); } void entry::get_rlen(vector &out) const { out.resize(sizeof(int32_t)); int32_t rlen; if (REF != "" and REF != "." and REF != " ") rlen = (int32_t)REF.length(); else rlen = (int32_t)0; memcpy(&out[0], &rlen, sizeof(rlen)); } void entry::get_QUAL_binary(vector &out) const { out.resize(sizeof(float)); float qual = (float)QUAL; memcpy(&out[0], &qual, sizeof(qual)); } void entry::get_n_allele_info(vector &out) const { out.resize(sizeof(uint32_t)); uint32_t n_allele_info = (uint32_t)ALT.size() + 1; uint32_t n_info = (uint32_t)(INFO.size()-N_INFO_removed); n_allele_info = n_allele_info << 16; n_allele_info = n_allele_info | n_info; memcpy(&out[0], &n_allele_info, sizeof(n_allele_info)); } void entry::get_n_fmt_sample(vector &out) const { out.resize(sizeof(uint32_t)); uint32_t n_fmt_sample = (uint32_t)(FORMAT.size()-N_FORMAT_removed); uint32_t n_sample = (uint32_t)N_indv; n_fmt_sample = n_fmt_sample << 24; n_fmt_sample = n_fmt_sample | n_sample; memcpy(&out[0], &n_fmt_sample, sizeof(n_fmt_sample)); } void entry::get_ID_binary(vector &out) { make_typed_string(out, ID, true ); } void entry::get_ALLELES_binary(vector &out) { vector tmp; out.resize(0); make_typed_string(tmp, REF, true ); out.insert(out.end(), tmp.begin(), tmp.end()); for (unsigned int ui=0; ui &data_line) { N_indv = n_indv; line = data_line; basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; passed_filters = false; parsed_FORMAT_binary = false; N_INFO_removed = 0; N_FORMAT_removed = 0; 
parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; N_samples = 0; N_info = 0; N_format = 0; L_shared = 0; L_indiv = 0; line_pos = 0; INFO_pos = 0; FILTER_pos = 0; ALT_pos = 0; FORMAT_pos = 0; FORMAT_positions.resize(0); FORMAT_types.resize(0); FORMAT_sizes.resize(0); FORMAT_skip.resize(0); FORMAT_keys.resize(0); entry_header = header_obj; INFO_map = header_obj.INFO_map; FILTER_map = header_obj.FILTER_map; FORMAT_map = header_obj.FORMAT_map; CONTIG_map = header_obj.CONTIG_map; CONTIG_reverse_map = header_obj.CONTIG_reverse_map; FILTER_reverse_map = header_obj.FILTER_reverse_map; FORMAT_reverse_map = header_obj.FORMAT_reverse_map; INFO_reverse_map = header_obj.INFO_reverse_map; } bcf_entry::bcf_entry(const unsigned int n_indv, const header &header_obj) { N_indv = n_indv; basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; N_INFO_removed = 0; N_FORMAT_removed = 0; passed_filters = false; parsed_FORMAT_binary = false; parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; N_samples = 0; N_info = 0; N_format = 0; L_shared = 0; L_indiv = 0; line_pos = 0; INFO_pos = 0; FILTER_pos = 0; ALT_pos = 0; FORMAT_pos = 0; FORMAT_positions.resize(0); FORMAT_types.resize(0); FORMAT_sizes.resize(0); FORMAT_skip.resize(0); FORMAT_keys.resize(0); line.clear(); entry_header = header_obj; INFO_map = header_obj.INFO_map; FILTER_map = header_obj.FILTER_map; FORMAT_map = header_obj.FORMAT_map; CONTIG_map = header_obj.CONTIG_map; CONTIG_reverse_map = header_obj.CONTIG_reverse_map; FILTER_reverse_map = header_obj.FILTER_reverse_map; FORMAT_reverse_map = header_obj.FORMAT_reverse_map; INFO_reverse_map 
= header_obj.INFO_reverse_map; } bcf_entry::~bcf_entry() {} void bcf_entry::reset(const vector &data_line) { basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; parsed_FORMAT_binary = false; line = data_line; fill(parsed_GT.begin(), parsed_GT.end(), false); fill(parsed_GQ.begin(), parsed_GQ.end(), false); fill(parsed_DP.begin(), parsed_DP.end(), false); fill(parsed_FT.begin(), parsed_FT.end(), false); INFO_pos = 0; FILTER_pos = 0; ALT_pos = 0; FORMAT_pos = 0; FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear(); N_INFO_removed = 0; N_FORMAT_removed = 0; } void bcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO) { if (line.empty()) { if (parse_ALT) set_ALT(""); return; } uint32_t n_allele_info, n_fmt_sample; unsigned int n_allele; uint32_t chrom, pos, rlen; uint32_t shared, indiv; float qual; line_pos = 0; get_number(shared, &line_pos, line); get_number(indiv, &line_pos, line); L_shared = shared; L_indiv = indiv; get_number(chrom, &line_pos, line); get_number(pos, &line_pos, line); get_number(rlen, &line_pos, line); qual = *reinterpret_cast(&line[line_pos]); line_pos += sizeof(qual); get_number(n_allele_info, &line_pos, line); get_number(n_fmt_sample, &line_pos, line); N_format = n_fmt_sample >> 24; CHROM = CONTIG_map[chrom].ID; POS = pos + 1; ID = get_typed_string( &line_pos, line ); REF = get_typed_string( &line_pos, line ); QUAL = qual; n_allele = n_allele_info >> 16; N_info = n_allele_info & (uint32_t)65535; ALT_pos = line_pos; for (unsigned int ui=1; ui &include_genotype_out, double min_genotype_quality) { if (fully_parsed == false) parse_full_entry(); //if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end()) if (GQ_idx != -1) { // Have quality info double quality; include_genotype_out.resize(N_indv, true); for (unsigned int ui=0; ui &include_genotype_out, int min_depth, int max_depth) { if 
(fully_parsed == false) parse_full_entry(); //if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end()) if (DP_idx != -1) { // Have depth info int depth; include_genotype_out.resize(N_indv, true); for (unsigned int ui=0; ui max_depth)) include_genotype_out[ui] = false; } } } void bcf_entry::filter_genotypes_by_filter_status(vector &include_genotype_out, const set &filter_flags_to_remove, bool remove_all) { if (fully_parsed == false) parse_full_entry(); vector GFILTERs; if (FT_idx != -1) { // Have GFilter info include_genotype_out.resize(N_indv, true); for (unsigned int ui=0; ui ids; if ( GT && !parsed_GT[indv] && GT_idx != -1 ) ids.push_back(GT_idx); if (GQ && !parsed_GQ[indv] && GQ_idx != -1) ids.push_back(GQ_idx); if (DP && !parsed_DP[indv] && DP_idx != -1) ids.push_back(DP_idx); if (FT && !parsed_FT[indv] && FT_idx != -1) ids.push_back(FT_idx); for(unsigned int i=0; i1) LOG.error("Error: Only expect single value for QUALITY.\n"); float tmp; if (type==5) tmp = *reinterpret_cast(&line[l_pos]); else if (type==1) { int8_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else if (type==2) { int16_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else if (type==3) { int32_t tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (float)tmp2; } else LOG.error("Error: Invalid type for QUALITY.\n"); set_indv_GQUALITY(indv, tmp); } else if ((int)ui == DP_idx) { if (size>1) LOG.error("Error: Only expect single value for DEPTH.\n"); int tmp = -1; if (type==1) { if ( !check_missing(l_pos, 1, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==2) { if ( !check_missing(l_pos, 2, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==3) { if ( !check_missing(l_pos, 3, line) ) tmp = *reinterpret_cast(&line[l_pos]); } else if (type==5) { float tmp2 = -1; if ( !check_missing(l_pos, 5, line) ) tmp2 = *reinterpret_cast(&line[l_pos]); tmp = (int)tmp2; } else LOG.error("Error: Only expect single value for DEPTH.\n"); set_indv_DEPTH(indv, tmp); } else if 
((int)ui == FT_idx) { if (type == 7) { vector tmp; tmp.resize( size*sizeof(char) ); memcpy(&tmp[0], &line[l_pos], size*sizeof(char)); set_indv_GFILTER(indv, tmp); } else LOG.one_off_warning("Warning: FT values must be encoded in string format.\n"); } } // Set missing return values if requested a value, but couldn't find it if (GT && (parsed_GT[indv] == false)) { set_indv_GENOTYPE_and_PHASE(indv, make_pair(-1,-1), '/'); } if (GQ && (parsed_GQ[indv] == false)) { set_indv_GQUALITY(indv, -1); } if (DP && (parsed_DP[indv] == false)) { set_indv_DEPTH(indv, -1); } if (FT && (parsed_FT[indv] == false)) { set_indv_GFILTER(indv, ""); } } void bcf_entry::parse_genotype_entries(bool GT, bool GQ, bool DP, bool FT) { for (unsigned int ui=0; ui genotype; char phase; get_indv_GENOTYPE_ids(indv, genotype); phase = get_indv_PHASE(indv); if ((genotype.first != -1) && (genotype.second != -1)) outstream << genotype.first << phase << genotype.second; else if ((phase == '|') && (genotype.second == -1)) outstream << int2str(genotype.first); // Handle haploid case else outstream << int2str(genotype.first) << phase << int2str(genotype.second); tmpstr = outstream.str(); out = tmpstr; } else { format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); // outstream << int2str(tmp); outstream << int(tmp); } l_pos += sizeof(int8_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss) out = tmpstr; } } else if (type == 2) { int16_t tmp; format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); // outstream << int2str(tmp); outstream << int(tmp); } l_pos += sizeof(int16_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 3) { int32_t tmp; format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); // outstream << int2str(tmp); outstream << int(tmp); } l_pos += sizeof(int32_t); format_miss = format_miss && miss; } tmpstr = outstream.str(); if 
( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 5) { float tmp; format_miss = true; for (unsigned int uj=0; uj(&line[l_pos]); // outstream << output_log::dbl2str(tmp, 6); outstream << float(tmp); } l_pos += sizeof(float); format_miss = format_miss && miss; } tmpstr = outstream.str(); if ( (tmpstr.length() > 0) and !format_miss ) out = tmpstr; } else if (type == 7) { stringstream str_stream; string tmp_string; char tmp = '.'; for (unsigned int uj=0; uj(&line[l_pos]); l_pos += sizeof(char); str_stream << tmp; } tmp_string = str_stream.str(); tmp_string.erase( remove( tmp_string.begin(), tmp_string.end(), ' ' ), tmp_string.end() ); if (tmp_string != "") out = tmp; else out = "."; } } void bcf_entry::read_all_entries(string &out, const vector &include_indv, const vector &include_genotype) { if (fully_parsed == false) parse_full_entry(true); if (parsed_FORMAT == false) set_FORMAT(); ostringstream outstream, tmpstream; string tmpstr; outstream.str(""); tmpstream.str(""); bool format_miss, indv_miss; for(unsigned int ui=0; ui include_indv(N_indv, true); vector include_genotype(N_indv, true); set INFO_to_keep; print(out, INFO_to_keep, false, include_indv, include_genotype); } void bcf_entry::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO) { vector include_indv(N_indv, true); vector include_genotype(N_indv, true); print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype); } // Output BCF entry to output stream in VCF format void bcf_entry::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) { if (fully_parsed == false) parse_full_entry(); out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT(); out << '\t' << entry::double2str(QUAL); out << '\t' << get_FILTER(); out << '\t' << get_INFO(INFO_to_keep, keep_all_INFO); if (FORMAT.size() > 0) { string indv_entries; out << '\t' << get_FORMAT(); read_all_entries( 
indv_entries, include_indv, include_genotype ); out << indv_entries; } out << '\n'; // endl flushes the buffer, which is slow. This (should be) quicker. } void bcf_entry::print_bcf(BGZF* out) { vector include_indv(N_indv, true); vector include_genotype(N_indv, true); set INFO_to_keep; print_bcf(out, INFO_to_keep, false, include_indv, include_genotype); } void bcf_entry::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) { vector include_indv(N_indv, true); vector include_genotype(N_indv, true); print_bcf(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype); } // Output BCF entry to output stream in BCF format void bcf_entry::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) { if (fully_parsed == false) parse_full_entry(true); vector out_vector, tmp_vector; vector > tmp_info; int index; out_vector.resize(INFO_pos); memcpy(&out_vector[0], &line[0], INFO_pos); if (keep_all_INFO) { unsigned int curr_size = out_vector.size(); out_vector.resize(curr_size + (FORMAT_pos - INFO_pos) ); memcpy(&out_vector[curr_size], &line[INFO_pos], (FORMAT_pos - INFO_pos)); } else { int map_type, number; get_n_allele_info(tmp_vector); memcpy(&out_vector[6*sizeof(int32_t)], &tmp_vector[0], sizeof(char)); tmp_info = get_INFO_vector(INFO_to_keep, keep_all_INFO); for(unsigned int ui=0; uiapply_filters(params); unsigned int N_indv = vf->N_kept_individuals(); unsigned int N_sites = vf->N_kept_sites(); unsigned int N_total_indv = vf->N_total_indv(); unsigned int N_total_sites = vf->N_total_sites(); LOG.printLOG("After filtering, kept " + output_log::int2str(N_indv) + " out of " + output_log::int2str(N_total_indv) + " Individuals\n"); LOG.printLOG("After filtering, kept " + output_log::int2str(N_sites) + " out of a possible " + output_log::int2str(N_total_sites) + " Sites\n"); if (N_sites == 0) LOG.error("No data left for analysis!"); if (params.diff_file != "") { // Merge files - cannot be run 
with other output options. variant_file *variant_diff; if (params.diff_file_bcf) variant_diff = new bcf_file(params.diff_file, params.chrs_to_keep, params.chrs_to_exclude, params.force_write_index, params.gatk); else variant_diff = new vcf_file(params.diff_file, params.diff_file_compressed, params.chrs_to_keep, params.chrs_to_exclude, params.force_write_index); variant_diff->apply_filters(params); // Apply various filters as required. vf->output_indv_in_files(params.output_prefix, *variant_diff, params.diff_indv_map_file); vf->output_sites_in_files(params.output_prefix, *variant_diff); if (params.diff_site_discordance == true) vf->output_discordance_by_site(params.output_prefix, *variant_diff, params.diff_indv_map_file); if (params.diff_discordance_matrix == true) vf->output_discordance_matrix(params.output_prefix, *variant_diff, params.diff_indv_map_file); if (params.diff_indv_discordance == true) vf->output_discordance_by_indv(params.output_prefix, *variant_diff, params.diff_indv_map_file); if (params.diff_switch_error == true) vf->output_switch_error(params.output_prefix, *variant_diff, params.diff_indv_map_file); delete variant_diff; } vf->output_INFO_for_each_site(params.output_prefix, params.INFO_to_extract); vf->output_FORMAT_information(params.output_prefix, params.FORMAT_id_to_extract); if (params.output_indv_depth == true) vf->output_individuals_by_mean_depth(params.output_prefix); if (params.output_geno_depth == true) vf->output_genotype_depth(params.output_prefix); if (params.output_site_depth == true) vf->output_site_depth(params.output_prefix, false); if (params.output_site_mean_depth == true) vf->output_site_depth(params.output_prefix, true); if (params.output_freq == true) vf->output_frequency(params.output_prefix, false, params.suppress_allele_output, params.derived); if (params.output_counts == true) vf->output_frequency(params.output_prefix, true, params.suppress_allele_output, params.derived); if (params.plink_output == true) 
vf->output_as_plink(params.output_prefix); if (params.plink_tped_output == true) vf->output_as_plink_tped(params.output_prefix); if (params.output_HWE == true) vf->output_hwe(params.output_prefix); if (params.output_SNP_density_bin_size > 0) vf->output_SNP_density(params.output_prefix, params.output_SNP_density_bin_size); if (params.output_missingness == true) vf->output_missingness(params.output_prefix); if (params.output_geno_chisq == true) vf->output_genotype_chisq(params.output_prefix, params.ld_snp_window_size, params.ld_snp_window_min, params.ld_bp_window_size, params.ld_bp_window_min, -1.0); if (params.output_geno_rsq == true) vf->output_genotype_r2(params.output_prefix, params.ld_snp_window_size, params.ld_snp_window_min, params.ld_bp_window_size, params.ld_bp_window_min, params.min_r2); if (params.output_interchromosomal_hap_rsq == true) vf->output_interchromosomal_haplotype_r2(params.output_prefix, params.min_r2); if (params.output_interchromosomal_geno_rsq == true) vf->output_interchromosomal_genotype_r2(params.output_prefix, params.min_r2); if (params.output_hap_rsq == true) vf->output_haplotype_r2(params.output_prefix, params.ld_snp_window_size, params.ld_snp_window_min, params.ld_bp_window_size, params.ld_bp_window_min, params.min_r2); if (params.hap_rsq_position_list != "") vf->output_haplotype_r2_of_SNP_list_vs_all_others(params.output_prefix, params.hap_rsq_position_list, params.min_r2); if (params.geno_rsq_position_list != "") vf->output_genotype_r2_of_SNP_list_vs_all_others(params.output_prefix, params.geno_rsq_position_list, params.min_r2); if (params.output_het == true) vf->output_het(params.output_prefix); if (params.output_site_quality == true) vf->output_site_quality(params.output_prefix); if (params.output_012_matrix == true) vf->output_as_012_matrix(params.output_prefix); if (params.output_as_IMPUTE == true) vf->output_as_IMPUTE(params.output_prefix); if (params.output_BEAGLE_genotype_likelihoods_GL == true) 
vf->output_BEAGLE_genotype_likelihoods(params.output_prefix, 0); if (params.output_BEAGLE_genotype_likelihoods_PL == true) vf->output_BEAGLE_genotype_likelihoods(params.output_prefix, 1); if (params.output_as_ldhat_unphased == true) vf->output_as_LDhat_unphased(params.output_prefix); if (params.output_as_ldhat_phased == true) vf->output_as_LDhat_phased(params.output_prefix); if (params.output_singletons == true) vf->output_singletons(params.output_prefix); if (params.output_site_pi == true) vf->output_per_site_nucleotide_diversity(params.output_prefix); if (params.pi_window_size > 0) vf->output_windowed_nucleotide_diversity(params.output_prefix, params.pi_window_size, params.pi_window_step); if (params.output_Tajima_D_bin_size > 0) vf->output_Tajima_D(params.output_prefix, params.output_Tajima_D_bin_size); if (params.output_TsTv_bin_size > 0) vf->output_TsTv(params.output_prefix, params.output_TsTv_bin_size); if (params.output_TsTv_by_count) vf->output_TsTv_by_count(params.output_prefix); if (params.output_TsTv_by_qual) vf->output_TsTv_by_quality(params.output_prefix); if (params.recode == true) vf->print(params.output_prefix, params.recode_INFO_to_keep, params.recode_all_INFO); if (params.recode_bcf == true) vf->print_bcf(params.output_prefix, params.recode_INFO_to_keep, params.recode_all_INFO); if (params.recode_to_stream == true) vf->print(std::cout, params.recode_INFO_to_keep, params.recode_all_INFO); if (params.recode_bcf_to_stream == true) vf->print_bcf("", params.recode_INFO_to_keep, params.recode_all_INFO, true); if (params.output_filter_summary == true) vf->output_FILTER_summary(params.output_prefix); if (params.output_filtered_sites == true) vf->output_kept_and_removed_sites(params.output_prefix); if (params.output_LROH == true) vf->output_LROH(params.output_prefix); if (params.output_relatedness == true) vf->output_indv_relatedness(params.output_prefix); if (params.output_PCA == true) vf->output_PCA(params.output_prefix, !params.PCA_no_normalisation, 
params.output_N_PCA_SNP_loadings); if (params.fst_window_size <= 0) { if (params.hapmap_fst_populations.size() > 0) vf->output_hapmap_fst(params.output_prefix, params.hapmap_fst_populations); if (params.weir_fst_populations.size() > 0) vf->output_weir_and_cockerham_fst(params.output_prefix, params.weir_fst_populations); } else { if (params.hapmap_fst_populations.size() > 0) vf->output_windowed_hapmap_fst(params.output_prefix, params.hapmap_fst_populations, params.fst_window_size, params.fst_window_step); if (params.weir_fst_populations.size() > 0) vf->output_windowed_weir_and_cockerham_fst(params.output_prefix, params.weir_fst_populations, params.fst_window_size, params.fst_window_step); } if (params.output_indel_hist == true) vf->output_indel_hist(params.output_prefix); time(&end); double running_time = difftime(end,start); LOG.printLOG("Run Time = " + output_log::dbl2str_fixed(running_time, 2) + " seconds\n"); LOG.close(); delete vf; return 0; } vcftools_0.1.11/cpp/variant_file_diff.cpp0000644000000000000000000006766712156354766017177 0ustar rootroot/* * variant_file_diff.cpp * * Created on: Oct 30, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "variant_file.h" void variant_file::return_site_union(variant_file &file2, map, pair > &CHROMPOS_to_filepos_pair) { unsigned int s; int POS; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); entry *e2 = file2.get_entry_object(file2.N_indv); for (s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); POS = e->get_POS(); CHROMPOS_to_filepos_pair[make_pair(CHROM, POS)] = make_pair(s, -1); } } for (s=0; sreset(variant_line); e2->parse_basic_entry(); CHROM = e2->get_CHROM(); POS = e2->get_POS(); if (CHROMPOS_to_filepos_pair.find(make_pair(CHROM, POS)) != CHROMPOS_to_filepos_pair.end()) { CHROMPOS_to_filepos_pair[make_pair(CHROM, POS)].second = s; } else { CHROMPOS_to_filepos_pair[make_pair(CHROM, POS)] = make_pair(-1, s); } } } delete e; delete e2; } void 
variant_file::return_indv_union(variant_file &file2, map > &combined_individuals, const string &indv_ID_map_file) { map indv_map; bool use_map = false; if (indv_ID_map_file != "") { LOG.printLOG("Reading individual mapping file. "); ifstream map(indv_ID_map_file.c_str()); if (!map.is_open()) LOG.error("Could not open map file: " + indv_ID_map_file); while (!map.eof()) { string indv1, indv2; map >> indv1 >> indv2; map.ignore(numeric_limits::max(), '\n'); if ((indv1 != "") && (indv1.substr(0,1) != "#")) { indv_map[indv1] = indv2; } } map.close(); use_map = true; LOG.printLOG("Read " + LOG.int2str(indv_map.size()) + " entries.\n"); } for (unsigned int ui=0; ui(ui, -1); } for (unsigned int ui=0; ui(-1, ui); } } void variant_file::output_sites_in_files(const string &output_file_prefix, variant_file &diff_variant_file) { LOG.printLOG("Comparing sites in VCF files...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_variant_file, CHROMPOS_to_filepos_pair); vector variant_line; string CHROM; int POS; string output_file = output_file_prefix + ".diff.sites_in_files"; ofstream sites_in_files(output_file.c_str()); sites_in_files << "CHROM\tPOS\tIN_FILE\tREF\tALT1\tALT2" << endl; int s1, s2; int N_common_SNPs = 0, N_SNPs_file1_only=0, N_SNPs_file2_only=0; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it!=CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; entry *e1 = get_entry_object(N_indv); entry *e2 = diff_variant_file.get_entry_object(diff_variant_file.N_indv); // Read entries from file (if available) if (s1 != -1) { get_entry(s1, variant_line); e1->reset(variant_line); } if (s2 != -1) { diff_variant_file.get_entry(s2, variant_line); e2->reset(variant_line); 
} e1->parse_basic_entry(true); e2->parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1->get_REF(); string REF2 = e2->get_REF(); if ((REF == "N") || (REF == ".") || (REF == "") ) REF = REF2; if ((REF2 == "N") || (REF2 == ".") || (REF2 == "") ) REF2 = REF; if ((REF != REF2) && (REF2 != "N") && (REF != "N") && (REF != ".") && (REF2 != ".") && (REF != "") && (REF2 != "")) { LOG.one_off_warning("Non-matching REF. Skipping all such sites."); continue; } sites_in_files << CHROM << "\t" << POS << "\t"; if ((s1 != -1) && (s2 != -1)) { N_common_SNPs++; sites_in_files << "B"; } else if ((s1 != -1) && (s2 == -1)) { N_SNPs_file1_only++; sites_in_files << "1"; } else if ((s1 == -1) && (s2 != -1)) { N_SNPs_file2_only++; sites_in_files << "2"; } else LOG.error("SNP in neither file!?"); sites_in_files << "\t" << REF << "\t" << e1->get_ALT() << "\t" << e2->get_ALT() << endl; delete e1; delete e2; } sites_in_files.close(); LOG.printLOG("Found " + output_log::int2str(N_common_SNPs) + " SNPs common to both files.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file1_only) + " SNPs only in main file.\n"); LOG.printLOG("Found " + output_log::int2str(N_SNPs_file2_only) + " SNPs only in second file.\n"); } void variant_file::output_indv_in_files(const string &output_file_prefix, variant_file &diff_variant_file, const string &indv_ID_map_file) { LOG.printLOG("Comparing individuals in VCF files...\n"); string output_file = output_file_prefix + ".diff.indv_in_files"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Indv Differences File: " + output_file, 3); out << "INDV\tFILES" << endl; // Build a list of individuals contained in each file map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, indv_ID_map_file); unsigned int N_combined_indv = combined_individuals.size(); unsigned int N[3]={0,0,0}; for 
(combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { if ((combined_individuals_it->second.first != -1) && (combined_individuals_it->second.second != -1)) { N[0]++; out << combined_individuals_it->first << "\tB" << endl; } else if (combined_individuals_it->second.first != -1) { N[1]++; out << combined_individuals_it->first << "\t1" << endl; } else if (combined_individuals_it->second.second != -1) { N[2]++; out << combined_individuals_it->first << "\t2" << endl; } else LOG.error("Unhandled case"); } out.close(); LOG.printLOG("N_combined_individuals:\t" + output_log::int2str(N_combined_indv) + "\n"); LOG.printLOG("N_individuals_common_to_both_files:\t" + output_log::int2str(N[0]) + "\n"); LOG.printLOG("N_individuals_unique_to_file1:\t" + output_log::int2str(N[1]) + "\n"); LOG.printLOG("N_individuals_unique_to_file2:\t" + output_log::int2str(N[2]) + "\n"); } void variant_file::output_discordance_by_indv(const string &output_file_prefix, variant_file &diff_variant_file, const string &indv_ID_map_file) { LOG.printLOG("Outputting Discordance By Individual...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_variant_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, indv_ID_map_file); map > indv_sums; string CHROM; vector variant_line; int POS; int s1, s2, indv1, indv2; entry * e1 = get_entry_object(N_indv); entry * e2 = diff_variant_file.get_entry_object(diff_variant_file.N_indv); for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = 
CHROMPOS_to_filepos_pair_it->second.second; // Read entries from file (if available) if (s1 != -1) { get_entry(s1, variant_line); e1->reset(variant_line); } if (s2 != -1) { diff_variant_file.get_entry(s2, variant_line); e2->reset(variant_line); } e1->parse_basic_entry(true); e2->parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1->get_REF(); string REF2 = e2->get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if ((REF.size() != REF2.size()) || ((REF != REF2) && (REF2 != "N") && (REF != "N"))) { LOG.one_off_warning("Non-matching REF. Skipping all such sites."); continue; } // Do the alternative alleles match? string ALT, ALT2; ALT = e1->get_ALT(); ALT2 = e2->get_ALT(); bool alleles_match = (ALT == ALT2) && (REF == REF2); e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { indv_sums[combined_individuals_it->first].first++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing 
// Don't do anything. } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything. } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. // Don't do anything. } else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data indv_sums[combined_individuals_it->first].first++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match // Don't do anything } else { // Mismatch indv_sums[combined_individuals_it->first].second++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing // Don't do anything } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. // Don't do anything } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. 
// Don't do anything } else LOG.error("Unknown condition"); } } } string output_file = output_file_prefix + ".diff.indv"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Sites Differences File: " + output_file, 3); out << "INDV\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; int N, N_discord; double discordance; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { out << combined_individuals_it->first; N = indv_sums[combined_individuals_it->first].first; N_discord = indv_sums[combined_individuals_it->first].second; discordance = N_discord / double(N); out << "\t" << N << "\t" << N_discord << "\t" << discordance << endl; } delete e1; delete e2; out.close(); } void variant_file::output_discordance_by_site(const string &output_file_prefix, variant_file &diff_variant_file, const string &indv_ID_map_file) { LOG.printLOG("Outputting Discordance By Site...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_variant_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, indv_ID_map_file); string CHROM; vector variant_line; int POS; int s1, s2, indv1, indv2; entry * e1 = get_entry_object(N_indv); entry * e2 = diff_variant_file.get_entry_object(diff_variant_file.N_indv); string output_file = output_file_prefix + ".diff.sites"; ofstream diffsites(output_file.c_str()); if (!diffsites.is_open()) LOG.error("Could not open Sites Differences File: " + output_file, 3); diffsites << "CHROM\tPOS\tFILES\tMATCHING_ALLELES\tN_COMMON_CALLED\tN_DISCORD\tDISCORDANCE" << endl; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = 
CHROMPOS_to_filepos_pair_it->first.second; diffsites << CHROM << "\t" << POS; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; bool data_in_both = true; // Read entries from file (if available) if (s1 != -1) { get_entry(s1, variant_line); e1->reset(variant_line); } else data_in_both = false; if (s2 != -1) { diff_variant_file.get_entry(s2, variant_line); e2->reset(variant_line); } else data_in_both = false; if (data_in_both) diffsites << "\tB"; else if ((s1 != -1) && (s2 == -1)) diffsites << "\t1"; else if ((s1 == -1) && (s2 != -1)) diffsites << "\t2"; else LOG.error("Unhandled condition"); e1->parse_basic_entry(true); e2->parse_basic_entry(true); // Set the reference to the non-missing entry (if available) string REF = e1->get_REF(); string REF2 = e2->get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if ((REF.size() != REF2.size()) || ((REF != REF2) && (REF2 != "N") && (REF != "N"))) { LOG.one_off_warning("Non-matching REF. Skipping all such sites."); continue; } // Do the alternative alleles match? 
string ALT, ALT2; ALT = e1->get_ALT(); ALT2 = e2->get_ALT(); bool alleles_match = ((ALT == ALT2) && (REF == REF2)); diffsites << "\t" << alleles_match; e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair genotype1, genotype2; pair geno_ids1, geno_ids2; pair missing_genotype(".","."); pair missing_id(-1,-1); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int N_missing_1=0, N_missing_2=0; unsigned int N_discord=0; unsigned int N_concord_non_missing=0; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files if (alleles_match) { // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if ((geno_ids1 != missing_id) && (geno_ids2 != missing_id)) { N_common_called++; if (((geno_ids1.first == geno_ids2.first) && (geno_ids1.second == geno_ids2.second)) || ((geno_ids1.first == geno_ids2.second) && (geno_ids1.second == geno_ids2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((geno_ids1 == missing_id) && (geno_ids2 == missing_id)) { // Both missing N_missing_1++; N_missing_2++; } else if (geno_ids1 != missing_id) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (geno_ids2 != missing_id) { // Genotype 2 is not missing, genotype 1 is. 
N_missing_1++; } else LOG.error("Unknown condition"); } else { // Alleles don't match, so need to be more careful and compare strings e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) && (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Match N_concord_non_missing++; } else { // Mismatch N_discord++; } } else if ((genotype1 == missing_genotype) && (genotype2 == missing_genotype)) { // Both missing N_missing_1++; N_missing_2++; } else if (genotype1 != missing_genotype) { // Genotype 1 is not missing, genotype 2 is. N_missing_2++; } else if (genotype2 != missing_genotype) { // Genotype 2 is not missing, genotype 1 is. N_missing_1++; } else LOG.error("Unknown condition"); } } double discordance = N_discord / double(N_common_called); diffsites << "\t" << N_common_called << "\t" << N_discord << "\t" << discordance; diffsites << endl; } delete e1; delete e2; diffsites.close(); } void variant_file::output_discordance_matrix(const string &output_file_prefix, variant_file &diff_variant_file, const string &indv_ID_map_file) { LOG.printLOG("Outputting Discordance Matrix\n\tFor bi-allelic loci, called in both files, with matching alleles only...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_variant_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, indv_ID_map_file); vector variant_line; int s1, s2, indv1, indv2; entry * e1 = get_entry_object(N_indv); entry * e2 = diff_variant_file.get_entry_object(diff_variant_file.N_indv); vector > discordance_matrix(4, vector(4, 0)); for 
(CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; // Read entries from file (if available) if (s1 != -1) { get_entry(s1, variant_line); e1->reset(variant_line); } if (s2 != -1) { diff_variant_file.get_entry(s2, variant_line); e2->reset(variant_line); } e1->parse_basic_entry(true); e2->parse_basic_entry(true); if ((e1->get_N_alleles() != 2) || (e2->get_N_alleles() != 2)) continue; // Set the reference to the non-missing entry (if available) string REF = e1->get_REF(); string REF2 = e2->get_REF(); if (REF == "N") REF = REF2; if (REF2 == "N") REF2 = REF; if (REF.size() != REF2.size()) continue; if ((REF != REF2) && (REF2 != "N") && (REF != "N")) continue; // Do the alternative alleles match? string ALT, ALT2; ALT = e1->get_ALT(); ALT2 = e2->get_ALT(); bool alleles_match = (ALT == ALT2) && (REF == REF2); if (alleles_match == false) continue; e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair geno_ids1, geno_ids2; int N1, N2; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files // Alleles match, so can compare ids instead of strings e1->get_indv_GENOTYPE_ids(indv1, geno_ids1); e2->get_indv_GENOTYPE_ids(indv2, geno_ids2); if (((geno_ids1.first != -1) && (geno_ids1.second == -1)) || ((geno_ids2.first != -1) && (geno_ids2.second == -1))) { // Haploid LOG.one_off_warning("***Warning: Haploid chromosomes not counted!***"); continue; } N1 = geno_ids1.first + geno_ids1.second; N2 = geno_ids2.first + geno_ids2.second; if ((N1 == 
-1) || (N1 < -2) || (N1 > 2)) LOG.error("Unhandled case"); if ((N2 == -1) || (N2 < -2) || (N2 > 2)) LOG.error("Unhandled case"); if (N1 == -2) N1 = 3; if (N2 == -2) N2 = 3; discordance_matrix[N1][N2]++; } } string output_file = output_file_prefix + ".diff.discordance_matrix"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open Discordance Matrix File: " + output_file, 3); out << "-\tN_0/0_file1\tN_0/1_file1\tN_1/1_file1\tN_./._file1" << endl; out << "N_0/0_file2\t" << discordance_matrix[0][0] << "\t" << discordance_matrix[1][0] << "\t" << discordance_matrix[2][0] << "\t" << discordance_matrix[3][0] << endl; out << "N_0/1_file2\t" << discordance_matrix[0][1] << "\t" << discordance_matrix[1][1] << "\t" << discordance_matrix[2][1] << "\t" << discordance_matrix[3][1] << endl; out << "N_1/1_file2\t" << discordance_matrix[0][2] << "\t" << discordance_matrix[1][2] << "\t" << discordance_matrix[2][2] << "\t" << discordance_matrix[3][2] << endl; out << "N_./._file2\t" << discordance_matrix[0][3] << "\t" << discordance_matrix[1][3] << "\t" << discordance_matrix[2][3] << "\t" << discordance_matrix[3][3] << endl; out.close(); delete e1; delete e2; } void variant_file::output_switch_error(const string &output_file_prefix, variant_file &diff_variant_file, const string &indv_ID_map_file) { LOG.printLOG("Outputting Phase Switch Errors...\n"); map, pair > CHROMPOS_to_filepos_pair; map, pair >::iterator CHROMPOS_to_filepos_pair_it; return_site_union(diff_variant_file, CHROMPOS_to_filepos_pair); map > combined_individuals; map >::iterator combined_individuals_it; return_indv_union(diff_variant_file, combined_individuals, indv_ID_map_file); string CHROM; vector variant_line; int POS; int s1, s2, indv1, indv2; entry * e1 = get_entry_object(N_indv); entry * e2 = diff_variant_file.get_entry_object(diff_variant_file.N_indv); string output_file = output_file_prefix + ".diff.switch"; ofstream switcherror(output_file.c_str()); if (!switcherror.is_open()) 
LOG.error("Could not open Switch Error file: " + output_file, 4); switcherror << "CHROM\tPOS\tINDV" << endl; unsigned int N_combined_indv = combined_individuals.size(); vector N_phased_het_sites(N_combined_indv, 0); vector N_switch_errors(N_combined_indv, 0); pair missing_genotype(".","."); vector > prev_geno_file1(N_combined_indv, missing_genotype); vector > prev_geno_file2(N_combined_indv, missing_genotype); pair file1_hap1, file1_hap2, file2_hap1; for (CHROMPOS_to_filepos_pair_it=CHROMPOS_to_filepos_pair.begin(); CHROMPOS_to_filepos_pair_it != CHROMPOS_to_filepos_pair.end(); ++CHROMPOS_to_filepos_pair_it) { CHROM = CHROMPOS_to_filepos_pair_it->first.first; POS = CHROMPOS_to_filepos_pair_it->first.second; s1 = CHROMPOS_to_filepos_pair_it->second.first; s2 = CHROMPOS_to_filepos_pair_it->second.second; // Read entries from file (if available) if (s1 != -1) { get_entry(s1, variant_line); e1->reset(variant_line); } if (s2 != -1) { diff_variant_file.get_entry(s2, variant_line); e2->reset(variant_line); } e1->parse_basic_entry(true); e2->parse_basic_entry(true); e1->parse_full_entry(true); e1->parse_genotype_entries(true); e2->parse_full_entry(true); e2->parse_genotype_entries(true); pair genotype1, genotype2; pair missing_genotype(".","."); unsigned int N_common_called=0; // Number of genotypes called in both files unsigned int indv_count=0; // Bug fix applied (#3354189) - July 5th 2011 for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it, indv_count++) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if ((indv1 == -1) || (indv2 == -1)) continue; // Individual not found in one of the files e1->get_indv_GENOTYPE_strings(indv1, genotype1); e2->get_indv_GENOTYPE_strings(indv2, genotype2); if ((genotype1 != missing_genotype) && (genotype2 != missing_genotype)) { // No missing data N_common_called++; if (((genotype1.first == genotype2.first) 
&& (genotype1.second == genotype2.second)) || ((genotype1.first == genotype2.second) && (genotype1.second == genotype2.first)) ) { // Have a matching genotypes in files 1 and 2 if (genotype1.first != genotype1.second) { // It's a heterozgote char phase1, phase2; phase1 = e1->get_indv_PHASE(indv1); phase2 = e2->get_indv_PHASE(indv2); if ((phase1 == '|') && (phase2 == '|')) { // Calculate Phasing error (switch error) N_phased_het_sites[indv_count]++; file1_hap1 = make_pair(prev_geno_file1[indv_count].first, genotype1.first); file1_hap2 = make_pair(prev_geno_file1[indv_count].second, genotype1.second); file2_hap1 = make_pair(prev_geno_file2[indv_count].first, genotype2.first); if ((file2_hap1 != file1_hap1) && (file2_hap1 != file1_hap2)) { // Must be a switch error string indv_id; N_switch_errors[indv_count]++; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_variant_file.indv[indv2]; switcherror << CHROM << "\t" << POS << "\t" << indv_id << endl; } prev_geno_file1[indv_count] = genotype1; prev_geno_file2[indv_count] = genotype2; } } } } } } switcherror.close(); output_file = output_file_prefix + ".diff.indv.switch"; ofstream idiscord(output_file.c_str()); if (!idiscord.is_open()) LOG.error("Could not open Individual Discordance File: " + output_file, 3); idiscord << "INDV\tN_COMMON_PHASED_HET\tN_SWITCH\tSWITCH" << endl; unsigned int indv_count=0; double switch_error; string indv_id; for (combined_individuals_it=combined_individuals.begin(); combined_individuals_it!=combined_individuals.end(); ++combined_individuals_it) { indv1 = combined_individuals_it->second.first; indv2 = combined_individuals_it->second.second; if (indv1 != -1) indv_id = indv[indv1]; else indv_id = diff_variant_file.indv[indv2]; if (N_phased_het_sites[indv_count] > 0) switch_error = double(N_switch_errors[indv_count]) / N_phased_het_sites[indv_count]; else switch_error = 0; idiscord << indv_id << "\t" << N_phased_het_sites[indv_count] << "\t" << N_switch_errors[indv_count] << "\t" << 
switch_error << endl; indv_count++; } delete e1; delete e2; idiscord.close(); } vcftools_0.1.11/cpp/vcf_entry.h0000644000000000000000000000733312156354766015170 0ustar rootroot/* * vcf_entry.h * * Created on: Aug 19, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #ifndef VCF_ENTRY_H_ #define VCF_ENTRY_H_ #include #include #include #include #include #include #include #include #include #include #include "entry.h" #include "output_log.h" extern output_log LOG; using namespace std; class vcf_entry : public entry { public: vcf_entry(const unsigned int N_indv); vcf_entry(const unsigned int N_indv, const vector &data_line); virtual ~vcf_entry(); static string convert_line; void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false); void parse_full_entry(bool parse_FORMAT=true); void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_FORMAT(); void reset(const vector &data_line); void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out); void set_ALT(const string &in); void set_QUAL(const double in); void set_FILTER(const string &FILTER_str); void set_FORMAT(const string &in); void set_INFO(const string &INFO_str); void add_FORMAT_entry(const string &in, unsigned int pos); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const string &in); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in); void set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2); void set_indv_GENOTYPE_ids(unsigned int indv, const pair &in); void set_indv_PHASE(unsigned int indv, char in); void set_indv_GQUALITY(unsigned int indv, double in); void set_indv_DEPTH(unsigned int indv, int in); void 
set_indv_GFILTER(unsigned int indv, const string &in); void add_indv_GFILTER(unsigned int indv, const string &in); static int add_INFO_descriptor(const string &in, unsigned int index); static int add_FILTER_descriptor(const string &in, unsigned int index); static int add_FORMAT_descriptor(const string &in, unsigned int index); static void add_CONTIG_descriptor(const string &in, unsigned int index); void print(ostream &out); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype); void print_bcf(BGZF* out); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype); void filter_genotypes_by_depth(vector &include_genotype_out, int min_depth, int max_depth); void filter_genotypes_by_quality(vector &include_genotype_out, double min_genotype_quality); void filter_genotypes_by_filter_status(vector &include_genotype_out, const set &filter_flags_to_remove, bool remove_all = false); private: string ALT_str, FILTER_str, INFO_str, FORMAT_str, QUAL_str; vector GENOTYPE_str; static map INFO_map; static map INFO_reverse_map; static map FILTER_map; static map FILTER_reverse_map; static map FORMAT_map; static map FORMAT_reverse_map; static map CONTIG_map; }; #endif /* VCF_ENTRY_H_ */ vcftools_0.1.11/cpp/header.h0000644000000000000000000000217412156354766014417 0ustar rootroot/* * header.h * * Created on: Apr 29, 2013 * Author: amarcketta */ #ifndef HEADER_H_ #define HEADER_H_ #include #include #include #include "entry.h" using namespace std; class header { public: map INFO_map; map FILTER_map; map FORMAT_map; map CONTIG_map; map CONTIG_reverse_map; map FILTER_reverse_map; map INFO_reverse_map; map FORMAT_reverse_map; header() {}; ~header() {}; int add_INFO_descriptor(const string &in, 
int index); int add_FILTER_descriptor(const string &in, int index); int add_FORMAT_descriptor(const string &in, int index); void add_CONTIG_descriptor(const string &in, int index); }; // //class Field_description //{ //public: // string ID; // int N_entries; // string N_entries_str; // string Type_str; // Type_enum Type; // string Description; // string Length; // string Assembly; // // Field_description() : ID(""), N_entries(0), Type(Integer), Description("") {}; // ~Field_description() {}; //}; #endif /* HEADER_H_ */ vcftools_0.1.11/cpp/Makefile0000644000000000000000000000264412156354766014460 0ustar rootroot# Make file for vcftools # Author: Adam Auton # ($Revision: 230 $) # Compiler CC = gcc CPP = g++ # Output executable EXECUTABLE = vcftools # Flag used to turn on compilation of PCA routines ifndef VCFTOOLS_PCA VCFTOOLS_PCA = 0 endif # Compiler flags CFLAGS = -O2 -m64 #CFLAGS = -Wall -O2 -pg -m64 CPPFLAGS = -O2 -D_FILE_OFFSET_BITS=64 #CPPFLAGS = -O2 -Wall -pg -D_FILE_OFFSET_BITS=64 # Included libraries (zlib) LIB = -lz #LIB = -lz -I/opt/local/include/ -L/opt/local/lib/ OBJS = vcftools.o bcf_file.o vcf_file.o variant_file.o \ bcf_entry.o vcf_entry.o entry.o entry_setters.o entry_getters.o \ vcf_entry_setters.o bcf_entry_setters.o variant_file_filters.o \ variant_file_output.o variant_file_format_convert.o \ variant_file_diff.o header.o parameters.o \ variant_file_index.o \ output_log.o bgzf.o gamma.o ifeq ($(VCFTOOLS_PCA), 1) # Define flag for PCA routine compilation CPPFLAGS += -DVCFTOOLS_PCA # Add LAPACK library LIB += -llapack # Add PCA source code OBJS+= dgeev.o endif vcftools: $(OBJS) $(CPP) $(CPPFLAGS) $(OBJS) -o vcftools $(LIB) ifdef BINDIR cp $(CURDIR)/$@ $(BINDIR)/$@ endif bgzf: bgzf.c $(CC) -c $(CFLAGS) $(FLAGS) bgzf.c $(LIB) -o bgzf.o # pull in dependency info for *existing* .o files -include $(OBJS:.o=.d) %.o: %.cpp $(CPP) -c $(CPPFLAGS) $*.cpp -o $*.o $(CPP) -MM $(CPPFLAGS) $*.cpp > $*.d # remove compilation products clean: @rm -f vcftools *.o *.d 
@rm -f $(BINDIR)/vcftools vcftools_0.1.11/cpp/variant_file_filters.cpp0000644000000000000000000010263312156354766017716 0ustar rootroot/* * variant_file_filters.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 148 $) */ #include "variant_file.h" void variant_file::filter_genotypes_by_quality(double min_genotype_quality) { // Filter genotypes by quality if ((min_genotype_quality <= 0) || (has_genotypes == false)) return; if (has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Quality."); LOG.printLOG("Filtering out Genotypes with Quality less than " + output_log::dbl2str(min_genotype_quality,0) + "\n"); vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_genotype_entries(false, true); e->filter_genotypes_by_quality(include_genotype[s], min_genotype_quality); } delete e; } void variant_file::filter_genotypes_by_depth(int min_depth, int max_depth) { // Filter genotypes by depth if ((min_depth <= 0) && (max_depth == numeric_limits::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Depth."); LOG.printLOG("Filtering out Genotypes with Depth less than " + output_log::dbl2str(min_depth,0) + " and greater than " + output_log::dbl2str(max_depth, 0) + "\n"); vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_genotype_entries(false, false, true); e->filter_genotypes_by_depth(include_genotype[s], min_depth, max_depth); } delete e; } void variant_file::filter_genotypes_by_filter_flag(const set &filter_flags_to_remove, bool remove_all) { // Filter genotypes by Filter Flags if ((remove_all == false) && (filter_flags_to_remove.size() == 0)) return; if (remove_all == true) LOG.printLOG("Filtering out all genotypes with FILTER flag.\n"); else LOG.printLOG("Filtering out genotypes by Filter Status.\n"); if (has_genotypes == 
false) LOG.error("Require Genotypes in variant file in order to filter genotypes by Filter Flag."); vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_genotype_entries(false, false, false, true); e->filter_genotypes_by_filter_status(include_genotype[s], filter_flags_to_remove, remove_all); } delete e; } void variant_file::filter_individuals(const set &indv_to_keep, const set &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude) { // Filter individuals by user provided lists if (keep_then_exclude) { filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename); filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename); } else { filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename); filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename); } } void variant_file::filter_individuals_by_keep_list(const set &indv_to_keep, const string &indv_to_keep_filename) { // Filter individuals by user provided list if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0)) return; LOG.printLOG("Keeping individuals in 'keep' list\n"); set indv_to_keep_copy = indv_to_keep; if (indv_to_keep_filename != "") { ifstream infile(indv_to_keep_filename.c_str()); if (!infile.is_open()) LOG.error("Could not open Individual file:" + indv_to_keep_filename, 1); string line; string tmp_indv; stringstream ss; while (!infile.eof()) { getline(infile, line); ss.str(line); ss >> tmp_indv; indv_to_keep_copy.insert(tmp_indv); ss.clear(); } infile.close(); } for (unsigned int ui=0; ui &indv_to_exclude, const string &indv_to_exclude_filename) { // Filter individuals by user provided list if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0)) return; LOG.printLOG("Excluding individuals in 'exclude' list\n"); set indv_to_exclude_copy = indv_to_exclude; if (indv_to_exclude_filename != "") { ifstream 
infile(indv_to_exclude_filename.c_str()); if (!infile.is_open()) { LOG.error("Could not open Individual file:" + indv_to_exclude_filename, 1); } string line; string tmp_indv; stringstream ss; while (!infile.eof()) { getline(infile, line); ss.str(line); ss >> tmp_indv; indv_to_exclude_copy.insert(tmp_indv); ss.clear(); } infile.close(); } for (unsigned int ui=0; ui genotype; vector N_sites_included(N_indv, 0); vector N_missing(N_indv, 0); vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); for (ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); if (genotype.first != -1) { N_missing[ui]++; } N_sites_included[ui]++; } } } for (ui=0; ui::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in variant file in order to filter individuals by mean depth"); LOG.printLOG("Filtering individuals by mean depth\n"); unsigned int ui; vector N_sites_included(N_indv, 0); vector depth_sum(N_indv,0.0); int depth; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); for (ui=0; uiparse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { depth_sum[ui] += depth; N_sites_included[ui]++; } } } } for (ui=0; ui max_mean_depth)) include_indv[ui] = false; } delete e; } void variant_file::filter_individuals_by_phase() { // Filter individuals that are completely unphased. // TODO: Alter this to allow for a max/min level of unphased-ness. 
LOG.printLOG("Filtering Unphased Individuals\n"); if (has_genotypes == false) LOG.error("Require Genotypes in variant file to filter by Phase."); unsigned int ui, s; vector indv_count(N_indv, 0); vector indv_count_unphased(N_indv, 0); vector variant_line; entry *e = get_entry_object(N_indv); for (s=0; sreset(variant_line); for (ui=0; uiparse_genotype_entry(ui, true); indv_count[ui]++; if (e->get_indv_PHASE(ui) != '|') indv_count_unphased[ui]++; } } for (ui=0; ui keep_index(N_kept_indv); int count = 0; for (unsigned int ui=0; ui &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude) { // Filter sites by user provided lists if (keep_then_exclude) { filter_sites_to_keep(snps_to_keep, snps_to_keep_file); filter_sites_to_exclude(snps_to_exclude_file); } else { filter_sites_to_exclude(snps_to_exclude_file); filter_sites_to_keep(snps_to_keep, snps_to_keep_file); } } void variant_file::filter_sites_to_keep(const set &snps_to_keep, const string &snps_to_keep_file) { // Filter sites by user provided list if ((snps_to_keep.size() == 0) && (snps_to_keep_file == "")) return; set local_snps_to_keep = snps_to_keep; LOG.printLOG("Keeping sites by user-supplied list\n"); if (snps_to_keep_file != "") { ifstream in(snps_to_keep_file.c_str()); string tmp; if (!in.is_open()) { LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0); } while (!in.eof()) { in >> tmp; local_snps_to_keep.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); if (local_snps_to_keep.find(e->get_ID()) == local_snps_to_keep.end()) include_entry[s] = false; } delete e; } void variant_file::filter_sites_to_exclude(const string &snps_to_exclude_file) { // Filter sites by user provided list if (snps_to_exclude_file == "") return; LOG.printLOG("Excluding sites by user-supplied list\n"); set snps_to_exclude; if 
(snps_to_exclude_file != "") { ifstream in(snps_to_exclude_file.c_str()); string tmp; if (!in.is_open()) { LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0); } while (!in.eof()) { in >> tmp; snps_to_exclude.insert(tmp); in.ignore(numeric_limits::max(), '\n'); } in.close(); } vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); if (snps_to_exclude.find(e->get_ID()) != snps_to_exclude.end()) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_quality(double min_quality) { // Filter sites by quality if (min_quality < 0) return; LOG.printLOG("Filtering sites with Quality less than " + output_log::dbl2str(min_quality,0) + "\n"); unsigned int s; vector variant_line; entry *e = get_entry_object(N_indv); for (s=0; sreset(variant_line); e->parse_basic_entry(true); string alt_allele = e->get_ALT_allele(0); // The QUAL field has different definitions depending on the state of the // alternative allele. Here I treat them separately, although in this case // it is unnecessary. 
if ((alt_allele == ".") || (alt_allele == "")) { // The case that the alternative allele is unknown // QUAL is -10log_10 p(variant) if (e->get_QUAL() < min_quality) include_entry[s] = false; } else { // The normal case // QUAL is -10log_10 p(no variant) if (e->get_QUAL() < min_quality) include_entry[s] = false; } } delete e; } void variant_file::filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth) { // Filter sites by mean depth if ((min_mean_depth <= 0) && (max_mean_depth == numeric_limits::max())) return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to filter sites by mean depth"); LOG.printLOG("Filtering sites by mean depth\n"); int depth; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); unsigned int N_indv_included = 0; double depth_sum = 0.0; for (unsigned int ui=0; uiparse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { depth_sum += depth; } N_indv_included++; } } double mean_depth = depth_sum / N_indv_included; if ((mean_depth < min_mean_depth) || (mean_depth > max_mean_depth)) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_position(const string &chr, int start_pos, int end_pos) { // Filter sites by user provided position range if ((chr == "") || ((start_pos == -1) && (end_pos==numeric_limits::max()))) return; LOG.printLOG("Filtering sites by chromosome and/or position\n"); string chrom; int pos1; for (unsigned int s=0; s end_pos)) include_entry[s] = false; } else include_entry[s] = false; } } void variant_file::filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file) { // Filter sites by a user defined file containing a list of positions if ((positions_file == "") && (exclude_positions_file == "")) return; LOG.printLOG("Filtering sites by include/exclude positions files\n"); string chr; int pos1, idx; unsigned int N_chr=0; map chr_to_idx; bool 
keep=false, exclude=false; vector< set > keep_positions, exclude_positions; stringstream ss; string line; unsigned int gzMAX_LINE_LEN = 1024*1024; char *gz_readbuffer = new char[gzMAX_LINE_LEN]; if (positions_file != "") { gzFile gz_in = gzopen(positions_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + positions_file); keep = true; while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } gzclose(gz_in); } if (exclude_positions_file != "") { gzFile gz_in = gzopen(exclude_positions_file.c_str(), "rb"); if (gz_in == NULL) LOG.error("Could not open Positions file: " + exclude_positions_file); exclude = true; while (!gzeof(gz_in)) { line = ""; bool again = true; while (again == true) { gzgets(gz_in, gz_readbuffer, gzMAX_LINE_LEN); line.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } if (line[0] == '#') continue; line.erase( line.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); exclude_positions.resize(N_chr); } idx = chr_to_idx[chr]; exclude_positions[idx].insert(pos1); } gzclose(gz_in); } delete [] gz_readbuffer; for (unsigned int s=0; s chr_to_idx; vector< deque > > lims; vector variant_line; BED.ignore(numeric_limits::max(), '\n'); // Ignore header unsigned int N_BED_entries=0; while (!BED.eof()) { BED >> chr >> pos1 >> pos2; BED.ignore(numeric_limits::max(), '\n'); if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); lims.resize(N_chr); } idx = chr_to_idx[chr]; lims[idx].push_back(make_pair(pos1,pos2)); N_BED_entries++; } BED.close(); LOG.printLOG("\tRead " + output_log::int2str(N_BED_entries) + " BED file entries.\n"); for (unsigned int ui=0; ui min_ui(lims.size(), 0); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->get_CHROM(chr); pos1 = e->get_POS(); pos2 = pos1; unsigned int N_alleles = e->get_N_alleles(); for (int i=0; i<(int)N_alleles; i++) pos2 = max(pos2, (int)(pos1 + e->get_allele(i).length() - 1)); if (BED_exclude == false) { // Exclude sites not in BED file if (chr_to_idx.find(chr) == chr_to_idx.end()) include_entry[s] = false; else { idx = chr_to_idx[chr]; bool found=false; unsigned int max_ui = lims[idx].size(); for (unsigned int ui=min_ui[idx]; ui lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == false) include_entry[s] = false; } } else { // Exclude sites in BED file if (chr_to_idx.find(chr) != chr_to_idx.end()) { idx = chr_to_idx[chr]; bool found=false; unsigned int max_ui = lims[idx].size(); for (unsigned int ui=min_ui[idx]; ui 
lims[idx][ui].first) && (pos1 <= lims[idx][ui].second)) || // Start pos inside bin ((pos2 > lims[idx][ui].first) && (pos2 <= lims[idx][ui].second)) || // End pos inside bin ((pos1 <= lims[idx][ui].first) && (pos2 >= lims[idx][ui].second))) // Variant spans bin { found=true; break; } else if (pos1 > lims[idx][ui].second) min_ui[idx] = ui+1; } if (found == true) include_entry[s] = false; } } } } void variant_file::filter_sites_by_mask(const string &mask_file, bool invert_mask, int min_kept_mask_value) { // Filter sites on the basis of a fasta-like mask file. if (mask_file == "") return; if (invert_mask == false) LOG.printLOG("Filtering sites by mask file\n"); else LOG.printLOG("Filtering sites by inverted mask file\n"); ifstream mask(mask_file.c_str()); if (!mask.is_open()) LOG.error("Could not open mask file: " + mask_file); string line; string next_chr=""; vector variant_line; unsigned int next_pos = 0; unsigned int next_s = 0; unsigned int current_pos = 1; string current_header = ""; bool keep; entry *e = get_entry_object(N_indv); while (!mask.eof()) { getline(mask, line); line.erase( line.find_last_not_of(" \t") + 1); if (line[0] == '>') { // Header current_header = line.substr(1, line.find_first_of(" \t")-1); current_pos = 1; for (unsigned int s=next_s; sreset(variant_line); e->parse_basic_entry(); e->get_CHROM(next_chr); if (next_chr == current_header) { next_pos = (unsigned)e->get_POS(); next_s = s; break; } else { include_entry[s] = false; } } } } else { if ((current_pos + line.size() >= next_pos) && (next_chr == current_header)) { for (unsigned int ui=0; uireset(variant_line); e->parse_basic_entry(); e->get_CHROM(next_chr); next_pos = (unsigned)e->get_POS(); next_s = s; break; } } } } } current_pos += line.size(); } } mask.close(); delete e; // Remaining sites aren't covered by mask, so exclude for (unsigned int s=next_s; s::max())) return; LOG.printLOG("Filtering sites by number of alleles\n"); int N_alleles; vector variant_line; entry *e = 
get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if ((N_alleles < min_alleles) || (N_alleles > max_alleles)) { include_entry[s] = false; } } delete e; } void variant_file::filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate) { // Filter sites so that all allele frequencies are between limits if ((min_maf <= 0.0) && (max_maf >= 1.0) && (min_site_call_rate <= 0) && (min_non_ref_af <= 0.0) && (max_non_ref_af >= 1.0)) return; if (has_genotypes == false) LOG.error("Require Genotypes in variant file to filter by frequency and/or call rate"); LOG.printLOG("Filtering sites by allele frequency and call rate\n"); unsigned int N_alleles; unsigned int N_non_missing_chr; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); vector allele_counts; e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); double freq, folded_freq; double maf=numeric_limits::max(); for (unsigned int ui=0; ui 0) && ((freq < min_non_ref_af) || (freq > max_non_ref_af))) include_entry[s] = false; } if ((maf < min_maf) || (maf > max_maf)) include_entry[s] = false; double call_rate = N_non_missing_chr / double(e->get_N_chr(include_indv, include_genotype[s])); if (call_rate < min_site_call_rate) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels) { if ((keep_only_indels == false) && (remove_indels == false)) return; if ((keep_only_indels == true) && (remove_indels == true)) LOG.error("Can't both keep and remove all indels!"); LOG.printLOG("Filtering sites by allele type\n"); vector variant_line; entry *e = get_entry_object(N_indv); string allele; unsigned int ref_len, N_alleles; bool 
is_indel; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); is_indel = false; allele = e->get_REF(); ref_len = allele.size(); N_alleles = e->get_N_alleles(); for (unsigned int ui=1; uiget_allele(ui, allele); if (allele.size() != ref_len) { is_indel = true; break; } } if (keep_only_indels == true) { if (is_indel == false) include_entry[s] = false; } else if (remove_indels == true) { if (is_indel == true) include_entry[s] = false; } } } void variant_file::filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count) { if ((min_mac <= 0) && (max_mac == numeric_limits::max()) && (min_non_ref_ac <= 0) && (max_non_ref_ac == numeric_limits::max()) && (max_missing_call_count == numeric_limits::max())) return; // Filter sites so that all allele counts are between limits if (has_genotypes == false) LOG.error("Require Genotypes in variant file to filter by allele counts and/or missing data"); LOG.printLOG("Filtering sites by allele count and missing data\n"); unsigned int N_alleles, N_chr, N_non_missing_chr; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); vector allele_counts; e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); N_chr = e->get_N_chr(include_indv, include_genotype[s]); int mac = numeric_limits::max(); for (unsigned int ui=0; ui 0) && ((allele_counts[ui] < min_non_ref_ac) || (allele_counts[ui] > max_non_ref_ac))) include_entry[s] = false; } if ((mac < min_mac) || (mac > max_mac)) include_entry[s] = false; if ((N_chr-N_non_missing_chr) > max_missing_call_count) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_HWE_pvalue(double min_HWE_pvalue) { // Filter sites by HWE p-value if (min_HWE_pvalue <= 0) return; if (has_genotypes == false) LOG.error("Require 
Genotypes in variant file to filter sites by HWE."); // Note this assumes Biallelic SNPs. LOG.printLOG("Filtering sites by HWE p-value (only including bi-allelic sites)\n"); unsigned int b11, b12, b22; double p; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->parse_genotype_entries(true); e->get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22); p = entry::SNPHWE(b12, b11, b22); if (p < min_HWE_pvalue) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_filter_status(const set &filter_flags_to_remove, const set &filter_flags_to_keep, bool remove_all) { // Filter sites by entries in the FILTER field. if ((remove_all == false) && (filter_flags_to_remove.size() == 0) && (filter_flags_to_keep.size() == 0)) return; LOG.printLOG("Filtering sites by FILTER Status.\n"); vector FILTERs; vector variant_line; unsigned int N_to_remove = filter_flags_to_remove.size(); unsigned int N_to_keep = filter_flags_to_keep.size(); entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(false, true); e->get_FILTER_vector(FILTERs); if (N_to_keep > 0) { bool keep = false; for (unsigned int ui=0; ui= 1) && (FILTERs[0] == "PASS") ) continue; else if ((remove_all == true) && (FILTERs.size() > 0)) include_entry[s] = false; else if (N_to_remove > 0) { for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); for (unsigned int ui=0; uiparse_genotype_entry(ui, true); count++; if (e->get_indv_PHASE(ui) != '|') count_unphased++; } if (count_unphased > 0) include_entry[s] = false; } delete e; } void variant_file::filter_sites_by_thinning(int min_SNP_distance) { // Filter sites so that no two SNPs are within some minimum distance if (min_SNP_distance < 1) return; LOG.printLOG("Filtering sites so that no two sites are within " + output_log::int2str(min_SNP_distance) + 
"bp\n"); string CHROM, last_CHROM=""; int POS, last_POS = -1; int distance_from_last_SNP; for (unsigned int s=0; s &flags_to_remove, const set &flags_to_keep) { // Filter sites by entries in the INFO field. if ((flags_to_remove.size() == 0) && (flags_to_keep.size() == 0)) return; LOG.printLOG("Filtering sites by INFO flags.\n"); vector variant_line; string value; unsigned int N_to_remove = flags_to_remove.size(); unsigned int N_to_keep = flags_to_keep.size(); entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(false, false, true); if (N_to_keep > 0) { bool keep = false; for (set::iterator it=flags_to_keep.begin(); it != flags_to_keep.end(); ++it) { value = e->get_INFO_value(*it); if (value == "1") keep = true; } include_entry[s] = keep; } if (include_entry[s]==false) continue; if (N_to_remove > 0) { for (set::iterator it=flags_to_remove.begin(); it != flags_to_remove.end(); ++it) { value = e->get_INFO_value(*it); if (value == "1") { include_entry[s] = false; continue; } } } } delete e; } vcftools_0.1.11/cpp/header.cpp0000644000000000000000000001526412156354766014756 0ustar rootroot/* * header.cpp * * Created on: Apr 29, 2013 * Author: amarcketta */ #include "header.h" #include "entry.h" int header::add_INFO_descriptor(const string &in, int index) { Field_description I; vector tokens; entry::tokenize(in, ',', tokens); if (tokens.size() < 4) LOG.error("Expected 4 parts in INFO definition: " + in); vector entry; entry::tokenize(tokens[0], '=', entry); if (entry[0] == "ID") I.ID = entry[1]; else LOG.error("Expected ID entry as first field in INFO description: " + in); entry::tokenize(tokens[1], '=', entry); if (entry[0] == "Number") { if ((entry[1] == "A") || (entry[1] == "G")) { I.N_entries = -1; I.N_entries_str = entry[1]; } else{ I.N_entries = entry::str2int(entry[1]); I.N_entries_str = entry[1]; } } else LOG.error("Expected Number entry as second field in INFO description: " + in); entry::tokenize(tokens[2], '=', 
entry); if (entry[0] == "Type") { if (entry[1] == "Integer") { I.Type_str = "Integer"; I.Type = Integer; } else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type_str = "Float"; I.Type = Float;} else if (entry[1] == "Character") {I.Type_str = "Character"; I.Type = Character;} else if (entry[1] == "String") {I.Type_str = "String"; I.Type = String;} else if (entry[1] == "Flag") { I.Type = Flag; I.Type_str = "Flag"; if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in); } else LOG.error("Unknown Type in INFO meta-information: " + in); } else LOG.error("Expected Type entry as third field in INFO description: " + in); entry::tokenize(tokens[3], '=', entry); if (entry[0] == "Description") { I.Description = entry[1]; for (unsigned int i=4; i"); string details = in.substr(0, found_end-1); vector tokens; entry::tokenize(details, ',', tokens); Field_description I; if (tokens.size() < 4) LOG.error("Expected 4 parts in FORMAT definition: " + in); vector entry; entry::tokenize(tokens[0], '=', entry); if (entry[0] == "ID") I.ID = entry[1]; else LOG.error("Expected ID entry as first field in FORMAT description: " + in); entry::tokenize(tokens[1], '=', entry); if (entry[0] == "Number") { if ((entry[1] == "A") || (entry[1] == "G")) I.N_entries = -1; else I.N_entries = entry::str2int(entry[1]); I.N_entries_str = entry[1]; } else LOG.error("Expected Number entry as second field in FORMAT description: " + in); entry::tokenize(tokens[2], '=', entry); if (entry[0] == "Type") { if (entry[1] == "Integer") {I.Type = Integer;} else if ((entry[1] == "Float") || (entry[1] == "Numeric")) {I.Type = Float;} else if (entry[1] == "Character") {I.Type = Character;} else if (entry[1] == "String") {I.Type = String;} else if (entry[1] == "Flag") { I.Type = Flag; I.Type_str = "Flag"; if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in); } else LOG.error("Unknown Type in FORMAT meta-information: " + in); } else LOG.error("Expected Type entry as third field 
in FORMAT description: " + in); entry::tokenize(tokens[3], '=', entry); if (entry[0] == "Description") { I.Description = entry[1]; for (unsigned int i=4; i"); string details = in.substr(0, found_end-1); vector tokens; entry::tokenize(details, ',', tokens); Field_description I; bool id_found = false; vector entry; for (unsigned int ui=0; ui entry; entry::tokenize(tokens[0], '=', entry); if (entry[0] == "ID") I.ID = entry[1]; else LOG.error("Expected ID as first field in FILTER description: " + in); entry::tokenize(tokens[1], '=', entry); if (entry[0] == "Description") { Description = entry[1]; for (unsigned int i=2; i void dgeev(double **H, int n, double *Er, double *Ei); void dgeev(double **H, int n, double *Er, double *Ei, double **Evecs); double *dgeev_ctof(double **in, int rows, int cols); void dgeev_ftoc(double *in, double **out, int rows, int cols); void dgeev_sort(double *Er, double *Ei, int N); void dgeev_sort(double *Er, double *Ei, double **Evecs, int N); extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double *vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); #endif vcftools_0.1.11/cpp/vcf_entry_setters.cpp0000644000000000000000000003177412156354766017302 0ustar rootroot/* * vcf_entry_setters.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_entry.h" #include "entry.h" void vcf_entry::set_ALT(const string &in) { istringstream ss(in); string tmpstr; ALT.resize(0); while(!ss.eof()) { getline(ss, tmpstr, ','); add_ALT_allele(tmpstr); } parsed_ALT = true; } void vcf_entry::set_QUAL(const double in) { QUAL = in; } void vcf_entry::set_FORMAT(const string &in) { FORMAT.resize(0); FORMAT_to_idx.clear(); if (in.size() > 0) { istringstream ss(in); string tmpstr; unsigned int pos=0; while(!ss.eof()) { getline(ss, tmpstr, ':'); add_FORMAT_entry(tmpstr, pos); pos++; } } GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; if (FORMAT_to_idx.find("GT") 
!= FORMAT_to_idx.end()) GT_idx = FORMAT_to_idx["GT"]; if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end()) GQ_idx = FORMAT_to_idx["GQ"]; if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end()) DP_idx = FORMAT_to_idx["DP"]; if (FORMAT_to_idx.find("FT") != FORMAT_to_idx.end()) FT_idx = FORMAT_to_idx["FT"]; parsed_FORMAT = true; } void vcf_entry::add_FORMAT_entry(const string &in, unsigned int pos) { FORMAT.push_back(in); FORMAT_to_idx[in] = pos; } // The following function reads in a genotype from a '0/1'-like string. // Should handle haploid types to, but NOT polyploidy. void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const string &in) { ploidy.resize(N_indv); if ((in.size() == 3) && ((in.c_str()[1] == '/') || (in.c_str()[1] == '|'))) { // Fast, diploid case... ploidy[indv] = 2; set_indv_PHASE(indv, in.c_str()[1]); set_indv_GENOTYPE_alleles(indv, in.c_str()[0], in.c_str()[2]); } else { // More complex case... size_t pos = in.find_first_of("/|"); if (pos != string::npos) { // autosome ploidy[indv] = 2; set_indv_PHASE(indv, in[pos]); set_indv_GENOTYPE_alleles(indv, make_pair(in.substr(0,pos), in.substr(pos+1))); } else { // Male chrX, or chrY ploidy[indv] = 1; set_indv_PHASE(indv, '|'); set_indv_GENOTYPE_alleles(indv, make_pair(in.substr(0,pos), ".")); } // Check for polypoidy size_t pos2 = in.find_last_of("/|"); if (pos != pos2) LOG.error("Polypolidy found, and not supported by vcftools: " + CHROM + ":" + int2str(POS)); } parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { ploidy.resize(N_indv); ploidy[indv] = 2; set_indv_GENOTYPE_ids(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { ploidy.resize(N_indv); ploidy[indv] = 2; set_indv_GENOTYPE_alleles(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void 
vcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (in.first != ".") a.first = str2int(in.first); if (in.second != ".") a.second = str2int(in.second); GENOTYPE[indv] = a; parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (a1 != '.') a.first = a1 - '0'; if (a2 != '.') a.second = a2 - '0'; GENOTYPE[indv] = a; parsed_GT[indv] = true; } void vcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); GENOTYPE[indv] = in; } void vcf_entry::set_indv_PHASE(unsigned int indv, char in) { if (PHASE.size() == 0) PHASE.resize(N_indv, '/'); PHASE[indv] = in; parsed_GT[indv] = true; } void vcf_entry::set_indv_GQUALITY(unsigned int indv, double in) { parsed_GQ[indv] = true; if (in == -1) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); if (in > 99) in = 99; GQUALITY[indv] = in; } void vcf_entry::set_indv_DEPTH(unsigned int indv, int in) { parsed_DP[indv] = true; if (in == -1) { if (DEPTH.size() > 0) DEPTH[indv] = -1; return; } if (DEPTH.size() == 0) DEPTH.resize(N_indv, -1); DEPTH[indv] = in; } void vcf_entry::add_indv_GFILTER(unsigned int indv, const string &in) { if (GFILTER.size() == 0) GFILTER.resize(N_indv); if (in != ".") if (find(GFILTER[indv].begin(), GFILTER[indv].end(), in) == GFILTER[indv].end()) GFILTER[indv].push_back(in); parsed_FT[indv] = true; } void vcf_entry::set_indv_GFILTER(unsigned int indv, const string &in) { parsed_FT[indv] = true; if (GFILTER.size() == 0) GFILTER.resize(N_indv); GFILTER[indv].resize(0); if ((in.size() == 0) || (in == ".")) return; static istringstream ss; static string ith_FILTER; ss.clear(); ss.str(in); while (!ss.eof()) { getline(ss, ith_FILTER, 
';'); if ((ith_FILTER.size()==0) || (ith_FILTER == ".")) continue; // Don't bother storing "unfiltered" state. GFILTER[indv].push_back(ith_FILTER); } } void vcf_entry::set_FILTER(const string &FILTER_str) { FILTER.resize(0); passed_filters = false; if (FILTER_str == "PASS") passed_filters = true; else { if (FILTER_str != ".") { istringstream ss(FILTER_str); string ith_FILTER; while (!ss.eof()) { getline(ss, ith_FILTER, ';'); FILTER.push_back(ith_FILTER); } } } sort(FILTER.begin(), FILTER.end()); parsed_FILTER = true; } void vcf_entry::set_INFO(const string &INFO_str) { INFO.resize(0); if ((INFO_str.size() > 0) && (INFO_str != ".")) { istringstream ss(INFO_str); string tmpstr; while(!ss.eof()) { getline(ss, tmpstr, ';'); istringstream ss2(tmpstr); getline(ss2, tmpstr, '='); pair INFO_entry(tmpstr, "."); if (!ss2.eof()) { // If there is a value entry, read it now getline(ss2, tmpstr); INFO_entry.second = tmpstr; } else // Otherwise, set it equal to 1 INFO_entry.second = "1"; INFO.push_back(INFO_entry); } } parsed_INFO = true; } int vcf_entry::add_INFO_descriptor(const string &in, unsigned int index) { size_t found=in.find("##INFO="); if (found!=string::npos) { // Found an INFO descriptor size_t found_start=in.find_first_of("<"); size_t found_end=in.find_last_of(">"); string details = in.substr(found_start+1, found_end-found_start-1); Field_description I; vector tokens; tokenize(details, ',', tokens); if (tokens.size() < 4) LOG.error("Expected 4 parts in INFO definition: " + in); vector entry; tokenize(tokens[0], '=', entry); if (entry[0] == "ID") I.ID = entry[1]; else LOG.error("Expected ID entry as first field in INFO description: " + in); tokenize(tokens[1], '=', entry); if (entry[0] == "Number") { // TODO - handle 'A' and 'G' categories correctly. if ((entry[1] == "A") || (entry[1] == "G")) I.N_entries = -1; // Currently just treat as missing. 
else I.N_entries = str2int(entry[1]); } else LOG.error("Expected Number entry as second field in INFO description: " + in); tokenize(tokens[2], '=', entry); if (entry[0] == "Type") { if (entry[1] == "Integer") I.Type = Integer; else if ((entry[1] == "Float") || (entry[1] == "Numeric")) I.Type = Float; else if (entry[1] == "Character") I.Type = Character; else if (entry[1] == "String") I.Type = String; else if (entry[1] == "Flag") { I.Type = Flag; if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in); } else LOG.error("Unknown Type in INFO meta-information: " + in); } else LOG.error("Expected Type entry as third field in INFO description: " + in); tokenize(tokens[3], '=', entry); if (entry[0] == "Description") { I.Description = entry[1]; for (unsigned int i=4; i"); string details = in.substr(found_start+1, found_end-found_start-1); vector tokens; tokenize(details, ',', tokens); if (tokens.size() < 2) LOG.error("Expected 2 parts in FILTER definition: " + in); string ID, Description; vector entry; tokenize(tokens[0], '=', entry); if (entry[0] == "ID") ID = entry[1]; else LOG.error("Expected ID as first field in FILTER description: " + in); tokenize(tokens[1], '=', entry); if (entry[0] == "Description") { Description = entry[1]; for (unsigned int i=2; i"); string details = in.substr(found_start+1, found_end-found_start-1); vector tokens; tokenize(details, ',', tokens); Field_description I; if (tokens.size() < 4) LOG.error("Expected 4 parts in FORMAT definition: " + in); vector entry; tokenize(tokens[0], '=', entry); if (entry[0] == "ID") I.ID = entry[1]; else LOG.error("Expected ID entry as first field in FORMAT description: " + in); tokenize(tokens[1], '=', entry); if (entry[0] == "Number") { // TODO - handle 'A' and 'G' categories correctly. if ((entry[1] == "A") || (entry[1] == "G")) I.N_entries = -1; // Currently just treat as missing. 
else I.N_entries = str2int(entry[1]); } else LOG.error("Expected Number entry as second field in FORMAT description: " + in); tokenize(tokens[2], '=', entry); if (entry[0] == "Type") { if (entry[1] == "Integer") I.Type = Integer; else if ((entry[1] == "Float") || (entry[1] == "Numeric")) I.Type = Float; else if (entry[1] == "Character") I.Type = Character; else if (entry[1] == "String") I.Type = String; else if (entry[1] == "Flag") { I.Type = Flag; if (I.N_entries != 0) LOG.error("Flag Type must have 0 entries: " + in); } else LOG.error("Unknown Type in FORMAT meta-information: " + in); } else LOG.error("Expected Type entry as third field in FORMAT description: " + in); tokenize(tokens[3], '=', entry); if (entry[0] == "Description") { I.Description = entry[1]; for (unsigned int i=4; i"); string details = in.substr(found_start+1, found_end-found_start-1); vector tokens; entry::tokenize(details, ',', tokens); Field_description I; bool id_found = false; vector entry; for (unsigned int ui=0; ui 0) for (set::iterator it=snps_to_keep.begin(); it != snps_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--snp " + tmp + "\n"); } if (indv_to_keep.size() > 0) for (set::iterator it=indv_to_keep.begin(); it != indv_to_keep.end(); ++it) { string tmp = *it; LOG.printLOG("\t--indv " + tmp + "\n"); } if (indv_to_exclude.size() > 0) for (set::iterator it=indv_to_exclude.begin(); it != indv_to_exclude.end(); ++it) { string tmp = *it; LOG.printLOG("\t--remove-indv " + tmp + "\n"); } LOG.printLOG("\n"); } void parameters::print_help() { unsigned int i; string in_str; if (argv.size() <= 1) { // If there are no user parameters, display help. 
argv.push_back("--?"); print_help(); } for(i = 0; i < argv.size(); i++) { in_str = argv[i]; if ((in_str == "-h") || (in_str == "-?") || (in_str == "-help") || (in_str == "--?") || (in_str == "--help") || (in_str == "--h")) { cout << endl << "VCFtools (" << VCFTOOLS_VERSION << ")" << endl; cout << "\u00A9 Adam Auton 2009" << endl << endl; cout << "Process Variant Call Format files" << endl; cout << endl; cout << "For a list of options, please go to:" << endl; cout << "\thttp://vcftools.sourceforge.net/options.html" << endl; cout << endl; cout << "Questions, comments, and suggestions should be emailed to:" << endl; cout << "\tvcftools-help@lists.sourceforge.net" << endl; cout << endl; exit(0); } } } void parameters::check_parameters() { parameters defaults(0, 0); if (vcf_filename == "") error("VCF required.", 0); if (end_pos < start_pos) error("End position must be greater than Start position.", 1); if (((end_pos != numeric_limits::max()) || (start_pos != -1)) && (chrs_to_keep.size() != 1)) error("Require a single chromosome when specifying a range.", 2); if (max_maf < min_maf) error("Maximum MAF must be not be less than Minimum MAF.", 4); if (max_mac < min_mac) error("Maximum MAC must be not be less than Minimum MAC.", 4); if (min_maf != defaults.min_maf) { if ((min_maf < 0.0) || (min_maf > 1.0)) error("MAF must be between 0 and 1.", 4); } if (max_maf != defaults.max_maf) { if ((max_maf < 0.0) || (max_maf > 1.0)) error("Maximum MAF must be between 0 and 1.", 4); } if (min_non_ref_af != defaults.min_non_ref_af) { if ((min_non_ref_af < 0.0) || (min_non_ref_af > 1.0)) error("Non-Ref Allele Frequency must be between 0 and 1.", 4); } if (max_non_ref_af < min_non_ref_af) error("Maximum Non-Ref Allele Frequency must not be less that Minimum Non-Ref AF.", 4); if (max_non_ref_ac < min_non_ref_ac) error("Maximum Non-Ref Allele Count must not be less that Minimum Non-Ref AC.", 4); if ((min_site_call_rate > 1) || (min_indv_call_rate > 1)) error("Minimum Call rates cannot be 
greater than 1.", 5); if (max_alleles < min_alleles) error("Max Number of Alleles must be greater than Min Number of Alleles.", 6); if (max_mean_depth < min_mean_depth) error("Max Mean Depth must be greater the Min Mean Depth.", 7); if (max_indv_mean_depth < min_indv_mean_depth) error("Max Indv Mean Depth must be greater the Min Indv Mean Depth.", 8); if (max_genotype_depth < min_genotype_depth) error("Max Genotype Depth must be greater than Min Genotype Depth.", 9); if (((output_as_ldhat_phased == true) || (output_as_ldhat_unphased)) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting LDhat format.", 11); if ((output_BEAGLE_genotype_likelihoods_GL == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting Beagle likelihoods.", 11); if ((output_BEAGLE_genotype_likelihoods_PL == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting Beagle likelihoods.", 11); if (min_kept_mask_value > 9) error("Min Mask value must be between 0 and 9.", 14); if ((output_LROH == true) && (chrs_to_keep.size() != 1)) error("Require a chromosome (--chr) when outputting LROH.", 11); if (output_TsTv_bin_size < 0) error("TsTv bin size must be > 0",16); if (output_Tajima_D_bin_size < 0) error("Tajima D bin size must be > 0", 17); if (pi_window_size < 0) error("Pi Window size must be > 0", 18); if (output_SNP_density_bin_size < 0) error("SNP density bin size must be > 0", 18); } void parameters::error(string err_msg, int code) { LOG.printLOG("\n\nError: " + err_msg + "\n\n"); exit(code); } vcftools_0.1.11/cpp/bcf_file.cpp0000644000000000000000000003512112156354766015251 0ustar rootroot/* * bcf_file.cpp * * Created on: Dec 11, 2012 * Author: amarcketta */ #include "bcf_file.h" bcf_file::bcf_file(const string &fname, const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index, bool gatk) { filename = fname; has_body = false; has_file_format = false; has_header = false; has_meta = 
false; file_open = false; is_BGZF = false; bcf_format = true; big_endian = is_big_endian(); is_GATK = gatk; header_obj = header(); open(); scan_file(chrs_to_keep, exclude_chrs, force_write_index); } bcf_file::~bcf_file() { close(); } void bcf_file::open() { int ret; char magic[5]; char test_str[3] = {'B','C','F'}; if (filename.substr(filename.size()-4) == ".vcf") LOG.error("Filename ends in '.vcf'. Shouldn't you be using --vcf?\n"); if (filename.substr(filename.size()-7) == ".bcf") LOG.error("Filename ends in '.vcf.gz'. Shouldn't you be using --gzvcf?\n"); ret = bgzf_is_bgzf(filename.c_str()); if (ret == 1) is_BGZF = true; else is_BGZF = false; bcf_infile_bgzf = NULL; bcf_infile = NULL; if (is_BGZF){ gzMAX_LINE_LEN = 1024*1024; bcf_infile_bgzf = gzopen(filename.c_str(), "rb"); if (bcf_infile_bgzf == NULL) LOG.error("Could not open BGZF BCF file: " + filename, 0); #ifdef ZLIB_VERNUM string tmp(ZLIB_VERSION); LOG.printLOG("Using zlib version: " + tmp + "\n"); #if (ZLIB_VERNUM >= 0x1240) ret = gzbuffer(bcf_infile_bgzf, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster #else LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading compressed BCF files.\n"); #endif #endif } else bcf_infile = fopen(filename.c_str(), "r"); if ((bcf_infile == NULL) && (bcf_infile_bgzf == NULL)) { LOG.error("Could not open BCF file\n"); } read(magic, 5, 1); if ( (magic[0] != test_str[0]) || (magic[1] != test_str[1]) || (magic[2] != test_str[2]) ) LOG.error("Does not appear to be a BCF file\n"); if ( ((int)magic[3] != 2) || ((int)magic[4] != 1 ) ) { stringstream tmp_stream; tmp_stream << "File version number: " << (int)magic[3] << "." 
<< (int)magic[4] << "\n"; LOG.printLOG( tmp_stream.str() ); LOG.error("VCFtools is currently only compatible with BCFv2.1\n"); } } streampos bcf_file::get_filepos() { if (!is_BGZF) return ftell(bcf_infile); else return gztell(bcf_infile_bgzf); } void bcf_file::set_filepos(streampos &filepos) { if (!is_BGZF) fseek(bcf_infile, filepos, SEEK_SET); else{ gzseek(bcf_infile_bgzf, filepos, SEEK_SET); } } void bcf_file::close() { if (!is_BGZF) fclose(bcf_infile); else{ gzclose(bcf_infile_bgzf); } } void bcf_file::scan_file(const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index) { bool filter_by_chr = (chrs_to_keep.size() != 0); bool exclude_by_chr = (exclude_chrs.size() != 0); string index_filename = filename + ".bcfidx"; bool could_read_index_file = false; N_entries = 0; if (force_write_index == false) could_read_index_file = read_index_file(index_filename); string CHROM, last_CHROM=""; streampos filepos, endpos; if (could_read_index_file == false) { int POS, last_POS = -1; char magic[5]; endpos = get_eof(); read(magic, 5, 1); read_header(); if ((has_header == false) || (has_meta == false)) LOG.error("No header or meta information. 
Invalid file: " + filename); filepos = get_filepos(); if (!is_BGZF) { while( filepos 0) && (entry_file_locations[0] < 0)) entry_file_locations.pop_front(); N_entries = entry_file_locations.size(); LOG.printLOG("Keeping " + output_log::int2str(N_entries) + " entries on specified chromosomes.\n"); } include_indv.clear(); include_indv.resize(N_indv, true); include_entry.clear(); include_entry.resize(N_entries, true); include_genotype.clear(); include_genotype.resize(N_entries, vector(N_indv, true)); } void bcf_file::read_CHROM_only(string &CHROM) { int32_t chrom_int[3]; read(&chrom_int[0], 3, sizeof(int32_t) ); CHROM = header_obj.CONTIG_map[chrom_int[2]].ID; } void bcf_file::read_CHROM_and_POS_only(string &CHROM, int &POS) { int32_t chrom_int[4];//, pos_int; read(&chrom_int[0], 4, sizeof(int32_t) ); CHROM = header_obj.CONTIG_map[chrom_int[2]].ID; POS = chrom_int[3] + (int32_t)1; } int bcf_file::read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS) { int ret; int32_t chrom_int[4]; ret = read(&chrom_int[0], 4, sizeof(int32_t) ); if (ret != 4*sizeof(int32_t)) return 0; CHROM = header_obj.CONTIG_map[chrom_int[2]].ID; POS = chrom_int[3] + (int32_t)1; size_t forward = chrom_int[0] + chrom_int[1] - 2*sizeof(int32_t); char whole_line[forward]; ret = read(&whole_line, 1, forward); return ret; } void bcf_file::get_entry(unsigned int entry_num, vector &out) { uint32_t size_int[2]; int read_size = 0; set_filepos(entry_file_locations[entry_num]); read(&size_int[0], 2, sizeof(uint32_t) ); read_size = size_int[0] + size_int[1]; out.resize(read_size+2*sizeof(uint32_t)); memcpy(&out[0], size_int, 2*sizeof(uint32_t)); read(&out[2*sizeof(uint32_t)], 1, read_size); } entry* bcf_file::get_entry_object(unsigned int N_indv) { return new bcf_entry(N_indv, header_obj); } int bcf_file::read(void *buffer, unsigned int len, size_t size) { int ret; if (is_BGZF) ret = gzread(bcf_infile_bgzf, buffer, size*len); else ret = fread(buffer, 1, size*len, bcf_infile); if ((big_endian) && 
(size > 1)) // Note: don't both swapping character arrays - BCF is defined as little endian. { unsigned int ui; for (ui=0; ui headers; vector ID_lookup; unsigned int N_header_indv = 0; read(&len_text, 1, sizeof(uint32_t)); char *header_array = new char[(unsigned int)len_text]; read(header_array, len_text, 1); string header(header_array); delete [] header_array; int contig_count = 0; header.erase( header.find_last_not_of(" \f\n\r\t\v\0" ) + 1 ); istringstream iss(header); string line; while (getline(iss, line)) headers.push_back(line); if (headers.size() == 0 ) { LOG.error(" Input BCF file does not have a header.\n"); exit(0); } else has_meta = true; // It needs to parse the header information and store in a // structure for later access. ID_lookup.resize(headers.size(), ""); int pos_correct = 0; pos_correct += header_obj.add_FILTER_descriptor("ID=PASS,Description=PASS", pos_correct); for (unsigned int ui=0; ui tmp; has_header = true; entry::tokenize(headers[ui],'\t',tmp); for (unsigned int ui = 0; ui < tmp.size(); ui++) { switch (ui) { case 0: if (tmp[ui] != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp[ui]); break; case 1: if (tmp[ui] != "POS") LOG.warning("Second Header entry should be POS: " + tmp[ui]); break; case 2: if (tmp[ui] != "ID") LOG.warning("Third Header entry should be ID: " + tmp[ui]); break; case 3: if (tmp[ui] != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp[ui]); break; case 4: if (tmp[ui] != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp[ui]); break; case 5: if (tmp[ui] != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp[ui]); break; case 6: if (tmp[ui] != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp[ui]); break; case 7: if (tmp[ui] != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp[ui]); break; case 8: if (tmp[ui] != "FORMAT") LOG.warning("Ninth Header entry should be FORMAT: " + tmp[ui]); else has_genotypes = true; break; default: { if 
(ui <= 8) LOG.error("Incorrectly formatted header."); indv.push_back(tmp[ui]); N_header_indv++; } break; } } N_indv = N_header_indv; } } } bool bcf_file::eof() { if (is_BGZF){ return gzeof(bcf_infile_bgzf); } else return(feof(bcf_infile)); } streampos bcf_file::get_eof() { streampos end_pos; if (!is_BGZF) { fseek(bcf_infile, 0, SEEK_END); end_pos = get_filepos(); fseek(bcf_infile, 0, SEEK_SET); } else { gzseek(bcf_infile_bgzf, 0, SEEK_END); end_pos = get_filepos(); gzseek(bcf_infile_bgzf, 0, SEEK_SET); } return end_pos; } void bcf_file::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO) { for (unsigned int ui=0; ui 0) out << "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry *e = new bcf_entry(N_indv, header_obj); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true); e->print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]); } delete e; } void bcf_file::print(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO) { LOG.printLOG("Outputting VCF file... 
"); string output_file = output_file_prefix + ".recode.vcf"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open VCF Output File: " + output_file, 3); print(out, INFO_to_keep, keep_all_INFO); out.close(); LOG.printLOG("Done\n"); } void bcf_file::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) { string header_str; uint32_t len_text = 0; vector header; char magic[5] = {'B','C','F','\2','\1'}; bgzf_write(out, magic, 5); for (unsigned int ui=0; ui 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new bcf_entry(N_indv, header_obj); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true); e->print_bcf(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]); } } delete e; } void bcf_file::print_bcf(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO, bool stream) { LOG.printLOG("Outputting BCF file... 
"); BGZF * out; if(!stream) { string output_file = output_file_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); print_bcf(out, INFO_to_keep, keep_all_INFO); bgzf_close(out); LOG.printLOG("Done\n"); } vcftools_0.1.11/cpp/knetfile.h0000644000000000000000000000311412156354766014763 0ustar rootroot#ifndef KNETFILE_H #define KNETFILE_H #include #include #ifndef _WIN32 #define netread(fd, ptr, len) read(fd, ptr, len) #define netwrite(fd, ptr, len) write(fd, ptr, len) #define netclose(fd) close(fd) #else #include #define netread(fd, ptr, len) recv(fd, ptr, len, 0) #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) #define netclose(fd) closesocket(fd) #endif // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 typedef struct knetFile_s { int type, fd; int64_t offset; char *host, *port; // the following are for FTP only int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; char *response, *retr, *size_cmd; int64_t seek_offset; // for lazy seek int64_t file_size; // the following are for HTTP only char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) #define knet_fileno(fp) ((fp)->fd) #ifdef __cplusplus extern "C" { #endif #ifdef _WIN32 int knet_win32_init(); void knet_win32_destroy(); #endif knetFile *knet_open(const char *fn, const char *mode); /* This only works with local files. */ knetFile *knet_dopen(int fd, const char *mode); /* If ->is_ready==0, this routine updates ->fd; otherwise, it simply reads from ->fd. */ ssize_t knet_read(knetFile *fp, void *buf, size_t len); /* This routine only sets ->offset and ->is_ready=0. It does not communicate with the FTP server. 
*/ off_t knet_seek(knetFile *fp, off_t off, int whence); int knet_close(knetFile *fp); #ifdef __cplusplus } #endif #endif vcftools_0.1.11/cpp/variant_file_index.cpp0000644000000000000000000000733112156354766017354 0ustar rootroot/* * variant_file_index.cpp * * Created on: 3 Aug 2011 * Author: auton */ #include "variant_file.h" bool variant_file::read_index_file(const string &index_filename) { // Check index is newer than vcf file struct stat stat_idx, stat_vcf; stat(index_filename.c_str(), &stat_idx); stat(filename.c_str(), &stat_vcf); if (stat_vcf.st_mtime > stat_idx.st_mtime) { LOG.warning("Index file is older than variant file. Will regenerate."); return false; } LOG.printLOG("Reading Index file.\n"); big_endian_machine = is_big_endian(); gzFile in = gzopen(index_filename.c_str(), "rb"); if (in == NULL) return false; char magic[7]; idx_read(in, magic, 7, sizeof(char)); if (strncmp(magic, "VCFIDX\1", 7) != 0) { // Doesn't appear to be an index file gzclose(in); LOG.warning("Index file doesn't appear to be valid. Will (try to) overwrite.\n"); return false; } uint32_t tmp; uint64_t tmp64; idx_read(in, &tmp, 1, sizeof(uint32_t)); N_entries = tmp; idx_read(in, &tmp, 1, sizeof(uint32_t)); N_indv = tmp; idx_read(in, &tmp, 1, sizeof(uint32_t)); unsigned int l_meta = tmp; idx_read(in, &tmp, 1, sizeof(uint32_t)); unsigned int l_indv = tmp; char *meta_buffer = new char [l_meta+1]; char *indv_buffer = new char [l_indv+1]; idx_read(in, meta_buffer, l_meta, 1); idx_read(in, indv_buffer, l_indv, 1); // Split the strings meta.resize(0); char * pch; pch = strtok(meta_buffer,"\n"); while (pch != NULL) { meta.push_back(pch); pch = strtok(NULL, "\n"); } indv.resize(0); pch = strtok (indv_buffer,"\n"); while (pch != NULL) { indv.push_back(pch); pch = strtok (NULL, "\n"); } delete [] indv_buffer; delete [] meta_buffer; entry_file_locations.resize(N_entries); for (unsigned int ui=0; ui 1)) // Note: don't bother swapping character arrays - index is defined as little endian. 
{ unsigned int ui; for (ui=0; ui 1)) { unsigned int ui; for (ui=0; ui &chrs_to_keep, const set &exclude_chrs, bool force_write_index=false, bool gatk=false); void get_entry(unsigned int entry_num, vector &out); entry* get_entry_object(unsigned int N_indv); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO); void print(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO=false); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO); void print_bcf(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO=false, bool stream=false); protected: ~bcf_file(); private: gzFile bcf_infile_bgzf; FILE *bcf_infile; bool file_open; bool is_BGZF; bool is_GATK; bool big_endian; unsigned int gzMAX_LINE_LEN; int read(void *buffer, unsigned int len, size_t size); void read_header(bool skip_meta=false); void open(); void close(); bool eof(); inline void read_CHROM_only(string &CHROM); void read_CHROM_and_POS_only(string &CHROM, int &POS); inline int read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS); streampos get_filepos(); void set_filepos(streampos &filepos); streampos get_eof(); void scan_file(const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index=false); }; #endif /* BCF_FILE_H_ */ vcftools_0.1.11/cpp/variant_file_format_convert.cpp0000644000000000000000000007154612156354766021306 0ustar rootroot/* * variant_file_format_convert.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 249 $) */ #include "variant_file.h" void variant_file::output_as_plink(const string &output_file_prefix) { // Output as PLINK formatted PED/MAP files. if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output as PLINK."); LOG.printLOG("Writing PLINK PED file ... 
\n"); string ped_file = output_file_prefix + ".ped"; string map_file = output_file_prefix + ".map"; vector tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); (*tmp_file) << indv[ui] << "\t" << indv[ui] << "\t" << 0 << "\t" << 0 << "\t" << 0 << "\t" << 0; tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } vector alleles; char phase; pair genotype; vector variant_line; entry *e = get_entry_object(N_indv); ofstream *tmp_file; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() > 2) { LOG.one_off_warning("\tPLINK: Only outputting biallelic loci."); continue; } e->get_alleles_vector(alleles); for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); phase = e->get_indv_PHASE(ui); } if (genotype.first == -1) (*tmp_file) << "\t0"; else (*tmp_file) << "\t" << alleles[genotype.first]; if (genotype.second == -1) { if (phase == '/') (*tmp_file) << "\t0"; else if (genotype.first != -1) (*tmp_file) << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc else (*tmp_file) << "\t0"; } else (*tmp_file) << "\t" << alleles[genotype.second]; } } ofstream PED(ped_file.c_str()); if (!PED.is_open()) LOG.error("Could not open output file: " + ped_file, 12); string tmp_line; for (unsigned int ui=0; uiclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n" "Alternatively, try the --plink-tped command.", 12); getline(read_file, tmp_line); PED << tmp_line << endl; 
read_file.close(); remove(tmp_filenames[ui].c_str()); } PED.close(); LOG.printLOG("Writing PLINK MAP file ... "); ofstream MAP(map_file.c_str()); if (!MAP.is_open()) LOG.error("Could not open output file: " + map_file, 12); int POS; string ID, CHROM, CHROM2; map CHROM_to_PLINK; for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() > 2) continue; POS = e->get_POS(); ID = e->get_ID(); CHROM = e->get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { LOG.one_off_warning("\nUnrecognized values used for CHROM: " + CHROM + " - Replacing with 0.\n"); CHROM_to_PLINK[CHROM] = "0"; } CHROM2 = CHROM_to_PLINK[CHROM]; if (ID == ".") MAP << CHROM2 << "\t" << CHROM << ":" << POS << "\t0\t" << POS << endl; else MAP << CHROM2 << "\t" << ID << "\t0\t" << POS << endl; } delete e; MAP.close(); LOG.printLOG("Done.\n"); } // Output as Plink Transposed file void variant_file::output_as_plink_tped(const string &output_file_prefix) { // Output as PLINK formatted PED/MAP files. if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output as PLINK TPED."); LOG.printLOG("Writing PLINK TPED file ... 
"); string tped_file = output_file_prefix + ".tped"; string tfam_file = output_file_prefix + ".tfam"; ofstream TPED(tped_file.c_str()); if (!TPED.is_open()) LOG.error("Could not open output file: " + tped_file, 12); string CHROM, CHROM2; map CHROM_to_PLINK; for (int i=1; i<23; i++) { ostringstream convert; convert << i; CHROM_to_PLINK["chr" + convert.str()] = convert.str(); CHROM_to_PLINK[convert.str()] = convert.str(); } CHROM_to_PLINK["chrX"] = "X"; CHROM_to_PLINK["chrY"] = "Y"; CHROM_to_PLINK["chrXY"] = "XY"; CHROM_to_PLINK["chrMT"] = "MT"; CHROM_to_PLINK["X"] = "X"; CHROM_to_PLINK["Y"] = "Y"; CHROM_to_PLINK["XY"] = "XY"; CHROM_to_PLINK["MT"] = "MT"; vector alleles; char phase; pair genotype; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() > 2) // Only output sites with at most one alternative allele { LOG.one_off_warning("\tPLINK-TPED: Only outputting biallelic loci."); continue; } CHROM = e->get_CHROM(); if (CHROM_to_PLINK.find(CHROM) == CHROM_to_PLINK.end()) { LOG.one_off_warning("\nUnrecognized values used for CHROM: " + CHROM + " - Replacing with 0.\n"); CHROM_to_PLINK[CHROM] = "0"; } CHROM2 = CHROM_to_PLINK[CHROM]; if (e->get_ID() == ".") TPED << CHROM2 << "\t" << e->get_CHROM() << ":" << e->get_POS() << "\t0\t" << e->get_POS(); else TPED << CHROM2 << "\t" << e->get_ID() << "\t0\t" << e->get_POS(); e->get_alleles_vector(alleles); for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); phase = e->get_indv_PHASE(ui); } if (genotype.first == -1) TPED << "\t0"; else TPED << "\t" << alleles[genotype.first]; if (genotype.second == -1) { if (phase == '/') TPED << "\t0"; else if (genotype.first != -1) TPED << "\t" << alleles[genotype.first]; // Male X-chr, Y-chr etc else TPED << "\t0"; } else TPED << "\t" << alleles[genotype.second]; } TPED << endl; } TPED.close(); LOG.printLOG("Writing PLINK TFAM file ... 
"); ofstream TFAM(tfam_file.c_str()); if (!TFAM.is_open()) LOG.error("Could not open output file: " + tfam_file, 12); for (unsigned int ui=0; ui genotype; string vcf_line; vcf_entry e(N_indv); for (unsigned int ui=0; ui tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); (*tmp_file) << ui; tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } FAM.close(); vector alleles; pair genotype; vector variant_line; entry *e = get_entry_object(N_indv); ofstream *tmp_file; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() > 2) { LOG.one_off_warning("\t012: Only outputting biallelic loci."); continue; } e->get_alleles_vector(alleles); for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, genotype); } if ((genotype.first == -1) && (genotype.second == -1)) (*tmp_file) << "\t-1"; // Missing data else if ((genotype.first == 0) && (genotype.second == 0)) (*tmp_file) << "\t0"; // No copies of the alternative allele else { if ((genotype.first == 1) && (genotype.second == 1)) (*tmp_file) << "\t2"; // Two copies of the alternative allele else (*tmp_file) << "\t1"; // Must be one copy of the alternative allele. 
} } } ofstream PED(ped_file.c_str()); if (!PED.is_open()) LOG.error("Could not open output file: " + ped_file, 12); string tmp_line; for (unsigned int ui=0; uiclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); PED << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } PED.close(); LOG.printLOG("Writing 012 positions file ... "); ofstream MAP(map_file.c_str()); if (!MAP.is_open()) LOG.error("Could not open output file: " + map_file, 12); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() <= 2) // Only output sites with one alternative allele { MAP << e->get_CHROM() << "\t" << e->get_POS() << endl; } } delete e; MAP.close(); LOG.printLOG("Done.\n"); } // Output as IMPUTE format void variant_file::output_as_IMPUTE(const string &output_file_prefix) { if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output IMPUTE format."); LOG.printLOG("Outputting in IMPUTE format (bi-allelic, completely phased SNPs only)\n"); unsigned int s, ui; string legend_file = output_file_prefix + ".impute.legend"; string haplotype_file = output_file_prefix + ".impute.hap"; string indv_file = output_file_prefix + ".impute.hap.indv"; ofstream legend(legend_file.c_str()); if (!legend.is_open()) LOG.error("Could not open IMPUTE Legend Output File: " + legend_file, 2); legend << "ID pos allele0 allele1" << endl; ofstream hap(haplotype_file.c_str()); if (!hap.is_open()) LOG.error("Could not open IMPUTE Haplotype Output File: " + haplotype_file, 2); ofstream indv_out(indv_file.c_str()); if (!indv_out.is_open()) LOG.error("Could not open IMPUTE Individual Output File: " + indv_file, 2); for (ui=0; ui alleles; vector variant_line; 
entry *e = get_entry_object(N_indv); for (s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() > 2) { LOG.one_off_warning("\tIMPUTE: Only outputting biallelic loci."); continue; } // Exclude entries with missing data and/or unphased bool missing = false; for (ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if ((alleles.first == -1) || (alleles.second == -1)) { missing = true; break; } if (e->get_indv_PHASE(ui) != '|') { missing = true; break; } } if (missing == true) continue; if (e->get_ID() == ".") { legend << e->get_CHROM() << "-" << e->get_POS() << " " << e->get_POS() << " " << e->get_REF() << " " << e->get_ALT_allele(0) << endl; } else legend << e->get_ID() << " " << e->get_POS() << " " << e->get_REF() << " " << e->get_ALT_allele(0) << endl; bool first = true; for (ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (first == true) { hap << alleles.first << " " << alleles.second; first = false; } else hap << " " << alleles.first << " " << alleles.second; } hap << endl; } delete e; hap.close(); legend.close(); } void variant_file::output_LDhat_locs_file(const string &output_file_prefix, unsigned int &n_sites_out) { string locs_file = output_file_prefix + ".ldhat.locs"; ofstream locs(locs_file.c_str()); if (!locs.is_open()) LOG.error("Could not open LDhat locs Output File: " + locs_file, 2); int max_pos = -1; unsigned int n_sites=0; entry *e = get_entry_object(N_indv); vector variant_line; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { continue; } max_pos = max(e->get_POS(), max_pos); n_sites++; } locs << n_sites; locs.setf(ios::fixed,ios::floatfield); locs.precision(4); locs << "\t" << max_pos / 1000.0 << "\tL" << endl; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLDhat: Only outputting biallelic loci."); continue; } locs << e->get_POS() / 
1000.0 << endl; } delete e; locs.close(); n_sites_out = n_sites; } void variant_file::output_as_LDhat_phased(const string &output_file_prefix) { if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhat format."); LOG.printLOG("Outputting in phased LDhat format\n"); unsigned int n_sites; output_LDhat_locs_file(output_file_prefix, n_sites); string sites_file = output_file_prefix + ".ldhat.sites"; ofstream sites(sites_file.c_str()); if (!sites.is_open()) LOG.error("Could not open LDhat sites Output File: " + sites_file, 2); unsigned int n_indv = N_kept_individuals(); pair alleles; sites << n_indv*2 << "\t" << n_sites << "\t1" << endl; // Note - this is incorrect for the X-chr. vector tmp_files(2*N_indv); vector tmp_filenames(2*N_indv); for (unsigned int ui=0; uigood()) LOG.error("Could not open temp file #" + output_log::int2str(ui) + ".\n", 12); tmp_files[2*ui] = tmp_file; tmp_filenames[2*ui] = filename; string filename2(tmpnam(NULL)); ofstream *tmp_file2 = new ofstream(filename2.c_str()); if (!tmp_file2->good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); tmp_files[2*ui+1] = tmp_file2; tmp_filenames[2*ui+1] = filename2; } vector variant_line; entry *e = get_entry_object(N_indv); ofstream *tmp_file; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLDhat: Only outputting biallelic loci."); continue; } for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); for (unsigned int k=0; k<2; k++) { tmp_file = tmp_files[(2*ui)+k]; int geno; if (k == 0) geno = alleles.first; else geno = alleles.second; if ((geno != -1) && (include_genotype[s][ui]==true)) (*tmp_file) << geno; else (*tmp_file) << "?"; } } } string tmp_line; for (unsigned int ui=0; 
uiclose(); ifstream read_file(tmp_filenames[2*ui+k].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << indv[ui] << "-" << k << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[2*ui+k].c_str()); } } delete e; sites.close(); } void variant_file::output_as_LDhat_unphased(const string &output_file_prefix) { if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output LDhat format."); LOG.printLOG("Outputting in unphased LDhat format\n"); unsigned int n_sites; output_LDhat_locs_file(output_file_prefix, n_sites); string sites_file = output_file_prefix + ".ldhat.sites"; ofstream sites(sites_file.c_str()); if (!sites.is_open()) LOG.error("Could not open LDhat sites Output File: " + sites_file, 2); unsigned int n_indv = N_kept_individuals(); pair alleles; sites << n_indv << "\t" << n_sites << "\t2" << endl; vector tmp_files(N_indv); vector tmp_filenames(N_indv); for (unsigned int ui=0; uigood()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); tmp_files[ui] = tmp_file; tmp_filenames[ui] = filename; } vector variant_line; entry *e = get_entry_object(N_indv); ofstream *tmp_file; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLDhat: Only outputting biallelic loci."); continue; } for (unsigned int ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); switch (alleles.first) { case -1: (*tmp_file) << "?"; break; case 0: if (alleles.second == 0) (*tmp_file) << 0; else if (alleles.second == 1) (*tmp_file) << 2; else 
if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) (*tmp_file) << 0; // Haploid case else (*tmp_file) << '?'; break; case 1: if (alleles.second == 0) (*tmp_file) << 2; else if (alleles.second == 1) (*tmp_file) << 1; else if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) (*tmp_file) << 1; // Haploid case else (*tmp_file) << '?'; break; default: (*tmp_file) << '?'; break; } } } } string tmp_line; for (unsigned int ui=0; uiclose(); ifstream read_file(tmp_filenames[ui].c_str()); if (!read_file.good()) LOG.error("\n\nCould not open temporary file.\n\n" "Most likely this is because the system is not allowing me to open enough temporary files.\n" "Try using ulimit -n to increase the number of allowed open files.\n", 12); getline(read_file, tmp_line); sites << ">" << indv[ui] << endl; sites << tmp_line << endl; read_file.close(); remove(tmp_filenames[ui].c_str()); } delete e; sites.close(); } // Output INFO fields in tab-delimited format void variant_file::output_INFO_for_each_site(const string &output_file_prefix, const vector &INFO_to_extract) { if (INFO_to_extract.size() == 0) return; LOG.printLOG("Outputting INFO for each site\n"); string output = output_file_prefix + ".INFO"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open INFO Output File: " + output, 3); out << "CHROM\tPOS\tREF\tALT"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, false, true); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e->get_REF() << "\t" << e->get_ALT(); for (unsigned int ui=0; uiget_INFO_value(INFO_to_extract[ui]); } out << endl; } delete e; out.close(); } // Output FORMAT information in tab-delimited format. 
void variant_file::output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id) { if (FORMAT_id == "") return; if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output FORMAT information."); LOG.printLOG("Outputting FORMAT information for " + FORMAT_id + "\n"); string output = output_file_prefix + "." + FORMAT_id + ".FORMAT"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open FORMAT Output File: " + output, 7); out << "CHROM\tPOS"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); e->parse_full_entry(true); if (e->FORMAT_id_exists(FORMAT_id) == false) continue; out << e->get_CHROM() << "\t" << e->get_POS(); for (unsigned int ui=0; uiread_indv_generic_entry(ui, FORMAT_id, FORMAT_out); out << "\t" << FORMAT_out; } out << endl; } delete e; out.close(); } // Output genotype likelihoods from GL or PL FORMAT tag, ready for input into BEAGLE // using the Genotype likelihoods file format. 
void variant_file::output_BEAGLE_genotype_likelihoods(const string &output_file_prefix, int GL_or_PL) { if (has_genotypes == false) LOG.error("Require Genotypes in VCF file in order to output BEAGLE genotype likelihoods."); if (GL_or_PL == 0) LOG.printLOG("Outputting GLs in BEAGLE Genotype Likelihood format (bi-allelic SNPs with GL tags only)\n"); else if (GL_or_PL == 1) LOG.printLOG("Outputting PLs in BEAGLE Genotype Likelihood format (bi-allelic SNPs with PL tags only)\n"); else LOG.error("Unknown GL or PL option."); string output = output_file_prefix + ".BEAGLE.GL"; if (GL_or_PL == 1) output = output_file_prefix + ".BEAGLE.PL"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open BEAGLE GL/PL Output File: " + output, 3); out << "marker\talleleA\talleleB"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(N_indv); double lk1, lk2, lk3; bool found_GL=false; istringstream ss; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tBEAGLE: Only outputting biallelic loci."); continue; } e->parse_full_entry(true); if (GL_or_PL == 0) if (e->FORMAT_id_exists("GL") == false) continue; if (GL_or_PL == 1) if (e->FORMAT_id_exists("PL") == false) continue; found_GL = true; out << e->get_CHROM() << ":" << e->get_POS() << "\t" << e->get_REF() << "\t" << e->get_ALT(); for (unsigned int ui=0; uiread_indv_generic_entry(ui, "GL", GL_entry); else e->read_indv_generic_entry(ui, "PL", GL_entry); ss.clear(); ss.str(GL_entry); getline(ss, tmp_string, ','); lk1 = atof(tmp_string.c_str()); getline(ss, tmp_string, ','); lk2 = atof(tmp_string.c_str()); getline(ss, tmp_string); lk3 = atof(tmp_string.c_str()); if (GL_or_PL == 0) out << "\t" << pow(10,lk1) << "\t" << pow(10,lk2) << "\t" << pow(10,lk3); else out << "\t" << pow(10,-lk1*0.1) << "\t" << pow(10,-lk2*0.1) << "\t" << pow(10,-lk3*0.1); } else { out << "\t1\t1\t1"; // Mark as unknown } } out << endl; } delete e; if 
(found_GL == false) LOG.error("Require GL or PL FORMAT tags in VCF file to output BEAGLE input."); } vcftools_0.1.11/cpp/variant_file_output.cpp0000644000000000000000000032453612156354766017616 0ustar rootroot/* * variant_file_output.cpp * * Created on: Aug 28, 2009 * Author: Adam Auton * ($Revision: 249 $) */ #include "variant_file.h" void variant_file::output_frequency(const string &output_file_prefix, bool output_counts, bool suppress_allele_output, bool derived) { // Output statistics of frequency at each site if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Frequency Statistics."); LOG.printLOG("Outputting Frequency Statistics...\n"); string output_file = output_file_prefix + ".frq"; if (output_counts) output_file += ".count"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); if (suppress_allele_output == false) { out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:"; if (output_counts) out << "COUNT}" << endl; else out << "FREQ}" << endl; } else { if (output_counts) out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{COUNT}" << endl; else out << "CHROM\tPOS\tN_ALLELES\tN_CHR\t{FREQ}" << endl; } vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; vector variant_line; entry *e = get_entry_object(N_indv); unsigned int aa_idx = 0; for (unsigned int s=0; sreset(variant_line); if (derived) e->parse_basic_entry(true, false, true); else e->parse_basic_entry(true); e->parse_genotype_entries(true); N_alleles = e->get_N_alleles(); if (derived) { string AA = e->get_INFO_value("AA"); std::transform(AA.begin(), AA.end(), AA.begin(), ::toupper); // Comment this out if only want high quality sites. 
if ((AA == "?") || (AA == ".")) { LOG.one_off_warning("\tWarning: Cannot output derived allele frequencies without Ancestral Alleles (AA)"); continue; } else { bool found = false; for (unsigned int ui=0; uiget_allele(ui)) { aa_idx = ui; found = true; break; } } if (found == false) { LOG.one_off_warning("\tWarning: Ancestral allele does not match any SNP allele."); continue; } } } e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << N_alleles << "\t" << N_non_missing_chr; if (output_counts) { if (suppress_allele_output == false) { out << "\t" << e->get_allele(aa_idx) << ":" << allele_counts[aa_idx]; for (unsigned int ui=0; uiget_allele(ui) << ":" << allele_counts[ui]; } out << endl; } else { out << "\t" << allele_counts[aa_idx]; for (unsigned ui=0; uiget_allele(aa_idx) << ":" << freq; for (unsigned int ui=0; uiget_allele(ui) << ":" << freq; } } out << endl; } else { freq = allele_counts[aa_idx] / (double)N_non_missing_chr; out << "\t" << freq; for (unsigned int ui=0; ui freq(N_entries, 0.0); vector allele_counts; vector N_non_missing_chr(N_entries,0); vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tIndividual Heterozygosity: Only using biallelic SNPs."); continue; } e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tIndividual Heterozygosity: Only using fully diploid SNPs."); continue; } // Frequency of non-reference allele e->get_allele_counts(allele_counts, N_non_missing_chr[s], include_indv, include_genotype[s]); if (N_non_missing_chr[s] > 0) freq[s] = allele_counts[1] / double(N_non_missing_chr[s]); else freq[s] = -1; } vector N_sites_included(N_indv, 0); vector N_obs_hom(N_indv, 0); vector N_expected_hom(N_indv, 0.0); pair alleles; for (unsigned int s=0; 
sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) continue; e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) continue; if ((freq[s] <= numeric_limits::epsilon()) || (1.0 - freq[s] <= numeric_limits::epsilon())) continue; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, alleles); if ((alleles.first != -1) && (alleles.second != -1)) { N_sites_included[ui]++; if (alleles.first == alleles.second) N_obs_hom[ui]++; ///////////////////////// // Expected homozygosity // E = 1 - (2pq . 2N/(2N-1)) // (Using Nei's unbiased estimator) N_expected_hom[ui] += 1.0 - (2.0 * freq[s] * (1.0 - freq[s]) * (N_non_missing_chr[s] / (N_non_missing_chr[s] - 1.0))); } } } } out.setf(ios::fixed,ios::floatfield); for (unsigned int ui=0; ui 0) { double F = (N_obs_hom[ui] - N_expected_hom[ui]) / double(N_sites_included[ui] - N_expected_hom[ui]); out << indv[ui] << "\t" << N_obs_hom[ui] << "\t"; out.precision(1); out << N_expected_hom[ui] << "\t"; out.precision(5); out << N_sites_included[ui] << "\t" << F << endl; } } delete e; out.close(); } void variant_file::output_hwe(const string &output_file_prefix) { // Output HWE statistics for each site as described in Wigginton, Cutler, and Abecasis (2005) if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output HWE Statistics."); // Note this assumes Biallelic SNPs. 
LOG.printLOG("Outputting HWE statistics (but only for biallelic loci)\n"); string output_file = output_file_prefix + ".hwe"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHR\tPOS\tOBS(HOM1/HET/HOM2)\tE(HOM1/HET/HOM2)\tChiSq\tP" << endl; /* PLINK code: // b11 = Nhom1, b12 = Nhet, b22 = Nhom2 double tot = b11 + b12 + b22; double exp_11 = freq * freq * tot; double exp_12 = 2 * freq * (1-freq) * tot; double exp_22 = (1-freq) * (1-freq) * tot; double chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22 ; p = chiprobP(chisq,1); */ double freq; unsigned int b11, b12, b22; double exp_11, exp_12, exp_22; double chisq; double tot; double p; unsigned int precision = out.precision(); vector allele_counts; unsigned int N_non_missing_chr; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tHWE: Only using biallelic SNPs."); continue; // Isn't biallelic } e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tHWE: Only using fully diploid SNPs."); continue; // Isn't diploid } e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); freq = allele_counts[0] / (double)N_non_missing_chr; e->get_genotype_counts(include_indv, include_genotype[s], b11, b12, b22); tot = b11 + b12 + b22; exp_11 = freq * freq * tot; exp_12 = 2.0 * freq * (1.0-freq) * tot; exp_22 = (1.0-freq) * (1.0-freq) * tot; chisq = ( (b11-exp_11)*(b11-exp_11) ) / exp_11 + ( (b12-exp_12)*(b12-exp_12) ) / exp_12 + ( (b22-exp_22)*(b22-exp_22) ) / exp_22; p = entry::SNPHWE(b12, b11, b22); out << e->get_CHROM() << "\t" << e->get_POS(); out << "\t" << b11 << "/" << b12 << "/" << b22; out.precision(2); out << fixed << "\t" << exp_11 << "/" << 
exp_12 << "/" << exp_22; out.precision(precision); out << "\t" << chisq << "\t" << p << endl; } delete e; out.close(); } void variant_file::output_individuals_by_mean_depth(const string &output_file_prefix) { // Output information regarding the mean depth for each individual if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Individuals by Mean Depth Statistics."); LOG.printLOG("Outputting Mean Depth by Individual\n"); string output = output_file_prefix + ".idepth"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Individual Depth Output File: " + output, 2); out << "INDV\tN_SITES\tMEAN_DEPTH" << endl; vector depth_sum(N_indv, 0.0); vector count(N_indv, 0); int depth; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); for (unsigned int ui=0; uiparse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { depth_sum[ui] += depth; count[ui]++; } } } } for (unsigned int ui=0; ui max_pos; map min_pos; string CHROM; int POS; entry *e = get_entry_object(N_indv); for (unsigned int s=0; s max_pos[CHROM]) max_pos[CHROM] = POS; } else max_pos[CHROM] = POS; if (min_pos.find(CHROM) != min_pos.end()) { if (POS < min_pos[CHROM]) min_pos[CHROM] = POS; } else min_pos[CHROM] = POS; } } map::iterator it; unsigned int N_bins; map > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size)); bins[CHROM].resize(N_bins, 0); } unsigned int idx; double C = 1.0 / double(bin_size); for (unsigned int s=0; s 0) output = true; if (output == true) out << CHROM << "\t" << s*bin_size << "\t" << bin_tot << "\t" << bin_tot * C << endl; } } delete e; out.close(); double mean_SNP_density = sum1 / sum2 * 1000; LOG.printLOG("Mean SNP density: " + output_log::dbl2str(mean_SNP_density, 5) + " variants / kb\n"); } void 
variant_file::output_missingness(const string &output_file_prefix) { // Output missingness by individual and site if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Missingness Statistics."); LOG.printLOG("Outputting Site and Individual Missingness\n"); string output1 = output_file_prefix + ".imiss"; ofstream out1(output1.c_str()); if (!out1.is_open()) LOG.error("Could not open Individual Missingness Output File: " + output1, 3); string output2 = output_file_prefix + ".lmiss"; ofstream out2(output2.c_str()); if (!out2.is_open()) LOG.error("Could not open Site Missingness Output File: " + output2, 4); out1 << "INDV\tN_DATA\tN_GENOTYPES_FILTERED\tN_MISS\tF_MISS" << endl; unsigned int ui, s; vector indv_N_missing(N_indv, 0), indv_N_tot(N_indv, 0); vector indv_N_geno_filtered(N_indv, 0); unsigned int site_N_missing, site_N_tot, site_N_geno_filtered; pair alleles; vector variant_line; entry *e = get_entry_object(N_indv); out2 << "CHR\tPOS\tN_DATA\tN_GENOTYPE_FILTERED\tN_MISS\tF_MISS" << endl; for (s=0; sreset(variant_line); e->parse_basic_entry(); site_N_missing = 0; site_N_tot = 0; site_N_geno_filtered = 0; for (ui=0; uiparse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (alleles.first == -1) { site_N_missing++; indv_N_missing[ui]++; } indv_N_tot[ui]++; if (alleles.second == -1) { site_N_missing++; } site_N_tot+=2; if ((alleles.second == -1) && (e->get_indv_PHASE(ui) == '|')) { // Phased missing genotypes indicate haploid genome site_N_tot--; site_N_missing--; } } out2 << e->get_CHROM() << "\t" << e->get_POS() << "\t" << site_N_tot << "\t" << site_N_geno_filtered << "\t"; out2 << site_N_missing << "\t" << double(site_N_missing) / double(site_N_tot) << endl; } for (ui=0; ui &include_geno1, const vector &include_geno2, double &r2, double &D, double &Dprime, int &chr_count) { double x11=0, x12=0, x21=0, x22=0; double X=0, X2=0, Y=0, Y2=0, XY=0; double sx, sy; double rel_x11, p1, p2, 
q1, q2, Dmax; double var1, var2, cov12; chr_count = 0; pair geno1, geno2; int allele1, allele2; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno1); e2->parse_genotype_entry(ui, true); e2->get_indv_GENOTYPE_ids(ui, geno2); // if ((e->get_indv_ploidy(ui) != 2) || (e2->get_indv_ploidy(ui) != 2)) // { // LOG.one_off_warning("\tLD: Only using diploid individuals."); // continue; // } if ((e->get_indv_PHASE(ui) != '|') || (e2->get_indv_PHASE(ui) != '|')) LOG.error("Require phased haplotypes for r^2 calculation (use --phased)\n"); for (unsigned int c=0; c<2; c++) { if (c==0) { allele1 = geno1.first; allele2 = geno2.first; } else { allele1 = geno1.second; allele2 = geno2.second; } if ((allele1 == -1) || (allele2 == -1)) continue; if (allele1 == 0 && allele2 == 0){ x11++; } else if (allele1 == 0 && allele2 != 0){ x12++; } else if (allele1 != 0 && allele2 == 0){ x21++; } else { // (allele1 !=0 && allele2 != 0) x22++; } sx=0, sy=0; if (allele1 == 0) sx += 1; if (allele2 == 0) sy += 1; X += sx; Y += sy; XY += sx*sy; sx *= sx; sy *= sy; X2 += sx; Y2 += sy; chr_count++; } } rel_x11 = x11/double(chr_count); p1 = (x11 + x12)/double(chr_count); p2 = (x21 + x22)/double(chr_count); q1 = (x11 + x21)/double(chr_count); q2 = (x12 + x22)/double(chr_count); D = rel_x11 - p1*q1; if (D < 0) Dmax = min(p1*q1,p2*q2); else Dmax = min(p1*q2,p2*q1); Dprime = D/Dmax; X /= chr_count; X2 /= chr_count; Y /= chr_count; Y2 /= chr_count; XY /= chr_count; var1 = X2 - X*X; var2 = Y2 - Y*Y; cov12 = XY - X*Y; r2 = cov12 * cov12 / (var1 * var2); } // Calculate r2 for either haplotypes or genotypes using the em algorithm... void variant_file::calc_r2_em(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &r2, int &indv_count) { r2 = 0; indv_count = 0; pair geno1, geno2; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno1); e2->parse_genotype_entry(ui, true); e2->get_indv_GENOTYPE_ids(ui, geno2); // TODO... not yet implemented...! 
LOG.error("Not yet implmented!\n"); } } void variant_file::calc_geno_r2(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &r2, int &indv_count) { if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); double X=0, X2=0, Y=0, Y2=0, XY=0; double sx, sy; indv_count = 0; pair geno1, geno2; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno1); e2->parse_genotype_entry(ui, true); e2->get_indv_GENOTYPE_ids(ui, geno2); if ((e->get_indv_ploidy(ui) != 2) || (e2->get_indv_ploidy(ui) != 2)) { LOG.one_off_warning("\tgenoLD: Only using diploid individuals."); continue; } if ((geno1.first == -1) || (geno1.second == -1)) continue; if ((geno2.first == -1) || (geno2.second == -1)) continue; sx=0, sy=0; if (geno1.first == geno1.second) { if (geno1.first == 0) { sx = 2; } } else sx = 1; if (geno2.first == geno2.second) { if (geno2.first == 0) { sy = 2; } } else sy = 1; X += sx; Y += sy; XY += sx*sy; sx *= sx; sy *= sy; X2 += sx; Y2 += sy; indv_count++; } X /= indv_count; X2 /= indv_count; Y /= indv_count; Y2 /= indv_count; XY /= indv_count; double var1 = X2 - X*X; double var2 = Y2 - Y*Y; double cov12 = XY - X*Y; r2 = cov12 * cov12 / (var1 * var2); } void variant_file::calc_geno_chisq(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &chisq, double &dof, double &pval, int &indv_count) { if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); int N0 = e->get_N_alleles(); int N1 = e2->get_N_alleles(); int N_genotypes0 = N0 * (N0+1) / 2; int N_genotypes1 = N1 * (N1+1) / 2; vector > observed(N_genotypes0, vector(N_genotypes1,0)); indv_count = 0; pair geno1, geno2; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno1); e2->parse_genotype_entry(ui, true); e2->get_indv_GENOTYPE_ids(ui, geno2); if ((e->get_indv_ploidy(ui) != 2) || 
(e2->get_indv_ploidy(ui) != 2)) { LOG.one_off_warning("\tgenoLD: Only using diploid individuals."); continue; } if ((geno1.first == -1) || (geno1.second == -1)) continue; if ((geno2.first == -1) || (geno2.second == -1)) continue; map, int> idx_lookup1; int count = 0; for (int uj=0; uj, int> idx_lookup2; count = 0; for (int uj=0; uj > expected(N_genotypes0, vector(N_genotypes1,0)); vector row_tot(N_genotypes0, 0); vector col_tot(N_genotypes1, 0); double tot=0; for (int ui=0; ui 0) && (col_tot[uj] > 0)) // Don't use incomplete cases chisq += pow(observed[ui][uj] - expected[ui][uj], 2) / expected[ui][uj]; } } int n_col=0, n_row=0; for (int ui=0; ui 0) n_row++; for (int ui=0; ui 0) n_col++; dof = (n_row-1) * (n_col-1); //cout << chisq << " " << dof << endl; pval = 1.0-gammp(dof/2, chisq/2); } void variant_file::output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int snp_window_min, int bp_window_size, int bp_window_min, double min_r2) { // Output pairwise LD statistics, using traditional r^2. Requires phased haplotypes. 
if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; LOG.printLOG("Outputting Pairwise LD (phased bi-allelic only)\n"); string output = output_file_prefix + ".hap.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR\tPOS1\tPOS2\tN_CHR\tR^2\tD\tDprime" << endl; double r2, D, Dprime; int chr_count; unsigned int skip = (unsigned int)max((int)1, snp_window_min); vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = s+skip; s2 snp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } get_entry(s2, variant_line2); e2->reset(variant_line2); e2->parse_basic_entry(true); if (e->get_CHROM() != e2->get_CHROM()) { s2 = N_entries; // No need to go any further (assuming SNPs are sorted) continue; } if ((e2->get_POS() - e->get_POS()) < bp_window_min) continue; if ((e2->get_POS() - e->get_POS()) > bp_window_size) { s2 = N_entries; // No need to go any further (assuming SNPs are sorted) continue; } if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tLD: Only using biallelic variants."); continue; } calc_hap_r2(e, e2, include_genotype[s], include_genotype[s2], r2, D, Dprime, chr_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_POS() << "\t" << chr_count << "\t" << r2 << "\t" << D << "\t" << Dprime << "\t" << endl; } } delete e; delete e2; out.close(); } void variant_file::output_genotype_r2(const string 
&output_file_prefix, int snp_window_size, int snp_window_min, int bp_window_size, int bp_window_min, double min_r2) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; LOG.printLOG("Outputting Pairwise LD (bi-allelic only)\n"); string output = output_file_prefix + ".geno.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR\tPOS1\tPOS2\tN_INDV\tR^2" << endl; double r2; int indv_count; unsigned int skip = (unsigned int)max((int)1, snp_window_min); vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tgenoLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = s+skip; s2 snp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } get_entry(s2, variant_line2); e2->reset(variant_line2); e2->parse_basic_entry(true); if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tgenoLD: Only using biallelic variants."); continue; // Isn't biallelic } if (e->get_CHROM() != e2->get_CHROM()) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } if ((e2->get_POS() - e->get_POS()) < bp_window_min) continue; if ((e2->get_POS() - e->get_POS()) > bp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } calc_geno_r2(e, e2, include_genotype[s], include_genotype[s2], r2, indv_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 
!= r2)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_POS() << "\t" << indv_count << "\t" << r2 << endl; } } delete e; delete e2; out.close(); } void variant_file::output_genotype_chisq(const string &output_file_prefix, int snp_window_size, int snp_window_min, int bp_window_size, int bp_window_min, double min_pval) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; LOG.printLOG("Outputting Pairwise LD\n"); string output = output_file_prefix + ".geno.chisq"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR\tPOS1\tPOS2\tN_INDV\tCHI^2\tDOF\tPVAL" << endl; double chisq, dof, pval; int indv_count; unsigned int skip = (unsigned int)max((int)1, snp_window_min); vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); e->parse_genotype_entries(true); for (s2 = s+skip; s2 snp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } get_entry(s2, variant_line2); e2->reset(variant_line2); e2->parse_basic_entry(true); if (e->get_CHROM() != e2->get_CHROM()) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } if ((e2->get_POS() - e->get_POS()) < bp_window_min) continue; if ((e2->get_POS() - e->get_POS()) > bp_window_size) { s2 = N_entries; // SNPs sorted, so no need to go any further continue; } calc_geno_chisq(e, e2, include_genotype[s], include_genotype[s2], chisq, dof, pval, indv_count); if (min_pval > 0) if ((pval < 
min_pval) | (pval != pval)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_POS() << "\t" << indv_count << "\t" << chisq << "\t" << dof << "\t" << pval << endl; } } delete e; delete e2; out.close(); } void variant_file::output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; LOG.printLOG("Outputting Interchromosomal Pairwise Genotype LD (bi-allelic only)\n"); string output = output_file_prefix + ".interchrom.geno.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl; int indv_count; double r2; vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = s+1; s2reset(variant_line2); e2->parse_basic_entry(true); if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } if (e->get_CHROM() == e2->get_CHROM()) continue; calc_geno_r2(e, e2, include_genotype[s], include_genotype[s2], r2, indv_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_CHROM() << "\t" << e2->get_POS() << "\t" << indv_count << "\t" << r2 << endl; } } delete e; 
delete e2; out.close(); } void variant_file::output_interchromosomal_haplotype_r2(const string &output_file_prefix, double min_r2) { // Output pairwise LD statistics, using genotype r^2. This is the same formula as used by PLINK, and is basically the squared // correlation coefficient between genotypes numbered as 0, 1, 2. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); unsigned int s, s2; LOG.printLOG("Outputting Interchromosomal Pairwise LD (bi-allelic only)\n"); string output = output_file_prefix + ".interchrom.hap.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_CHR\tR^2" << endl; double D, Dprime; int chr_count; double r2; vector variant_line, variant_line2; entry *e, *e2; e = get_entry_object(N_indv); e2 = get_entry_object(N_indv); for (s=0; s<(N_entries-1); s++) { if (include_entry[s] == false) continue; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = s+1; s2reset(variant_line2); e2->parse_basic_entry(true); if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } if (e->get_CHROM() == e2->get_CHROM()) continue; calc_hap_r2(e, e2, include_genotype[s], include_genotype[s2], r2, D, Dprime, chr_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_CHROM() << "\t" << e2->get_POS() << "\t" << chr_count << "\t" << r2 << endl; } } delete e; delete e2; out.close(); } void variant_file::output_haplotype_r2_of_SNP_list_vs_all_others(const string &output_file_prefix, const string &positions_file, double min_r2) { if 
((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); LOG.printLOG("Outputting haplotype pairwise LD (bi-allelic only) for a set of SNPs verses all others.\n"); vector< set > keep_positions; map chr_to_idx; string line; stringstream ss; string chr; int pos1, idx; unsigned int N_chr=0; ifstream BED(positions_file.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + positions_file); // Skip header BED.ignore(numeric_limits::max(), '\n'); while (!BED.eof()) { getline(BED, line); if (line[0] == '#') continue; ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } BED.close(); unsigned int s, s2; string output = output_file_prefix + ".list.hap.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_CHR\tR^2" << endl; double D, Dprime; int chr_count; double r2; vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; sreset(variant_line); e->parse_basic_entry(true); e->get_CHROM(chr); if (chr_to_idx.find(chr) == chr_to_idx.end()) continue; idx = chr_to_idx[chr]; pos1 = e->get_POS(); if (keep_positions[idx].find(pos1) == keep_positions[idx].end()) continue; if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = 0; s2reset(variant_line2); e2->parse_basic_entry(true); if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } calc_hap_r2(e, e2, include_genotype[s], include_genotype[s2], r2, D, Dprime, chr_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) 
continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_CHROM() << "\t" << e2->get_POS() << "\t" << chr_count << "\t" << r2 << endl; } } out.close(); } void variant_file::output_genotype_r2_of_SNP_list_vs_all_others(const string &output_file_prefix, const string &positions_file, double min_r2) { if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LD Statistics."); LOG.printLOG("Outputting genotype pairwise LD (bi-allelic only) for a set of SNPs verses all others.\n"); vector< set > keep_positions; map chr_to_idx; string line; stringstream ss; string chr; int pos1, idx; unsigned int N_chr=0; ifstream BED(positions_file.c_str()); if (!BED.is_open()) LOG.error("Could not open Positions file: " + positions_file); // Skip header BED.ignore(numeric_limits::max(), '\n'); while (!BED.eof()) { getline(BED, line); if (line[0] == '#') continue; ss.clear(); ss.str(line); ss >> chr >> pos1; if (chr_to_idx.find(chr) == chr_to_idx.end()) { N_chr++; chr_to_idx[chr] = (N_chr-1); keep_positions.resize(N_chr); } idx = chr_to_idx[chr]; keep_positions[idx].insert(pos1); } BED.close(); unsigned int s, s2; string output = output_file_prefix + ".list.geno.ld"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open LD Output File: " + output, 3); out << "CHR1\tPOS1\tCHR2\tPOS2\tN_INDV\tR^2" << endl; int indv_count; double r2; vector variant_line, variant_line2; entry *e = get_entry_object(N_indv); entry *e2 = get_entry_object(N_indv); for (s=0; sreset(variant_line); e->parse_basic_entry(true); e->get_CHROM(chr); if (chr_to_idx.find(chr) == chr_to_idx.end()) continue; idx = chr_to_idx[chr]; pos1 = e->get_POS(); if (keep_positions[idx].find(pos1) == keep_positions[idx].end()) continue; if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } e->parse_genotype_entries(true); for (s2 = 0; s2reset(variant_line2); 
e2->parse_basic_entry(true); if (e2->get_N_alleles() != 2) { LOG.one_off_warning("\tinterchromLD: Only using biallelic variants."); continue; // Isn't biallelic } calc_geno_r2(e, e2, include_genotype[s], include_genotype[s2], r2, indv_count); if (min_r2 > 0) if ((r2 < min_r2) | (r2 != r2)) continue; out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e2->get_CHROM() << "\t" << e2->get_POS() << "\t" << indv_count << "\t" << r2 << endl; } } out.close(); } void variant_file::output_singletons(const string &output_file_prefix) { // Locate and output singletons (and private doubletons) if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Singletons."); LOG.printLOG("Outputting Singleton Locations\n"); string output = output_file_prefix + ".singletons"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Singleton Output File: " + output, 3); out << "CHROM\tPOS\tSINGLETON/DOUBLETON\tALLELE\tINDV" << endl; unsigned int ui; int a; vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; pair geno; string allele; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->parse_genotype_entries(true); e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); N_alleles = e->get_N_alleles(); for (a=0; a<(signed)N_alleles; a++) { if (allele_counts[a] == 1) { // Singleton for (ui=0; uiget_indv_GENOTYPE_ids(ui, geno); if ((geno.first == a) || (geno.second == a)) { e->get_allele(a, allele); out << e->get_CHROM() << "\t" << e->get_POS() << "\tS\t" << allele << "\t" << indv[ui] << endl; ui=N_indv; break; } } } else if (allele_counts[a] == 2) { // Possible doubleton for (ui=0; uiget_indv_GENOTYPE_ids(ui, geno); if ((geno.first == a) && (geno.second == a)) { e->get_allele(a, allele); out << e->get_CHROM() << "\t" << e->get_POS() << "\tD\t" << allele << "\t" << 
indv[ui] << endl; ui=N_indv; break; } } } } } delete e; out.close(); } void variant_file::output_genotype_depth(const string &output_file_prefix) { // Output genotype depth in tab-delimited format. if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Genotype Depth Statistics."); LOG.printLOG("Outputting Depth for Each Genotype\n"); string output = output_file_prefix + ".gdepth"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Genotype Depth Output File: " + output, 7); out << "CHROM\tPOS"; for (unsigned int ui=0; ui variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS(); for (unsigned int ui=0; uiparse_genotype_entry(ui, false, false, true); out << "\t" << e->get_indv_DEPTH(ui); } else out << "\t-1"; } out << endl; } delete e; out.close(); } void variant_file::output_FILTER_summary(const string &output_file_prefix) { // Output a summary of sites in various FILTER categories. 
LOG.printLOG("Outputting Filter Summary (for bi-allelic loci only)\n"); map model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; string FILTER; vector variant_line; entry *e = get_entry_object(N_indv); map > FILTER_to_TsTv; map FILTER_to_Nsites; map::iterator FILTER_to_Nsites_it; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, true); string model = e->get_REF() + e->get_ALT_allele(0); sort(model.begin(), model.end()); FILTER = e->get_FILTER(); FILTER_to_Nsites[FILTER]++; if (model_to_idx.find(model) != model_to_idx.end()) { switch (model_to_idx[model]) { case 1: case 4: FILTER_to_TsTv[FILTER].first++; break; case 0: case 2: case 3: case 5: FILTER_to_TsTv[FILTER].second++; break; default: // Don't count this snp towards Ts/Tv break; } } } vector > count_to_FILTER; for ( FILTER_to_Nsites_it=FILTER_to_Nsites.begin() ; FILTER_to_Nsites_it != FILTER_to_Nsites.end(); ++FILTER_to_Nsites_it ) { FILTER = (*FILTER_to_Nsites_it).first; int Nsites = (*FILTER_to_Nsites_it).second; count_to_FILTER.push_back(make_pair(Nsites, FILTER)); } sort(count_to_FILTER.begin(), count_to_FILTER.end()); string output = output_file_prefix + ".FILTER.summary"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Filter Summary Output File: " + output, 7); out << "FILTER\tN_VARIANTS\tN_Ts\tN_Tv\tTs/Tv" << endl; for (int i=count_to_FILTER.size()-1; i > -1; i--) { FILTER = count_to_FILTER[i].second; int Ts = FILTER_to_TsTv[FILTER].first; int Tv = FILTER_to_TsTv[FILTER].second; int Nsites = FILTER_to_Nsites[FILTER]; out << FILTER << "\t" << Nsites << "\t"; out << Ts << "\t" << Tv << "\t" << double(Ts)/Tv << endl; } delete e; out.close(); } void variant_file::output_TsTv(const string &output_file_prefix, int bin_size) { // Output Ts/Tv ratios in bins of a given size. 
LOG.printLOG("Outputting Ts/Tv in bins of " + output_log::int2str(bin_size) + "bp\n"); map model_to_idx; model_to_idx["AC"] = 0; model_to_idx["AG"] = 1; model_to_idx["AT"] = 2; model_to_idx["CG"] = 3; model_to_idx["CT"] = 4; model_to_idx["GT"] = 5; map max_pos; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); if (max_pos.find(CHROM) != max_pos.end()) { if (e->get_POS() > max_pos[CHROM]) max_pos[CHROM] = e->get_POS(); } else max_pos[CHROM] = e->get_POS(); } } map::iterator it; unsigned int N_bins; map > Ts_counts; map > Tv_counts; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + bin_size) / double(bin_size)); Ts_counts[CHROM].resize(N_bins, 0); Tv_counts[CHROM].resize(N_bins, 0); } vector model_counts(6,0); double C = 1.0 / double(bin_size); unsigned int idx; string model; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; model = e->get_REF() + e->get_ALT_allele(0); sort(model.begin(), model.end()); CHROM = e->get_CHROM(); idx = (unsigned int)(e->get_POS() * C); if (model_to_idx.find(model) != model_to_idx.end()) { model_counts[model_to_idx[model]]++; switch (model_to_idx[model]) { case 1: case 4: Ts_counts[CHROM][idx]++; break; case 0: case 2: case 3: case 5: Tv_counts[CHROM][idx]++; break; default: LOG.error("Unknown idx\n"); break; } } else LOG.warning("Unknown model type. Not a SNP? 
" + CHROM + ":" + output_log::int2str(e->get_POS()) +"\n"); } string output = output_file_prefix + ".TsTv"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open TsTv Output File: " + output, 7); out << "CHROM\tBinStart\tSNP_count\tTs/Tv" << endl; double ratio; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; for (unsigned int s=0; s Ts_counts, Tv_counts; unsigned int N_kept_indv = N_kept_individuals(); Ts_counts.resize(2*N_kept_indv); Tv_counts.resize(2*N_kept_indv); string model; vector variant_line; entry *e = get_entry_object(N_indv); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; vector allele_counts; unsigned int allele_count; unsigned int N_included_indv; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; e->parse_genotype_entries(true); e->get_allele_counts(allele_counts, N_included_indv, include_indv, include_genotype[s]); allele_count = allele_counts[1]; model = e->get_REF() + e->get_ALT_allele(0); if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end()) { idx = model_to_Ts_or_Tv[model]; if (idx == 0) // Ts Ts_counts[allele_count]++; else if (idx == 1) // Tv; Tv_counts[allele_count]++; else LOG.error("Unknown model type\n"); } else LOG.warning("Unknown model type. Not a SNP? 
" + e->get_CHROM() + ":" + output_log::int2str(e->get_POS()) +"\n"); } } string output = output_file_prefix + ".TsTv.count"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open TsTv by Count Output File: " + output, 7); double ratio; out << "ALT_ALLELE_COUNT\tN_Ts\tN_Tv\tTs/Tv" << endl; for (unsigned int ui=0; ui<2*N_kept_indv; ui++) { ratio = double(Ts_counts[ui]) / Tv_counts[ui]; out << ui << "\t" << Ts_counts[ui] << "\t" << Tv_counts[ui] << "\t" << ratio << endl; } delete e; out.close(); } void variant_file::output_TsTv_by_quality(const string &output_file_prefix) { // Output Ts/Tv ratios in bins of a given size. LOG.printLOG("Outputting Ts/Tv By Quality\n"); map > TsTv_counts; double max_qual = -numeric_limits::max(), min_qual=numeric_limits::max(); string model; vector variant_line; entry *e = get_entry_object(N_indv); map model_to_Ts_or_Tv; model_to_Ts_or_Tv["AC"] = 1; model_to_Ts_or_Tv["CA"] = 1; model_to_Ts_or_Tv["AG"] = 0; // Ts model_to_Ts_or_Tv["GA"] = 0; // Ts model_to_Ts_or_Tv["AT"] = 1; model_to_Ts_or_Tv["TA"] = 1; model_to_Ts_or_Tv["CG"] = 1; model_to_Ts_or_Tv["GC"] = 1; model_to_Ts_or_Tv["CT"] = 0; // Ts model_to_Ts_or_Tv["TC"] = 0; // Ts model_to_Ts_or_Tv["GT"] = 1; model_to_Ts_or_Tv["TG"] = 1; unsigned int idx; double QUAL; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (!e->is_biallelic_SNP()) continue; QUAL = e->get_QUAL(); if (QUAL > max_qual) max_qual = QUAL; if (QUAL < min_qual) min_qual = QUAL; model = e->get_REF() + e->get_ALT_allele(0);; if (model_to_Ts_or_Tv.find(model) != model_to_Ts_or_Tv.end()) { idx = model_to_Ts_or_Tv[model]; if (idx == 0) // Ts { TsTv_counts[QUAL].first++; } else if (idx == 1) // Tv; TsTv_counts[QUAL].second++; else LOG.error("Unknown model type\n"); } else LOG.warning("Unknown model type. Not a SNP? 
" + e->get_CHROM() + ":" + output_log::int2str(e->get_POS()) +"\n"); } } string output = output_file_prefix + ".TsTv.qual"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open TsTv by Count Output File: " + output, 7); out << "QUAL_THRESHOLD"; out << "\tN_Ts_LT_QUAL_THRESHOLD\tN_Tv_LT_QUAL_THRESHOLD\tTs/Tv_LT_QUAL_THRESHOLD"; out << "\tN_Ts_GT_QUAL_THRESHOLD\tN_Tv_GT_QUAL_THRESHOLD\tTs/Tv_GT_QUAL_THRESHOLD" << endl; unsigned int N_TsTv = TsTv_counts.size(); vector Ts_sum_below(N_TsTv+1, 0.0), Tv_sum_below(N_TsTv+1, 0.0); vector QUAL_vector(N_TsTv+1, 0.0); QUAL_vector[0] = min_qual; QUAL_vector[N_TsTv] = max_qual; idx = 1; for (map >::iterator it=TsTv_counts.begin(); it != TsTv_counts.end(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_below[idx] = Ts_sum_below[idx-1]+Ts; Tv_sum_below[idx] = Tv_sum_below[idx-1]+Tv; QUAL_vector[idx-1] = QUAL; idx++; } QUAL_vector[N_TsTv] = max_qual; vector Ts_sum_above(N_TsTv+1, 0.0), Tv_sum_above(N_TsTv+1, 0.0); idx = N_TsTv; for (map >::reverse_iterator it=TsTv_counts.rbegin(); it != TsTv_counts.rend(); ++it) { QUAL = (it->first); double Ts = (it->second).first; double Tv = (it->second).second; Ts_sum_above[idx] = Ts_sum_above[idx+1]+Ts; Tv_sum_above[idx] = Tv_sum_above[idx+1]+Tv; idx--; } double Ts_sum, Tv_sum, ratio; for (unsigned int ui=1; ui<(N_TsTv+1); ui++) { QUAL = QUAL_vector[ui-1]; out << QUAL; Ts_sum = Ts_sum_below[ui-1]; Tv_sum = Tv_sum_below[ui-1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; Ts_sum = Ts_sum_above[ui+1]; Tv_sum = Tv_sum_above[ui+1]; ratio = Ts_sum / Tv_sum; out << "\t" << Ts_sum << "\t" << Tv_sum << "\t" << ratio; out << endl; } delete e; out.close(); } void variant_file::output_site_quality(const string &output_file_prefix) { // Output per-site quality information. 
LOG.printLOG("Outputting Quality for Each Site\n"); string output = output_file_prefix + ".lqual"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Site Depth Output File: " + output, 7); out << "CHROM\tPOS\tQUAL" << endl; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << e->get_QUAL() << endl; } delete e; out.close(); } void variant_file::output_site_depth(const string &output_file_prefix, bool output_mean) { // Output per-site depth information if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Site Depth Statistics."); LOG.printLOG("Outputting Depth for Each Site\n"); string output = output_file_prefix + ".ldepth"; if (output_mean) output += ".mean"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Site Depth Output File: " + output, 7); out << "CHROM\tPOS\t"; if (output_mean) out << "MEAN_DEPTH\tVAR_DEPTH" << endl; else out << "SUM_DEPTH\tSUMSQ_DEPTH" << endl; int depth; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); out << e->get_CHROM() << "\t" << e->get_POS() << "\t"; unsigned int sum=0; unsigned int sumsq=0; unsigned int n=0; for (unsigned int ui=0; uiparse_genotype_entry(ui, false, false, true); depth = e->get_indv_DEPTH(ui); if (depth >= 0) { sum += depth; sumsq += (depth*depth); n++; } } if (output_mean) { double mean = double(sum) / n; double var = ((double(sumsq) / n) - (mean*mean)) * double(n) / double(n-1); out << mean << "\t" << var << endl; } else out << sum << "\t" << sumsq << endl; } delete e; out.close(); } void variant_file::output_hapmap_fst(const string &output_file_prefix, const vector &indv_files) { // Calculate Fst using individuals in one (rather than two VCF files) // Calculate, and output, Fst using the 
formula outlined in HapMap I // Namely: // Fst = 1 - (Pi_within / Pi_combined) // where // Pi_within = sum_j(nchoosek(n_j,2) * sum_i(2*n_ij * x_ij * (1-x_ij) / (n_ij -1))) / sum_j(nchoosek(n_j,2)) // and // Pi_between = sum_i(2*n_i*x_i*(1-x_i) / (n_i - 1)) // where j is the population index, and i is the SNP index if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting HapMap-style Fst estimates.\n"); // First, read in the relevant files. vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } string output = output_file_prefix + ".hapmap.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tPOS\tHAPMAP_FST" << endl; entry *e = get_entry_object(N_indv); vector variant_line; vector allele_counts1; double Fst_tot_num=0.0, Fst_tot_denom=0.0; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tFst: Only using biallelic sites."); continue; } e->parse_full_entry(true); e->parse_genotype_entries(true); unsigned int N_chr; e->get_allele_counts(allele_counts1, N_chr, all_indv, include_genotype[s]); double count_all = allele_counts1[1]; double N_chr_all = N_chr; if ((count_all == 0) || (count_all == N_chr_all)) continue; // No polymorphism vector counts(N_pops, 0); vector pop_N_chr(N_pops, 0); vector pop_N_choose_2(N_pops, 0); for (unsigned int p=0; 
pget_allele_counts(allele_counts1, N_chr, indvs_in_pops[p], include_genotype[s]); counts[p] = allele_counts1[1]; pop_N_chr[p] = N_chr; pop_N_choose_2[p] = N_chr * (N_chr-1.0) / 2.0; } double Fst_SNP = 0; double f; double sum1=0.0; for (unsigned int p=0; pget_CHROM() << "\t" << e->get_POS() << "\t" << Fst_SNP << endl; } double Fst_tot = 1.0 - (Fst_tot_num / Fst_tot_denom); LOG.printLOG("HapMap-style Fst = " + output_log::dbl2str(Fst_tot, 6) + "\n"); delete e; out.close(); } void variant_file::output_weir_and_cockerham_fst(const string &output_file_prefix, const vector &indv_files) { // Implements the bi-allelic version of Weir and Cockerham's Fst if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Weir and Cockerham Fst estimates.\n"); // First, read in the relevant files. 
vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } string output = output_file_prefix + ".weir.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tPOS\tWEIR_AND_COCKERHAM_FST" << endl; entry *e = get_entry_object(N_indv); vector variant_line; double snp_Fst; double sum1=0.0, sum2 = 0.0; double sum3=0.0, count = 0.0; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tFst: Only using biallelic sites."); continue; } e->parse_full_entry(true); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tFst: Only using diploid sites."); continue; } vector n; n.resize(N_pops, 0); vector p; p.resize(N_pops, 0); double nbar = 0.0, pbar=0.0, hbar=0.0; double ssqr=0.0; double sum_nsqr = 0.0; double n_sum = 0.0; unsigned int N_hom1, N_het, N_hom2; for (unsigned int j=0; jget_genotype_counts(indvs_in_pops[j], include_genotype[s], N_hom1, N_het, N_hom2); n[j] = N_hom1 + N_het + N_hom2; hbar += N_het; p[j] = N_het + 2*N_hom2; nbar += n[j]; pbar += p[j]; sum_nsqr += (n[j] * n[j]); p[j] /= (2.0*n[j]); // diploid } n_sum = nbar; nbar /= double(N_pops); hbar /= n_sum; pbar /= (n_sum*2.0); // diploid for (unsigned int j=0; jget_CHROM() << "\t" << e->get_POS() << "\t" << snp_Fst << endl; } double weighted_Fst = sum1 / sum2; double mean_Fst = sum3 / count; LOG.printLOG("Weir and Cockerham mean Fst estimate: " + output_log::dbl2str(mean_Fst, 5) + "\n"); LOG.printLOG("Weir and Cockerham weighted Fst estimate: " + 
output_log::dbl2str(weighted_Fst, 5) + "\n"); delete e; } void variant_file::output_windowed_weir_and_cockerham_fst(const string &output_file_prefix, const vector &indv_files, int fst_window_size, int fst_window_step) { if (fst_window_size <= 0) return; if ((fst_window_step <= 0) || (fst_window_step > fst_window_size)) fst_window_step = fst_window_size; if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Windowed Weir and Cockerham Fst estimates.\n"); // First, read in the relevant files. vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } // Find maximum position on each chromosome map max_pos; map::iterator it; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); if (max_pos.find(CHROM) != max_pos.end()) { if (e->get_POS() > max_pos[CHROM]) max_pos[CHROM] = e->get_POS(); } else max_pos[CHROM] = e->get_POS(); } } // Calculate number of bins for each chromosome and allocate memory for them. 
// Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele unsigned int N_bins; const vector< double > empty_vector(4, 0); // sum1, sum2, sum3, count map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int) ceil( (max_pos[CHROM]+1) / double(fst_window_step)); bins[CHROM].resize(N_bins, empty_vector); } double snp_Fst; double sum1=0.0, sum2 = 0.0; double sum3=0.0, count = 0.0; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tFst: Only using biallelic sites."); continue; } e->parse_full_entry(true); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tFst: Only using diploid sites."); continue; } vector n; n.resize(N_pops, 0); vector p; p.resize(N_pops, 0); double nbar = 0.0, pbar=0.0, hbar=0.0; double ssqr=0.0; double sum_nsqr = 0.0; double n_sum = 0.0; unsigned int N_hom1, N_het, N_hom2; for (unsigned int j=0; jget_genotype_counts(indvs_in_pops[j], include_genotype[s], N_hom1, N_het, N_hom2); n[j] = N_hom1 + N_het + N_hom2; hbar += N_het; p[j] = N_het + 2*N_hom2; nbar += n[j]; pbar += p[j]; sum_nsqr += (n[j] * n[j]); p[j] /= (2.0*n[j]); // diploid } n_sum = nbar; nbar /= double(N_pops); hbar /= n_sum; pbar /= (n_sum*2.0); // diploid for (unsigned int j=0; jget_POS(); CHROM = e->get_CHROM(); int first = (int) ceil((pos - fst_window_size)/double(fst_window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(fst_window_step)); for(int idx = first; idx < last; idx++) { 
bins[CHROM][idx][0] += S1; bins[CHROM][idx][1] += S2; bins[CHROM][idx][2] += snp_Fst; bins[CHROM][idx][3]++; } sum1 += S1; sum2 += S2; sum3 += snp_Fst; count++; } } double weighted_Fst = sum1 / sum2; double mean_Fst = sum3 / count; LOG.printLOG("Weir and Cockerham mean Fst estimate: " + output_log::dbl2str(mean_Fst, 5) + "\n"); LOG.printLOG("Weir and Cockerham weighted Fst estimate: " + output_log::dbl2str(weighted_Fst, 5) + "\n"); string output = output_file_prefix + ".windowed.weir.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tWEIGHTED_FST\tMEAN_FST" << endl; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; for (unsigned int s=0; s 0)) { double weighted_Fst = bins[CHROM][s][0] / bins[CHROM][s][1]; double mean_Fst = bins[CHROM][s][2] / bins[CHROM][s][3]; out << CHROM << "\t" << s*fst_window_step + 1 << "\t" << (s*fst_window_step + fst_window_size) << "\t" << bins[CHROM][s][3] << "\t" << weighted_Fst << "\t" << mean_Fst << endl; } } } out.close(); delete e; } void variant_file::output_windowed_hapmap_fst(const string &output_file_prefix, const vector &indv_files, int fst_window_size, int fst_window_step) { if (fst_window_size <= 0) return; if ((fst_window_step <= 0) || (fst_window_step > fst_window_size)) fst_window_step = fst_window_size; if (indv_files.size() == 1) { LOG.printLOG("Require at least two populations to estimate Fst. Skipping\n"); return; } if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Fst statistics."); LOG.printLOG("Outputting Windowed HapMap Fst estimates.\n"); // First, read in the relevant files. 
vector< vector > indvs_in_pops; unsigned int N_pops = indv_files.size(); indvs_in_pops.resize(N_pops, vector(N_indv, false)); vector all_indv(N_indv,false); map indv_to_idx; for (unsigned int ui=0; ui> tmp_indv; if (indv_to_idx.find(tmp_indv) != indv_to_idx.end()) { indvs_in_pops[ui][indv_to_idx[tmp_indv]]=true; all_indv[indv_to_idx[tmp_indv]]=true; } ss.clear(); } indv_file.close(); } // Find maximum position on each chromosome map max_pos; map::iterator it; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); if (max_pos.find(CHROM) != max_pos.end()) { if (e->get_POS() > max_pos[CHROM]) max_pos[CHROM] = e->get_POS(); } else max_pos[CHROM] = e->get_POS(); } } // Calculate number of bins for each chromosome and allocate memory for them. // Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele unsigned int N_bins; const vector< double > empty_vector(4, 0); // sum1, sum2, sum3, count map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int) ceil( (max_pos[CHROM]+1) / double(fst_window_step)); bins[CHROM].resize(N_bins, empty_vector); } vector allele_counts1; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tFst: Only using biallelic sites."); continue; } e->parse_full_entry(true); e->parse_genotype_entries(true); unsigned int N_chr; e->get_allele_counts(allele_counts1, N_chr, all_indv, include_genotype[s]); double count_all = 
allele_counts1[1]; double N_chr_all = N_chr; if ((count_all == 0) || (count_all == N_chr_all)) continue; // No polymorphism vector counts(N_pops, 0); vector pop_N_chr(N_pops, 0); vector pop_N_choose_2(N_pops, 0); for (unsigned int p=0; pget_allele_counts(allele_counts1, N_chr, indvs_in_pops[p], include_genotype[s]); counts[p] = allele_counts1[1]; pop_N_chr[p] = N_chr; pop_N_choose_2[p] = N_chr * (N_chr-1.0) / 2.0; } double Fst_SNP = 0; double f; double sum1=0.0; for (unsigned int p=0; pget_POS(); CHROM = e->get_CHROM(); int first = (int) ceil((pos - fst_window_size)/double(fst_window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(fst_window_step)); for(int idx = first; idx < last; idx++) { bins[CHROM][idx][0] += Fst_num; bins[CHROM][idx][1] += tmp; bins[CHROM][idx][2] += Fst_SNP; bins[CHROM][idx][3]++; } } string output = output_file_prefix + ".windowed.hapmap.fst"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Fst Output File: " + output, 7); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tWEIGHTED_FST\tMEAN_FST" << endl; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; for (unsigned int s=0; s 0)) { double weighted_Fst = 1.0 - (bins[CHROM][s][0] / bins[CHROM][s][1]); double mean_Fst = bins[CHROM][s][2] / bins[CHROM][s][3]; out << CHROM << "\t" << s*fst_window_step + 1 << "\t" << (s*fst_window_step + fst_window_size) << "\t" << bins[CHROM][s][3] << "\t" << weighted_Fst << "\t" << mean_Fst << endl; } } } out.close(); delete e; } void variant_file::output_per_site_nucleotide_diversity(const string &output_file_prefix) { // Output nucleotide diversity, calculated on a per-site basis. 
// Pi = average number of pairwise differences // Assumes a constant distance of 1 between all possible mutations if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics."); LOG.printLOG("Outputting Per-Site Nucleotide Diversity Statistics...\n"); string output_file = output_file_prefix + ".sites.pi"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tPOS\tPI" << endl; vector variant_line; entry *e = get_entry_object(N_indv); vector allele_counts; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); e->parse_full_entry(true); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tsitePi: Only using fully diploid sites."); continue; } unsigned int N_non_missing_chr; e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); unsigned int total_alleles = std::accumulate(allele_counts.begin(), allele_counts.end(), 0); unsigned int N_alleles = e->get_N_alleles(); int mismatches = 0; for(unsigned int allele = 0; allele < N_alleles; allele++) { int other_alleles_count = (total_alleles - allele_counts[allele]); mismatches += (allele_counts[allele] * other_alleles_count); } int pairs = (total_alleles * (total_alleles - 1)); double pi = (mismatches/static_cast(pairs)); out << e->get_CHROM() << "\t" << e->get_POS() << "\t" << pi << endl; } delete e; } // Output Tajima's D // Carlson et al. 
Genome Res (2005) void variant_file::output_Tajima_D(const string &output_file_prefix, int window_size) { if (window_size <= 0) return; if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Tajima's D Statistic."); LOG.printLOG("Outputting Tajima's D Statistic...\n"); string output_file = output_file_prefix + ".Tajima.D"; double a1=0.0, a2=0.0, b1, b2, c1, c2, e1, e2; unsigned int n = N_kept_individuals()*2; if (n < 2) LOG.error("Require at least two chromosomes!"); for (unsigned int ui=1; ui max_pos; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); if (max_pos.find(CHROM) != max_pos.end()) { if (e->get_POS() > max_pos[CHROM]) max_pos[CHROM] = e->get_POS(); } else max_pos[CHROM] = e->get_POS(); } } map::iterator it; unsigned int N_bins; map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int)((max_pos[CHROM] + window_size) / double(window_size)); bins[CHROM].resize(N_bins, make_pair(0,0)); } unsigned int idx; double C = 1.0 / double(window_size); vector allele_counts; unsigned int N_non_missing_chr; unsigned int N_alleles; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) { LOG.one_off_warning("\tTajimaD: Only using bialleleic sites."); continue; } CHROM = e->get_CHROM(); idx = (unsigned int)(e->get_POS() * C); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tTajimaD: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); double p = double(allele_counts[0]) / N_non_missing_chr; if ((p > 0.0) && (p < 1.0)) { bins[CHROM][idx].first++; bins[CHROM][idx].second += p * (1.0-p); } } ofstream 
out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tBIN_START\tN_SNPS\tTajimaD" << endl; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; bool output = false; for (unsigned int s=0; s 1) { double pi = 2.0*bins[CHROM][s].second*n/double(n-1); double tw = double(S) / a1; double var = (e1*S) + e2*S*(S-1); D = (pi - tw) / sqrt(var); output = true; } if (S > 0) output = true; if (output == true) out << CHROM << "\t" << s*window_size << "\t" << bins[CHROM][s].first << "\t" << D << endl; } } delete e; out.close(); } void variant_file::output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size, int window_step) { // Output nucleotide diversity, as calculated in windows. // Average number of pairwise differences in windows. if (window_size <= 0) return; if ((window_step <= 0) || (window_step > window_size)) window_step = window_size; if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics."); LOG.printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n"); string output_file = output_file_prefix + ".windowed.pi"; // Find maximum position on each chromosome map max_pos; map::iterator it; string CHROM; vector variant_line; entry *e = get_entry_object(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); CHROM = e->get_CHROM(); if (max_pos.find(CHROM) != max_pos.end()) { if (e->get_POS() > max_pos[CHROM]) max_pos[CHROM] = e->get_POS(); } else max_pos[CHROM] = e->get_POS(); } } // Calculate number of bins for each chromosome and allocate memory for them. 
// Each bin is a vector with four entries: // N_variant_sites: Number of sites in a window that have VCF entries // N_variant_site_pairs: Number of possible pairwise mismatches at polymorphic sites within a window // N_mismatches: Number of actual pairwise mismatches at polymorphic sites within a window // N_polymorphic_sites: number of sites within a window where there is at least 1 sample that is polymorphic with respect to the reference allele unsigned int N_bins; const unsigned int N_variant_sites = 0; const unsigned int N_variant_site_pairs = 1; const unsigned int N_mismatches = 2; const unsigned int N_polymorphic_sites = 3; const vector< unsigned long > empty_vector(4, 0); map > > bins; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; N_bins = (unsigned int) ceil( (max_pos[CHROM]+1) / double(window_step)); bins[CHROM].resize(N_bins, empty_vector); } // Count polymorphic sites and pairwise mismatches vector allele_counts; unsigned int N_non_missing_chr; unsigned long N_comparisons; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); CHROM = e->get_CHROM(); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\twindowPi: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); unsigned int N_site_mismatches = 0; for (vector::iterator ac = allele_counts.begin(); ac != allele_counts.end(); ++ac) { N_site_mismatches += (*ac * (N_non_missing_chr - *ac)); } if (N_site_mismatches == 0) continue; // Site is actually fixed. 
// Place the counts into bins int pos = (int)e->get_POS(); int first = (int) ceil((pos - window_size)/double(window_step)); if (first < 0) first = 0; int last = (int) ceil(pos/double(window_step)); N_comparisons = N_non_missing_chr * (N_non_missing_chr - 1); for(int idx = first; idx < last; idx++) { bins[CHROM][idx][N_variant_sites]++; bins[CHROM][idx][N_variant_site_pairs] += N_comparisons; bins[CHROM][idx][N_mismatches] += N_site_mismatches; if(allele_counts[0] < (signed)N_non_missing_chr) bins[CHROM][idx][N_polymorphic_sites]++; } } // Calculate and print nucleotide diversity statistics ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tPI" << endl; unsigned long N_monomorphic_sites = 0; int N_kept_chr = 2*N_kept_individuals(); N_comparisons = (N_kept_chr * (N_kept_chr - 1)); // Number of pairwise comparisons at a monomorphic site unsigned long N_pairs = 0; // Number of pairwise comparisons within a window double pi = 0; for (it=max_pos.begin(); it != max_pos.end(); ++it) { CHROM = (*it).first; for (unsigned int s=0; s 0) || (bins[CHROM][s][N_mismatches] > 0) ) { // This number can be slightly off for the last bin since the // window size can go off the end of the chromosome. N_monomorphic_sites = window_size - bins[CHROM][s][N_variant_sites]; // The total number of possible pairwise comparisons is the sum of // pairwise comparisons at polymorphic sites and pairwise // comparisons at monomorphic sites. 
N_pairs = bins[CHROM][s][N_variant_site_pairs] + (N_monomorphic_sites * N_comparisons); pi = bins[CHROM][s][N_mismatches] / double(N_pairs); out << CHROM << "\t" << s*window_step + 1 << "\t" << (s*window_step + window_size) << "\t" << bins[CHROM][s][N_polymorphic_sites] << "\t" << pi << endl; } } } delete e; out.close(); } void variant_file::output_kept_and_removed_sites(const string &output_file_prefix) { // Output lists of sites that have been filtered (or not). LOG.printLOG("Outputting Kept and Removed Sites...\n"); string output_file1 = output_file_prefix + ".kept.sites"; string output_file2 = output_file_prefix + ".removed.sites"; string CHROM; vector variant_line; int POS; entry *e = get_entry_object(N_indv); ofstream out1(output_file1.c_str()); if (!out1.is_open()) LOG.error("Could not open output file: " + output_file1, 12); out1 << "CHROM\tPOS" << endl; ofstream out2(output_file2.c_str()); if (!out2.is_open()) LOG.error("Could not open output file: " + output_file2, 12); out2 << "CHROM\tPOS" << endl; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(); POS = e->get_POS(); CHROM = e->get_CHROM(); if (include_entry[s] == true) out1 << CHROM << "\t" << POS << endl; else out2 << CHROM << "\t" << POS << endl; } delete e; out1.close(); out2.close(); } void variant_file::output_LROH(const string &output_file_prefix) { // Detect and output Long Runs of Homozygosity, following the method // developed by Adam Boyko, and described in Auton et al., Genome Research, 2009 // (Although using Forward-backwards algorithm in place of Viterbi). if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output LROH."); LOG.printLOG("Outputting Long Runs of Homozygosity (Experimental)... 
\n"); string output_file = output_file_prefix + ".LROH"; unsigned int nGen=4; // Number of generations since common ancestry double genotype_error_rate = 0.01; // Assumed genotype error rate double p_auto_prior = 0.05; // Prior probability of being in autozygous state double p_auto_threshold = 0.99; // Threshold for reporting autozygous region int min_SNPs=0; // Threshold for reporting autozygous region string CHROM; vector variant_line; int POS; entry *e = get_entry_object(N_indv); pair alleles; vector s_vector; vector > p_emission; vector > p_trans; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open output file: " + output_file, 12); out << "CHROM\tAUTO_START\tAUTO_END\tN_VARIANTS\tINDV" << endl; // TODO - refactor this so that Entries loop is on the outside. for (unsigned int ui=0; uireset(variant_line); e->parse_basic_entry(true); if (e->get_N_alleles() != 2) { LOG.one_off_warning("\tLROH: Only using bialleleic sites."); continue; // TODO: Probably could do without this... } POS = e->get_POS(); e->parse_genotype_entry(ui, true); e->get_indv_GENOTYPE_ids(ui, alleles); if (e->get_indv_ploidy(ui) != 2) { LOG.one_off_warning("\tLROH: Only using diploid sites."); continue; } if ((alleles.first == -1) || (alleles.second == -1)) continue; unsigned int X = alleles.first + alleles.second; // Calculate heterozyogosity of this site. // TODO: Would be better to do this once, but for simplicity, do it for each individual. 
unsigned int N_genotypes = 0; unsigned int N_hets = 0; for (unsigned int uj=0; ujparse_genotype_entry(uj, true); e->get_indv_GENOTYPE_ids(uj, alleles); if ((alleles.first != -1) && (alleles.second != -1)) { N_genotypes++; if (alleles.first != alleles.second) N_hets++; } } double h = N_hets / double(N_genotypes); double p_emission_given_nonauto; double p_emission_given_auto; if (X == 1) { // Heterozygote p_emission_given_nonauto = h; p_emission_given_auto = genotype_error_rate; p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto)); } else { // Homozygote p_emission_given_nonauto = 1.0-h; p_emission_given_auto = 1.0-genotype_error_rate; p_emission.push_back(make_pair(p_emission_given_auto, p_emission_given_nonauto)); } double r = 0; if (last_POS > 0) { // Assume 1cM/Mb. r = (POS - last_POS) / 1000000.0 / 100.0; // Morgans } double e = (1.0 - exp(-2.0*nGen*r)); double p_trans_auto_to_nonauto = (1.0 - p_auto_prior) * e; //A[1] double p_trans_nonauto_to_auto = p_auto_prior * e; //A[2] double p_trans_auto_to_auto = 1.0 - p_trans_nonauto_to_auto; //A[0] double p_trans_nonauto_to_nonauto = 1.0 - p_trans_auto_to_nonauto; // A[3] vector A(4); A[0] = p_trans_auto_to_auto; A[1] = p_trans_auto_to_nonauto; A[2] = p_trans_nonauto_to_auto; A[3] = p_trans_nonauto_to_nonauto; s_vector.push_back(s); p_trans.push_back(A); last_POS = POS; } // Forward-backward algorithm int N_obs = (int)p_emission.size(); if (N_obs == 0) continue; vector > alpha(N_obs, vector(2,0)); vector > beta(N_obs, vector(2,0)); alpha[0][0] = p_emission[0].first; alpha[0][1] = p_emission[0].second; for (int i=1; i=0; i--) { beta[i][0] = beta[i+1][0] * p_trans[i][0] * p_emission[i].first; beta[i][0] += beta[i+1][1] * p_trans[i][2] * p_emission[i].first; beta[i][1] = beta[i+1][1] * p_trans[i][3] * p_emission[i].second; beta[i][1] += beta[i+1][0] * p_trans[i][1] * p_emission[i].second; while (beta[i][0] + beta[i][1] < 1e-20) { // Renormalise to prevent underflow beta[i][0] *= 1e20; 
beta[i][1] *= 1e20; } } // Calculate probability of each site being autozygous vector p_auto(N_obs); for (int i=0; ithreshold. // TODO: Also would be good to report heterozygotic SNPs found in homozygotic regions. bool in_auto=false; int start_pos=0, end_pos=0; int N_SNPs = 0; for (int i=0; i p_auto_threshold) { if (in_auto == false) { // Start of autozygous region unsigned int s = s_vector[i]; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); CHROM = e->get_CHROM(); start_pos = e->get_POS(); } N_SNPs++; in_auto = true; } else { if (in_auto == true) { // end of autozygous region unsigned int s = s_vector[i]; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); end_pos = e->get_POS(); if (N_SNPs >= min_SNPs) out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl; } in_auto = false; N_SNPs = 0; } } if (in_auto == true) { // Report final region if needed unsigned int s = s_vector[N_obs-1]; get_entry(s, variant_line); e->reset(variant_line); e->parse_basic_entry(true); end_pos = e->get_POS(); if (N_SNPs >= min_SNPs) out << CHROM << "\t" << start_pos << "\t" << end_pos << "\t" << N_SNPs << "\t" << indv[ui] << endl; } } delete e; out.close(); } void variant_file::output_indv_relatedness(const string &output_file_prefix) { // Calculate and output a relatedness statistic based on the method of // Yang et al, 2010 (doi:10.1038/ng.608). Specifically, calculate the // unadjusted Ajk statistic (equation 6 of paper). // Expectation of Ajk is zero for individuals within a populations, and // one for an individual with themselves. 
if ((has_genotypes == false) | (N_kept_individuals() == 0)) LOG.error("Require Genotypes in VCF file in order to output Individual Relatedness."); LOG.printLOG("Outputting Individual Relatedness\n"); string output = output_file_prefix + ".relatedness"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Individual Relatedness Output File: " + output, 2); out << "INDV1\tINDV2\tRELATEDNESS" << endl; vector variant_line; entry *e = get_entry_object(N_indv); vector allele_counts; unsigned int N_alleles, N_non_missing_chr; double freq; pair geno_id; vector > Ajk(N_indv, vector(N_indv, 0.0)); vector > N_sites(N_indv, vector(N_indv, 0.0)); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) { LOG.one_off_warning("\tRelatedness: Only using biallelic sites."); continue; // Only use biallelic loci } e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) { LOG.one_off_warning("\tRelatedness: Only using fully diploid sites."); continue; } e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; vector x(N_indv, -1.0); for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x[ui] = geno_id.first + geno_id.second; } double div = 1.0/(2.0*freq*(1.0-freq)); for (unsigned int ui=0; ui= N_sites) LOG.error("PCA computation requires that there are more sites than individuals."); vector variant_line; entry *e = get_entry_object(N_indv); pair geno_id; double x, freq; vector allele_counts; unsigned int N_alleles, N_non_missing_chr; // Store list of included individuals vector included_indvs(N_indvs); unsigned int ui_prime = 0; for (unsigned int ui=0; uireset(variant_line); e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if 
(N_alleles != 2) LOG.error("PCA only works for biallelic sites."); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) LOG.error("PCA only works for fully diploid sites. Non-diploid site at " + e->get_CHROM() + ":" + output_log::int2str(e->get_POS())); e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; double mu = freq*2.0; double div = 1.0 / sqrt(freq * (1.0-freq)); ui_prime = 0; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x = geno_id.first + geno_id.second; if (x > -1) { if (use_normalisation == true) M[ui_prime][s_prime] = (x - mu) * div; else M[ui_prime][s_prime] = (x - mu); } ui_prime++; } s_prime++; } // Now construct X = (1/n)MM'. double **X = new double *[N_indvs]; for (unsigned int ui=0; ui 0) { // Output SNP loadings LOG.printLOG("Outputting " + output_log::int2str(SNP_loadings_N_PCs) + " SNP loadings\n"); output = output_file_prefix + ".pca.loadings"; out.open(output.c_str()); if (!out.good()) LOG.error("Could not open Principal Component SNP Loading Output File: " + output, 2); out << "CHROM\tPOS"; for (unsigned int ui=0; ui<(unsigned int)SNP_loadings_N_PCs; ui++) out << "\tGAMMA_" << ui; out << endl; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); N_alleles = e->get_N_alleles(); if (N_alleles != 2) LOG.error("PCA only works for biallelic sites."); e->parse_genotype_entries(true); if (e->is_diploid(include_indv, include_genotype[s]) == false) LOG.error("PCA only works for fully diploid sites."); e->get_allele_counts(allele_counts, N_non_missing_chr, include_indv, include_genotype[s]); freq = allele_counts[1] / (double)N_non_missing_chr; // Alt allele frequency if ((freq <= numeric_limits::epsilon()) || (freq >= (1.0-numeric_limits::epsilon()))) continue; 
vector gamma(SNP_loadings_N_PCs, 0.0); vector a_sum(SNP_loadings_N_PCs, 0.0); ui_prime = 0; for (unsigned int ui=0; uiget_indv_GENOTYPE_ids(ui, geno_id); x = geno_id.first + geno_id.second; if (x > -1) { for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++) { gamma[uj] += (x * Evecs[ui_prime][uj]); a_sum[uj] += (Evecs[ui_prime][uj]*Evecs[ui_prime][uj]); } } ui_prime++; } out << e->get_CHROM() << "\t" << e->get_POS(); for (unsigned int uj=0; uj<(unsigned int)SNP_loadings_N_PCs; uj++) out << "\t" << gamma[uj] / a_sum[uj]; out << endl; } out.close(); } delete e; delete [] Er; delete [] Ei; delete [] Evecs; delete [] X; #endif } void variant_file::output_indel_hist(const string &output_file_prefix) { vector variant_line; entry *e = get_entry_object(N_indv); string allele; unsigned int ref_len, N_alleles; int indel_len, smallest_len, largest_len, snp_count; vector s_vector; string output = output_file_prefix + ".indel.hist"; ofstream out(output.c_str()); if (!out.is_open()) LOG.error("Could not open Indel Hist File: " + output, 7); LOG.printLOG("Outputting Indel Histogram\n"); out << "LENGTH\tCOUNT\tPRCT" << endl; largest_len = 0; smallest_len = 0; snp_count = 0; for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true); allele = e->get_REF(); ref_len = allele.size(); N_alleles = e->get_N_alleles(); if (e->is_SNP() ) snp_count++; for (unsigned int ui=1; uiget_allele(ui, allele); if (allele.size() != ref_len) { indel_len = allele.size() - ref_len; s_vector.push_back (indel_len); if (indel_len > largest_len) largest_len = indel_len; else if (indel_len < smallest_len) smallest_len = indel_len; } } } double total = s_vector.size() + snp_count; double pct; for (int i=smallest_len; i<=largest_len; i++) { int icount = (int) count (s_vector.begin(), s_vector.end(), i); if (icount > 0) { pct = 100.0*icount/total; out << i << "\t" << icount << "\t" << pct << endl; } else if ((i == 0) and (snp_count>0)) { pct = 100.0*snp_count/total; out << i << "\t" << 
snp_count << "\t" << pct << endl; } } out.close(); } vcftools_0.1.11/cpp/gamma.h0000644000000000000000000000057412156354766014253 0ustar rootroot#ifndef GAMMA_H #define GAMMA_H #include #include #include #include #include #include #include using namespace std; double gammln(double xx); double gcf(double a, double x, double &gln); double gser(double a, double x, double &gln); double gammp(double a, double x); double gammq(double a, double x); #endif vcftools_0.1.11/cpp/vcf_file.cpp0000644000000000000000000003751012156354766015301 0ustar rootroot/* * vcf_file.cpp * * Created on: Dec 11, 2012 * Author: amarcketta */ #include "vcf_file.h" vcf_file::vcf_file(const string &fname, bool comp, const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index) { filename = fname; compressed = comp; has_body = false; has_file_format = false; has_header = false; has_meta = false; bcf_format = false; has_genotypes = false; has_contigs = false; contig_index = 0; gzMAX_LINE_LEN = 0; meta.clear(); open(); scan_file(chrs_to_keep, exclude_chrs, force_write_index); } vcf_file::vcf_file() { gzvcf_in = false; gz_readbuffer = NULL; compressed = false; has_body = false; has_file_format = false; has_header = false; has_meta = false; has_genotypes = false; has_contigs = false; contig_index = 0; gzMAX_LINE_LEN = 0; meta.clear(); } vcf_file::~vcf_file() { close(); } // Parse VCF meta information void vcf_file::parse_meta(const string &line, unsigned int &line_index) { has_meta = true; meta.push_back(line); size_t found=line.find("##fileformat="); if (found!=string::npos) { has_file_format = true; found = line.find_first_of("="); string version = line.substr(found+1); if ((version != "VCFv4.0") && (version != "VCFv4.1")) LOG.error("VCF version must be v4.0 or v4.1:\nYou are using version " + version); } found=line.find("##INFO="); if (found!=string::npos) { // Found an INFO descriptor line_index += vcf_entry::add_INFO_descriptor(line, line_index); } found=line.find("##FILTER="); if 
(found!=string::npos) { // Found a FILTER descriptor line_index += vcf_entry::add_FILTER_descriptor(line, line_index); } found=line.find("##FORMAT="); if (found!=string::npos) { // Found a genotype filter descriptor line_index += vcf_entry::add_FORMAT_descriptor(line, line_index); } //ALT FIELDS NO LONGER COUNT // found=line.find("##ALT="); // if (found!=string::npos) // line_index += 1; found=line.find("##contig="); if (found!=string::npos) { // Found a contig descriptor vcf_entry::add_CONTIG_descriptor(line, contig_index); contig_index++; has_contigs = true; } } void vcf_file::parse_header(const string &line) { // #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) if (has_header == true) LOG.warning("Multiple Header lines."); has_header = true; istringstream header(line); int count = 0; string tmp_str; unsigned int N_header_indv = 0; has_genotypes = false; while (!header.eof()) { getline(header, tmp_str, '\t'); switch (count) { case 0: if (tmp_str != "#CHROM") LOG.warning("First Header entry should be #CHROM: " + tmp_str); break; case 1: if (tmp_str != "POS") LOG.warning("Second Header entry should be POS: " + tmp_str); break; case 2: if (tmp_str != "ID") LOG.warning("Third Header entry should be ID: " + tmp_str); break; case 3: if (tmp_str != "REF") LOG.warning("Fourth Header entry should be REF: " + tmp_str); break; case 4: if (tmp_str != "ALT") LOG.warning("Fifth Header entry should be ALT: " + tmp_str); break; case 5: if (tmp_str != "QUAL") LOG.warning("Sixth Header entry should be QUAL: " + tmp_str); break; case 6: if (tmp_str != "FILTER") LOG.warning("Seventh Header entry should be FILTER: " + tmp_str); break; case 7: if (tmp_str != "INFO") LOG.warning("Eighth Header entry should be INFO: " + tmp_str); break; case 8: if (tmp_str != "FORMAT") LOG.warning("Ninth Header entry should be FORMAT: " + tmp_str); else has_genotypes = true; break; default: { if (count <= 8) LOG.error("Incorrectly formatted header."); indv.push_back(tmp_str); 
N_header_indv++; } break; } count++; } N_indv = N_header_indv; if ((has_genotypes == true ) && (N_indv == 0)) LOG.warning("FORMAT field without genotypes?"); } void vcf_file::scan_file(const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index) { bool filter_by_chr = (chrs_to_keep.size() != 0); bool exclude_by_chr = (exclude_chrs.size() != 0); string index_filename = filename + ".vcfidx"; bool could_read_index_file = false; if (force_write_index == false) could_read_index_file = read_index_file(index_filename); string CHROM, last_CHROM=""; unsigned int meta_counter = 1; if (could_read_index_file == false) { int POS, last_POS = -1; bool found_header = false; bool found_meta = false; LOG.printLOG("Building new index file.\n"); string line, CHROM, last_CHROM = ""; streampos filepos; char c; N_entries=0; N_indv = 0; while (!eof()) { filepos = get_filepos(); c = peek(); if ((c == '\n') || (c == '\r')) { read_line(line); continue; } else if (c == EOF) break; if (c == '#') { read_line(line); if (line[1] == '#') { // Meta information parse_meta(line, meta_counter); found_meta = true; } else { // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) parse_header(line); found_header = true; } } else { // Must be a data line if ((found_header == false) || (found_meta == false)) LOG.error("No header or meta information. Invalid file: " + filename); read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS); if (POS == last_POS) { if (last_CHROM == CHROM) LOG.one_off_warning("\tWarning - file contains entries with the same position. 
These entries will be processed separately.\n"); } else if (last_POS > POS) { if (last_CHROM == CHROM) LOG.error(" VCF file is not sorted at position " + CHROM + ":" + LOG.int2str(POS) + ".\n"); } if (last_CHROM != CHROM) { LOG.printLOG("\tScanning Chromosome: " + CHROM + "\n"); last_CHROM = CHROM; } last_POS = POS; entry_file_locations.push_back(filepos); N_entries++; } } if ((found_header == false) || (found_meta == false)) LOG.error("No header or meta information. Invalid file: " + filename); write_index_file(index_filename); } else { vector meta_lines = meta; meta.resize(0); meta_counter = 1; for (unsigned int ui=0; ui contig_vector; get_default_contigs(contig_vector); for(unsigned int ui=0; ui 0); if ((exclude_by_chr == true) || (filter_by_chr == true)) { unsigned int N_found_required_chr = chrs_to_keep.size(); LOG.printLOG("Filtering by chromosome.\n"); for (unsigned int ui=0; ui 0) && (entry_file_locations[0] < 0)) entry_file_locations.pop_front(); N_entries = entry_file_locations.size(); LOG.printLOG("Keeping " + output_log::int2str(N_entries) + " entries on specified chromosomes.\n"); } include_indv.clear(); include_indv.resize(N_indv, true); include_entry.clear(); include_entry.resize(N_entries, true); include_genotype.clear(); include_genotype.resize(N_entries, vector(N_indv, true)); } void vcf_file::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO) { for (unsigned int ui=0; ui 0) out << "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new vcf_entry(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true,true,true,true); e->print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]); } delete e; } void vcf_file::print(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO) { LOG.printLOG("Outputting VCF file... 
"); string output_file = output_file_prefix + ".recode.vcf"; ofstream out(output_file.c_str()); if (!out.is_open()) LOG.error("Could not open VCF Output File: " + output_file, 3); print(out, INFO_to_keep, keep_all_INFO); out.close(); LOG.printLOG("Done\n"); } void vcf_file::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) { string header_str; uint32_t len_text = 0; vector header; char magic[5] = {'B','C','F','\2', '\1'}; bgzf_write(out, magic, 5); for (unsigned int ui=0; ui contig_vector; get_default_contigs(contig_vector); for(unsigned int ui=0; ui 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui variant_line; entry * e = new vcf_entry(N_indv); for (unsigned int s=0; sreset(variant_line); e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true,true,true,true); e->print_bcf(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype[s]); } delete e; } void vcf_file::print_bcf(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO, bool stream) { LOG.printLOG("Outputting BCF file... "); BGZF * out; if(!stream) { string output_file = output_file_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); print_bcf(out, INFO_to_keep, keep_all_INFO); bgzf_close(out); LOG.printLOG("Done\n"); } void vcf_file::open() { struct stat buf; int i = stat(filename.c_str(), &buf); if (i != 0) { perror("stat error"); LOG.error("Can't determine file type of " + filename, 0); } if (!S_ISREG(buf.st_mode)) LOG.error("Does not appear to be a regular file: " + filename, 0); if (filename.substr(filename.size()-4) == ".bcf") LOG.error("Filename ends in '.bcf'. Shouldn't you be using --bcf?\n"); if (!compressed) { if (filename.substr(filename.size()-3) == ".gz") LOG.error("Filename ends in '.gz'. 
Shouldn't you be using --gzvcf or --gzdiff?\n"); vcf_in.open(filename.c_str(), ios::in); if (!vcf_in.is_open()) LOG.error("Could not open VCF file: " + filename, 0); } else { gzMAX_LINE_LEN = 1024*1024; gz_readbuffer = new char[gzMAX_LINE_LEN]; gzvcf_in = gzopen(filename.c_str(), "rb"); if (gzvcf_in == NULL) LOG.error("Could not open GZVCF file: " + filename, 0); #ifdef ZLIB_VERNUM string tmp(ZLIB_VERSION); LOG.printLOG("Using zlib version: " + tmp + "\n"); #if (ZLIB_VERNUM >= 0x1240) gzbuffer(gzvcf_in, gzMAX_LINE_LEN); // Included in zlib v1.2.4 and makes things MUCH faster #else LOG.printLOG("Versions of zlib >= 1.2.4 will be *much* faster when reading zipped VCF files.\n"); #endif #endif } } void vcf_file::close() { if (!compressed) vcf_in.close(); else { gzclose(gzvcf_in); delete [] gz_readbuffer; } } bool vcf_file::eof() { bool out; if (!compressed) out = vcf_in.eof(); else { out = gzeof(gzvcf_in); // Returns 1 when EOF has previously been detected reading the given input stream, otherwise zero. 
} return out; } streampos vcf_file::get_filepos() { if (!compressed) return vcf_in.tellg(); else { return gztell(gzvcf_in); // TODO: Type check } } void vcf_file::set_filepos(streampos &filepos) { if (!compressed) { vcf_in.clear(); vcf_in.seekg(filepos, ios::beg); } else { gzseek(gzvcf_in, filepos, SEEK_SET); } } void vcf_file::get_entry(unsigned int entry_num, vector &out) { set_filepos( entry_file_locations[entry_num] ); read_line(out); } entry* vcf_file::get_entry_object(unsigned int N_indv) { return new vcf_entry(N_indv); } void vcf_file::read_line(string &out) { if (!compressed) { getline(vcf_in, out); out.erase( out.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line } else { out = ""; bool again = true; while (again == true) { gzgets(gzvcf_in, gz_readbuffer, gzMAX_LINE_LEN); out.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } out.erase( out.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) } } void vcf_file::read_line(vector &out) { static string tmp; tmp.resize(0); if (!compressed) { getline(vcf_in, tmp); tmp.erase( tmp.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line } else { bool again = true; while (again == true) { gzgets(gzvcf_in, gz_readbuffer, gzMAX_LINE_LEN); tmp.append(gz_readbuffer); if (strlen(gz_readbuffer) != gzMAX_LINE_LEN-1) again = false; } tmp.erase( tmp.find_last_not_of(" \t\n\r") + 1); // Trim whitespace at end of line (required in gzipped case!) 
} // out.assign(tmp.begin(),tmp.end()); vector tmp_char(tmp.begin(),tmp.end()); out = tmp_char; } char vcf_file::peek() { if (!compressed) return vcf_in.peek(); else { char c = gzgetc(gzvcf_in); gzungetc(c, gzvcf_in); return c; } } int vcf_file::read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS) { if (!compressed) { getline(vcf_in, CHROM, '\t'); vcf_in >> POS; vcf_in.ignore(std::numeric_limits::max(), '\n'); } else { static string line; static stringstream ss; read_line(line); ss.clear(); ss.str(line); getline(ss, CHROM, '\t'); ss >> POS; } return eof(); } void vcf_file::read_CHROM_only(string &CHROM) { // Just read in the chromosome. Note: leaves the stream in a funny state, but is faster than reading whole line if (!compressed) { getline(vcf_in, CHROM, '\t'); } else { CHROM = ""; char c = gzgetc(gzvcf_in); while (c != '\t') { CHROM += c; c = gzgetc(gzvcf_in); } } } void vcf_file::read_CHROM_and_POS_only(string &CHROM, int &POS) { // Just read in the chromosome and position. 
Note: leaves the stream in a funny state, but is faster than reading whole line if (!compressed) { getline(vcf_in, CHROM, '\t'); vcf_in >> POS; } else { CHROM = ""; char c = gzgetc(gzvcf_in); while (c != '\t') { CHROM += c; c = gzgetc(gzvcf_in); } string tmp; c = gzgetc(gzvcf_in); while (c != '\t') { tmp += c; c = gzgetc(gzvcf_in); } POS = atoi(tmp.c_str()); } } vcftools_0.1.11/cpp/vcf_entry.cpp0000644000000000000000000004767412156354766015537 0ustar rootroot/* * vcf_entry.cpp * * Created on: Aug 19, 2009 * Author: Adam Auton * ($Revision: 230 $) */ #include "vcf_entry.h" map vcf_entry::INFO_map; map vcf_entry::INFO_reverse_map; map vcf_entry::FILTER_map; map vcf_entry::FILTER_reverse_map; map vcf_entry::FORMAT_map; map vcf_entry::FORMAT_reverse_map; map vcf_entry::CONTIG_map; string vcf_entry::convert_line; vcf_entry::vcf_entry(const unsigned int n_indv, const vector &line) { N_indv = n_indv; basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; passed_filters = false; parsed_FORMAT_binary = false; N_INFO_removed = 0; N_FORMAT_removed = 0; parsed_GT = vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; FORMAT_positions.resize(n_indv); FORMAT_types.resize(n_indv); FORMAT_sizes.resize(n_indv); FORMAT_skip.resize(n_indv); FORMAT_keys.resize(n_indv); convert_line.clear(); convert_line.assign(line.begin(), line.end()); data_stream.str(convert_line); } // Create an empty VCF entry vcf_entry::vcf_entry(const unsigned int n_indv) { N_indv = n_indv; basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; CHROM = ""; POS = -1; REF = ""; QUAL = -1; passed_filters = false; parsed_FORMAT_binary = false; N_INFO_removed = 0; N_FORMAT_removed = 0; parsed_GT = 
vector(N_indv, false); parsed_GQ = vector(N_indv, false); parsed_DP = vector(N_indv, false); parsed_FT = vector(N_indv, false); GT_idx = -1; GQ_idx = -1; DP_idx = -1; FT_idx = -1; FORMAT_positions.resize(n_indv); FORMAT_types.resize(n_indv); FORMAT_sizes.resize(n_indv); FORMAT_skip.resize(n_indv); FORMAT_keys.resize(n_indv); convert_line.clear(); data_stream.str(""); } vcf_entry::~vcf_entry() {} // Reset the VCF entry object with a new data line void vcf_entry::reset(const vector &data_line) { basic_parsed = false; fully_parsed = false; parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; parsed_FORMAT = false; parsed_FORMAT_binary = false; data_stream.clear(); convert_line.clear(); convert_line.assign(data_line.begin(), data_line.end()); data_stream.str(convert_line); fill(parsed_GT.begin(), parsed_GT.end(), false); fill(parsed_GQ.begin(), parsed_GQ.end(), false); fill(parsed_DP.begin(), parsed_DP.end(), false); fill(parsed_FT.begin(), parsed_FT.end(), false); N_INFO_removed = 0; N_FORMAT_removed = 0; FORMAT_positions.clear(); FORMAT_types.clear(); FORMAT_sizes.clear(); FORMAT_skip.clear(); FORMAT_keys.clear(); } // Tokenize the basic information in a VCF data line (at the tab level) void vcf_entry::parse_basic_entry(bool parse_ALT, bool parse_FILTER, bool parse_INFO) { // The following would break on spaces too, which caused a bug :-( //data_stream >> CHROM >> POS >> ID >> REF >> ALT_str >> QUAL_str >> FILTER_str >> INFO_str; getline(data_stream, CHROM, '\t'); getline(data_stream, ID, '\t'); POS = atoi(ID.c_str()); getline(data_stream, ID, '\t'); getline(data_stream, REF, '\t'); getline(data_stream, ALT_str, '\t'); getline(data_stream, QUAL_str, '\t'); getline(data_stream, FILTER_str, '\t'); getline(data_stream, INFO_str, '\t'); QUAL = str2double(QUAL_str); // Convert to uppercase for consistency // Note that VCF v4.1 allows mixtures of lower/upper case in REF and ALT. 
// However, the spec specifically states that tools using VCF are not required // to preserve the case. std::transform(REF.begin(), REF.end(), REF.begin(), ::toupper); std::transform(ALT_str.begin(), ALT_str.end(),ALT_str.begin(), ::toupper); parsed_ALT = false; parsed_FILTER = false; parsed_INFO = false; basic_parsed = true; if (parse_ALT) set_ALT(ALT_str); if (parse_FILTER) set_FILTER(FILTER_str); if (parse_INFO) set_INFO(INFO_str); } // Tokenize the genotype information (at the 'tab' level) in the VCF entry void vcf_entry::parse_full_entry(bool parse_FORMAT) { if (basic_parsed == false) parse_basic_entry(); //data_stream >> FORMAT_str; getline(data_stream, FORMAT_str, '\t'); if (parse_FORMAT) set_FORMAT(FORMAT_str); string tmpstr; tmpstr.reserve(64); GENOTYPE_str.resize(N_indv, tmpstr); for (unsigned int ui=0; ui> GENOTYPE_str[ui]; getline(data_stream, GENOTYPE_str[ui], '\t'); // The following line copies the GENOTYPE fields from the stringstream into the GENOTYPE_str vector. // Is actually slower than the above code. 
//copy(istream_iterator(data_stream), istream_iterator(), GENOTYPE_str.begin()); fully_parsed = true; } // Tokenize a given genotype entry into it's component parts void vcf_entry::parse_genotype_entry(unsigned int indv, bool GT, bool GQ, bool DP, bool FT) { if (fully_parsed == false) parse_full_entry(true); if (parsed_FORMAT == false) set_FORMAT(FORMAT_str); static string tmpstr; static istringstream ss; ss.clear(); ss.str(GENOTYPE_str[indv]); int N_required = GT + GQ + DP + FT; int N_got = 0; int i=0; while (getline(ss, tmpstr, ':')) { if (GT && (i == GT_idx)) // (FORMAT[ui] == "GT") { set_indv_GENOTYPE_and_PHASE(indv, tmpstr); N_got++; } else if (GQ && (i == GQ_idx)) // (FORMAT[ui] == "GQ") { set_indv_GQUALITY(indv, str2double(tmpstr)); N_got++; } else if (DP && (i == DP_idx)) // (FORMAT[ui] == "DP") { set_indv_DEPTH(indv, str2int(tmpstr)); N_got++; } else if (FT && (i == FT_idx)) // (FORMAT[ui] == "FT") { set_indv_GFILTER(indv, tmpstr); N_got++; } if (N_got == N_required) break; i++; } // Set missing return values if requested a value, but couldn't find it if (GT && (parsed_GT[indv] == false)) { set_indv_GENOTYPE_and_PHASE(indv, make_pair(-1,-1), '/'); } if (GQ && (parsed_GQ[indv] == false)) { set_indv_GQUALITY(indv, -1); } if (DP && (parsed_DP[indv] == false)) { set_indv_DEPTH(indv, -1); } if (FT && (parsed_FT[indv] == false)) { set_indv_GFILTER(indv, ""); } } // Read the VCF entry and fully populate the object void vcf_entry::parse_genotype_entries(bool GT, bool GQ, bool DP, bool FT) { for (unsigned int ui=0; ui tmp_vector; vector tmp_split; vector< vector > format_matrix(N_indv); unsigned int type, number, size, position=0; for (unsigned int ui=0; ui include_indv(N_indv, true); vector include_genotype(N_indv, true); set INFO_to_keep; print(out, INFO_to_keep, false, include_indv, include_genotype); } void vcf_entry::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO) { vector include_indv(N_indv, true); vector include_genotype(N_indv, true); 
print(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype); } // Output VCF entry to output stream void vcf_entry::print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) { if (fully_parsed == false) parse_full_entry(); out << get_CHROM() << '\t' << POS << '\t' << get_ID() << '\t' << REF << '\t' << get_ALT(); out << '\t' << double2str(QUAL); out << '\t' << get_FILTER(); if (keep_all_INFO == false) out << '\t' << get_INFO(INFO_to_keep); else out << '\t' << INFO_str; pair genotype; string GFILTER_tmp; if (FORMAT.size() > 0) { char PHASE; out << '\t' << get_FORMAT(); for (unsigned int ui=0; ui include_indv(N_indv, true); vector include_genotype(N_indv, true); set INFO_to_keep; print_bcf(out, INFO_to_keep, false, include_indv, include_genotype); } void vcf_entry::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) { vector include_indv(N_indv, true); vector include_genotype(N_indv, true); print_bcf(out, INFO_to_keep, keep_all_INFO, include_indv, include_genotype); } // Output VCF entry to output stream in binary void vcf_entry::print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) { if (fully_parsed == false) parse_full_entry(); if (parsed_FORMAT_binary == false) parse_FORMAT(); vector out_vector, tmp_vector; out_vector.resize(8*sizeof(int32_t)); int vector_pos = 2*sizeof(uint32_t); string tmp_string; int index; vector filter_vector; vector > tmp_info; tmp_string = get_CHROM(); if (tmp_string == "." 
or tmp_string == " " or tmp_string == "") LOG.error("CHROM value must be defined for all entries.",0); if (CONTIG_map.find(tmp_string) == CONTIG_map.end() ) LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.",0); int32_t chrom = (int32_t)CONTIG_map[tmp_string]; memcpy(&out_vector[vector_pos], &chrom, sizeof(chrom)); vector_pos += sizeof(chrom); get_POS_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_rlen(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_QUAL_binary(tmp_vector); memcpy(&out_vector[vector_pos], &tmp_vector[0], tmp_vector.size()); vector_pos += tmp_vector.size(); tmp_vector.resize(0); get_ID_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_ALLELES_binary(tmp_vector); out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end()); tmp_vector.resize(0); get_FILTER_vector(filter_vector); if (passed_filters == true) make_typed_int(tmp_vector, 0, true); else if (filter_vector.empty()) make_typed_int_vector(tmp_vector, filter_vector); else { vector index_vector; for(unsigned int ui=0; ui &include_genotype_out, int min_depth, int max_depth) { if (fully_parsed == false) parse_full_entry(); //if (FORMAT_to_idx.find("DP") != FORMAT_to_idx.end()) if (DP_idx != -1) { // Have depth info int depth; include_genotype_out.resize(N_indv, true); for (unsigned int ui=0; ui max_depth)) include_genotype_out[ui] = false; } } } // Filter specific genotypes by quality void vcf_entry::filter_genotypes_by_quality(vector &include_genotype_out, double min_genotype_quality) { if (fully_parsed == false) parse_full_entry(); //if (FORMAT_to_idx.find("GQ") != FORMAT_to_idx.end()) if (GQ_idx != -1) { // Have quality info double quality; include_genotype_out.resize(N_indv, true); for 
(unsigned int ui=0; ui &include_genotype_out, const set &filter_flags_to_remove, bool remove_all) { if (fully_parsed == false) parse_full_entry(); vector GFILTERs; if (FT_idx != -1) { // Have GFilter info include_genotype_out.resize(N_indv, true); for (unsigned int ui=0; ui::epsilon(); double FPMIN = numeric_limits::min() / numeric_limits::epsilon(); gln=gammln(a); b=x+1.0-a; c=1.0/FPMIN; d=1.0/b; h=d; for (i=1;;i++) { an = -i*(i-a); b += 2.0; d=an*d+b; if (fabs(d) < FPMIN) d=FPMIN; c=b+an/c; if (fabs(c) < FPMIN) c=FPMIN; d=1.0/d; del=d*c; h *= del; if (fabs(del-1.0) <= EPS) break; } return exp(-x+a*log(x)-gln)*h; } double gser(double a, double x, double &gln) { double sum,del,ap; gln=gammln(a); ap=a; del=sum=1.0/a; for (;;) { ++ap; del *= x/ap; sum += del; if (fabs(del) < fabs(sum)*numeric_limits::epsilon()) { return sum*exp(-x+a*log(x)-gln); } } return 0; } double gammp(double a, double x) { double gamser,gammcf,gln; if (x < 0.0 || a <= 0.0 || (x != x) || (a != a)) { return numeric_limits::quiet_NaN(); } if (x==0.0) return 0.0; if (x < (a+1.0)) { gamser=gser(a,x,gln); return gamser; } else { gammcf = gcf(a,x,gln); return 1.0-gammcf; } } double gammq(double a, double x) { double gamser,gammcf,gln; if (x < 0.0 || a <= 0.0 || (x != x) || (a != a)) { return numeric_limits::quiet_NaN(); } if (x == 0.0) return 1.0; if (x < (a+1.0)) { gamser=gser(a,x,gln); return 1.0-gamser; } else { gammcf = gcf(a,x,gln); return gammcf; } } vcftools_0.1.11/cpp/khash.h0000644000000000000000000004454012156354766014270 0ustar rootroot/* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do 
so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - http://code.google.com/p/ulib/ - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better performance for random input. Double hashing is still the default as it is more robust to certain non-random input. * Added Wang's integer hash function (not used by default). This hash function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 
2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. */ #define AC_VERSION_KHASH_H "0.2.6" #include #include #include /* compipler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER #define inline __inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else #define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) #endif #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ extern kh_##name##_t *kh_init_##name(); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ __KHASH_TYPE(name, khkey_t, khval_t) \ SCOPE kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ free(h->vals); \ free(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ } else return 0; \ } \ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ kroundup32(new_n_buckets); \ if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ inc = __ac_inc(k, new_mask); \ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash 
table */ \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, 
__hash_equal) \ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! @function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) static inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. 
@param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) /*! @function @abstract Remove a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Iterator to the element to be deleted [khint_t] */ #define kh_del(name, h, k) kh_del_##name(h, k) /*! @function @abstract Test whether a bucket contains data. @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return 1 if containing data; 0 otherwise [int] */ #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) /*! 
@function @abstract Get key given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Key [type of keys] */ #define kh_key(h, x) ((h)->keys[x]) /*! @function @abstract Get value given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Value [type of values] @discussion For hash sets, calling this results in segfault. */ #define kh_val(h, x) ((h)->vals[x]) /*! @function @abstract Alias of kh_val() */ #define kh_value(h, x) ((h)->vals[x]) /*! @function @abstract Get the start iterator @param h Pointer to the hash table [khash_t(name)*] @return The start iterator [khint_t] */ #define kh_begin(h) (khint_t)(0) /*! @function @abstract Get the end iterator @param h Pointer to the hash table [khash_t(name)*] @return The end iterator [khint_t] */ #define kh_end(h) ((h)->n_buckets) /*! @function @abstract Get the number of elements in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of elements in the hash table [khint_t] */ #define kh_size(h) ((h)->size) /*! @function @abstract Get the number of buckets in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of buckets in the hash table [khint_t] */ #define kh_n_buckets(h) ((h)->n_buckets) /* More conenient interfaces */ /*! @function @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_STR(name) \ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */ vcftools_0.1.11/cpp/bgzf.h0000644000000000000000000001437312156354766014123 0ustar rootroot/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The BGZF library was originally written by Bob Handsaker from the Broad * Institute. It was later improved by the SAMtools developers. */ #ifndef __BGZF_H #define __BGZF_H #include #include #include #include #define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE #define BGZF_MAX_BLOCK_SIZE 0x10000 #define BGZF_ERR_ZLIB 1 #define BGZF_ERR_HEADER 2 #define BGZF_ERR_IO 4 #define BGZF_ERR_MISUSE 8 typedef struct { int errcode:16, is_write:2, is_be:2, compress_level:12; int cache_size; int block_length, block_offset; int64_t block_address; void *uncompressed_block, *compressed_block; void *cache; // a pointer to a hash table void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading #ifdef BGZF_MT void *mt; // only used for multi-threading #endif } BGZF; #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif #ifdef __cplusplus extern "C" { #endif /****************** * Basic routines * ******************/ /** * Open an existing file descriptor for reading or writing. * * @param fd file descriptor * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. 
* @return BGZF file handler; 0 on error */ BGZF* bgzf_dopen(int fd, const char *mode); #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility /** * Open the specified file for reading or writing. */ BGZF* bgzf_open(const char* path, const char *mode); /** * Close the BGZF and free all associated resources. * * @param fp BGZF file handler * @return 0 on success and -1 on error */ int bgzf_close(BGZF *fp); /** * Read up to _length_ bytes from the file storing into _data_. * * @param fp BGZF file handler * @param data data array to read into * @param length size of data to read * @return number of bytes actually read; 0 on end-of-file and -1 on error */ ssize_t bgzf_read(BGZF *fp, void *data, size_t length); /** * Write _length_ bytes from _data_ to the file. * * @param fp BGZF file handler * @param data data array to write * @param length size of data to write * @return number of bytes actually written; -1 on error */ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length); /** * Write the data in the buffer to the file. */ int bgzf_flush(BGZF *fp); /** * Return a virtual file pointer to the current location in the file. * No interpetation of the value should be made, other than a subsequent * call to bgzf_seek can be used to position the file at the same point. * Return value is non-negative on success. */ #define bgzf_tell(fp) ((((BGZF*)fp)->block_address << 16) | (((BGZF*)fp)->block_offset & 0xFFFF)) /** * Set the file to read from the location specified by _pos_. 
* * @param fp BGZF file handler * @param pos virtual file offset returned by bgzf_tell() * @param whence must be SEEK_SET * @return 0 on success and -1 on error */ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); /** * Check if the BGZF end-of-file (EOF) marker is present * * @param fp BGZF file handler opened for reading * @return 1 if EOF is present; 0 if not or on I/O error */ int bgzf_check_EOF(BGZF *fp); /** * Check if a file is in the BGZF format * * @param fn file name * @return 1 if _fn_ is BGZF; 0 if not or on I/O error */ int bgzf_is_bgzf(const char *fn); /********************* * Advanced routines * *********************/ /** * Set the cache size. Only effective when compiled with -DBGZF_CACHE. * * @param fp BGZF file handler * @param size size of cache in bytes; 0 to disable caching (default) */ void bgzf_set_cache_size(BGZF *fp, int size); /** * Flush the file if the remaining buffer size is smaller than _size_ */ int bgzf_flush_try(BGZF *fp, ssize_t size); /** * Read one byte from a BGZF file. It is faster than bgzf_read() * @param fp BGZF file handler * @return byte read; -1 on end-of-file or error */ int bgzf_getc(BGZF *fp); /** * Read one line from a BGZF file. It is faster than bgzf_getc() * * @param fp BGZF file handler * @param delim delimitor * @param str string to write to; must be initialized * @return length of the string; 0 on end-of-file; negative on error */ int bgzf_getline(BGZF *fp, int delim, kstring_t *str); /** * Read the next BGZF block. 
*/ int bgzf_read_block(BGZF *fp); #ifdef BGZF_MT /** * Enable multi-threading (only effective on writing) * * @param fp BGZF file handler; must be opened for writing * @param n_threads #threads used for writing * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended */ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); #endif #ifdef __cplusplus } #endif #endif vcftools_0.1.11/cpp/bgzf.c0000644000000000000000000005234312156354766014115 0ustar rootroot/* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include #include #include #include #include "bgzf.h" #ifdef _USE_KNETFILE #include "knetfile.h" typedef knetFile *_bgzf_file_t; #define _bgzf_open(fn, mode) knet_open((fn), (mode)) #define _bgzf_dopen(fd, mode) knet_dopen((fd), (mode)) #define _bgzf_close(fp) knet_close((knetFile*)(fp)) #define _bgzf_fileno(fp) (((knetFile*)(fp))->fd) #define _bgzf_tell(fp) knet_tell((knetFile*)(fp)) #define _bgzf_seek(fp, offset, whence) knet_seek((knetFile*)(fp), (offset), (whence)) #define _bgzf_read(fp, buf, len) knet_read((knetFile*)(fp), (buf), (len)) #define _bgzf_write(fp, buf, len) knet_write((knetFile*)(fp), (buf), (len)) #else // ~defined(_USE_KNETFILE) #if defined(_WIN32) || defined(_MSC_VER) #define ftello(fp) ftell((FILE*)(fp)) #define fseeko(fp, offset, whence) fseek((FILE*)(fp), (offset), (whence)) #else // ~defined(_WIN32) extern off_t ftello(FILE *stream); extern int fseeko(FILE *stream, off_t offset, int whence); #endif // ~defined(_WIN32) typedef FILE *_bgzf_file_t; #define _bgzf_open(fn, mode) fopen((fn), (mode)) #define _bgzf_dopen(fd, mode) fdopen(fd, (mode)) #define _bgzf_close(fp) fclose((FILE*)(fp)) #define _bgzf_fileno(fp) fileno((FILE*)(fp)) #define _bgzf_tell(fp) ftello((FILE*)(fp)) #define _bgzf_seek(fp, offset, whence) fseeko((FILE*)(fp), (offset), (whence)) #define _bgzf_read(fp, buf, len) fread((buf), 1, (len), (FILE*)(fp)) #define _bgzf_write(fp, buf, len) fwrite((buf), 1, (len), (FILE*)(fp)) #endif // ~define(_USE_KNETFILE) #define BLOCK_HEADER_LENGTH 18 #define BLOCK_FOOTER_LENGTH 8 /* BGZF/GZIP header (speciallized from RFC 1952; little endian): +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; #ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } 
cache_t; /* closes the typedef struct begun on the previous line: one cached, uncompressed BGZF block */
#include "khash.h"
/* Hash map keyed by 64-bit block file offset -> cache_t; backs the optional block cache. */
KHASH_MAP_INIT_INT64(cache, cache_t)
#endif

/* Return 1 on a big-endian host, 0 on little-endian.
 * BGZF stores its on-disk integers little-endian, so writers must know host endianness. */
static inline int ed_is_big()
{
	long one= 1;
	return !(*((char *)(&one)));
}

/* Store a 16-bit value into buffer in little-endian byte order. */
static inline void packInt16(uint8_t *buffer, uint16_t value)
{
	buffer[0] = value;
	buffer[1] = value >> 8;
}

/* Read a little-endian 16-bit value from buffer. */
static inline int unpackInt16(const uint8_t *buffer)
{
	return buffer[0] | buffer[1] << 8;
}

/* Store a 32-bit value into buffer in little-endian byte order. */
static inline void packInt32(uint8_t *buffer, uint32_t value)
{
	buffer[0] = value;
	buffer[1] = value >> 8;
	buffer[2] = value >> 16;
	buffer[3] = value >> 24;
}

/* Allocate and zero-initialise a BGZF handle for reading.
 * Both work buffers are sized BGZF_MAX_BLOCK_SIZE (one maximal block);
 * the block cache hash is created only when compiled with BGZF_CACHE. */
static BGZF *bgzf_read_init()
{
	BGZF *fp;
	fp = (BGZF*)calloc(1, sizeof(BGZF));
	fp->is_write = 0;
	fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
	fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
#ifdef BGZF_CACHE
	fp->cache = kh_init(cache);
#endif
	return fp;
}

/* Allocate and zero-initialise a BGZF handle for writing.
 * NOTE: the compress_level assignment below is completed on the next source line
 * (…? Z_DEFAULT_COMPRESSION : compress_level). */
static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
{
	BGZF *fp;
	fp = (BGZF*)calloc(1, sizeof(BGZF));
	fp->is_write = 1;
	fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
	fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
	fp->compress_level = compress_level < 0?
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; return fp; } // get the compress level from the mode string static int mode2level(const char *__restrict mode) { int i, compress_level = -1; for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; if (mode[i]) compress_level = (int)mode[i] - '0'; if (strchr(mode, 'u')) compress_level = 0; return compress_level; } BGZF *bgzf_open(const char *path, const char *mode) { BGZF *fp = 0; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_open(path, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fopen(path, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } fp->is_be = ed_is_big(); return fp; } BGZF *bgzf_dopen(int fd, const char *mode) { BGZF *fp = 0; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r') || strchr(mode, 'R')) { _bgzf_file_t fpr; if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; fp = bgzf_read_init(); fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'W')) { FILE *fpw; if ((fpw = fdopen(fd, "w")) == 0) return 0; fp = bgzf_write_init(mode2level(mode)); fp->fp = fpw; } fp->is_be = ed_is_big(); return fp; } static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) { uint32_t crc; z_stream zs; uint8_t *dst = (uint8_t*)_dst; // compress the body zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = (Bytef*)src; zs.avail_in = slen; zs.next_out = dst + BLOCK_HEADER_LENGTH; zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1; if (deflateEnd(&zs) != Z_OK) return -1; 
	*dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
	// write the header
	memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
	packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
	// write the footer
	crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
	packInt32((uint8_t*)&dst[*dlen - 8], crc);
	packInt32((uint8_t*)&dst[*dlen - 4], slen);
	return 0;
}

// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
static int deflate_block(BGZF *fp, int block_length)
{
	int comp_size = BGZF_MAX_BLOCK_SIZE;
	/* On zlib failure record the error on the handle and report -1 to the caller. */
	if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) {
		fp->errcode |= BGZF_ERR_ZLIB;
		return -1;
	}
	fp->block_offset = 0; /* uncompressed buffer consumed */
	return comp_size;     /* total on-disk size of the finished block */
}

// Inflate the block in fp->compressed_block into fp->uncompressed_block
static int inflate_block(BGZF* fp, int block_length)
{
	z_stream zs;
	zs.zalloc = NULL;
	zs.zfree = NULL;
	zs.next_in = (Bytef*)fp->compressed_block + 18; /* skip the 18-byte BGZF/gzip header */
	zs.avail_in = block_length - 16;
	zs.next_out = (Bytef*)fp->uncompressed_block;
	zs.avail_out = BGZF_MAX_BLOCK_SIZE;
	/* windowBits = -15: raw deflate stream, no zlib header/trailer. */
	if (inflateInit2(&zs, -15) != Z_OK) {
		fp->errcode |= BGZF_ERR_ZLIB;
		return -1;
	}
	if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
		inflateEnd(&zs);
		fp->errcode |= BGZF_ERR_ZLIB;
		return -1;
	}
	if (inflateEnd(&zs) != Z_OK) {
		fp->errcode |= BGZF_ERR_ZLIB;
		return -1;
	}
	return zs.total_out; /* number of uncompressed bytes produced */
}

/* Return non-zero iff the 18 header bytes look like a valid BGZF block:
 * gzip magic (31,139), deflate method (8), FEXTRA flag set, a 6-byte extra
 * field containing the "BC" subfield whose payload length is 2. */
static int check_header(const uint8_t *header)
{
	return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
			&& unpackInt16((uint8_t*)&header[10]) == 6
			&& header[12] == 'B' && header[13] == 'C'
			&& unpackInt16((uint8_t*)&header[14]) == 2);
}

#ifdef BGZF_CACHE
/* Free every cached uncompressed block and destroy the cache hash.
 * No-op for write handles (the cache is read-side only).
 * NOTE: the free(kh_val(h, k).block) call is completed on the next source line. */
static void free_cache(BGZF *fp)
{
	khint_t k;
	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
	if (fp->is_write) return;
	for (k = kh_begin(h); k < kh_end(h); ++k)
		if (kh_exist(h, k)) free(kh_val(h,
k).block); kh_destroy(cache, h); } static int load_block_from_cache(BGZF *fp, int64_t block_address) { khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; k = kh_get(cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); return p->size; } static void cache_block(BGZF *fp, int size) { int ret; khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { /* A better way would be to remove the oldest block in the * cache, but here we remove a random one for simplicity. This * should not have a big impact on performance. */ for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) break; if (k < kh_end(h)) { free(kh_val(h, k).block); kh_del(cache, h, k); } } k = kh_put(cache, h, fp->block_address, &ret); if (ret == 0) return; // if this happens, a bug! 
p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); } #else static void free_cache(BGZF *fp) {} static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} static void cache_block(BGZF *fp, int size) {} #endif int bgzf_read_block(BGZF *fp) { uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; int count, size = 0, block_length, remaining; int64_t block_address; block_address = _bgzf_tell((_bgzf_file_t)fp->fp); if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; count = _bgzf_read(fp->fp, header, sizeof(header)); if (count == 0) { // no data read fp->block_length = 0; return 0; } if (count != sizeof(header) || !check_header(header)) { fp->errcode |= BGZF_ERR_HEADER; return -1; } size = count; block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" compressed_block = (uint8_t*)fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); remaining = block_length - BLOCK_HEADER_LENGTH; count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); if (count != remaining) { fp->errcode |= BGZF_ERR_IO; return -1; } size += count; if ((count = inflate_block(fp, block_length)) < 0) return -1; if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
fp->block_address = block_address; fp->block_length = count; cache_block(fp, size); return 0; } ssize_t bgzf_read(BGZF *fp, void *data, size_t length) { ssize_t bytes_read = 0; uint8_t *output = (uint8_t*)data; if (length <= 0) return 0; assert(fp->is_write == 0); while (bytes_read < length) { int copy_length, available = fp->block_length - fp->block_offset; uint8_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) return -1; available = fp->block_length - fp->block_offset; if (available <= 0) break; } copy_length = length - bytes_read < available? length - bytes_read : available; buffer = (uint8_t*)fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; bytes_read += copy_length; } if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = fp->block_length = 0; } return bytes_read; } #ifdef BGZF_MT typedef struct { BGZF *fp; struct mtaux_t *mt; void *buf; int i, errcode, toproc; } worker_t; typedef struct mtaux_t { int n_threads, n_blks, curr, done; volatile int proc_cnt; void **blk; int *len; worker_t *w; pthread_t *tid; pthread_mutex_t lock; pthread_cond_t cv; } mtaux_t; static int worker_aux(worker_t *w) { int i, stop = 0; // wait for condition: to process or all done pthread_mutex_lock(&w->mt->lock); while (!w->toproc && !w->mt->done) pthread_cond_wait(&w->mt->cv, &w->mt->lock); if (w->mt->done) stop = 1; w->toproc = 0; pthread_mutex_unlock(&w->mt->lock); if (stop) return 1; // to quit the thread w->errcode = 0; for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { int clen = BGZF_MAX_BLOCK_SIZE; if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) w->errcode |= BGZF_ERR_ZLIB; memcpy(w->mt->blk[i], w->buf, clen); w->mt->len[i] = clen; } __sync_fetch_and_add(&w->mt->proc_cnt, 1); return 0; } static void *mt_worker(void *data) { while (worker_aux((worker_t*)data) == 0); 
return 0; } int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) { int i; mtaux_t *mt; pthread_attr_t attr; if (!fp->is_write || fp->mt || n_threads <= 1) return -1; mt = (mtaux_t*)calloc(1, sizeof(mtaux_t)); mt->n_threads = n_threads; mt->n_blks = n_threads * n_sub_blks; mt->len = (int*)calloc(mt->n_blks, sizeof(int)); mt->blk = (void**)calloc(mt->n_blks, sizeof(void*)); for (i = 0; i < mt->n_blks; ++i) mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t)); for (i = 0; i < mt->n_threads; ++i) { mt->w[i].i = i; mt->w[i].mt = mt; mt->w[i].fp = fp; mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); } pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_mutex_init(&mt->lock, 0); pthread_cond_init(&mt->cv, 0); for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); fp->mt = mt; return 0; } static void mt_destroy(mtaux_t *mt) { int i; // signal all workers to quit pthread_mutex_lock(&mt->lock); mt->done = 1; mt->proc_cnt = 0; pthread_cond_broadcast(&mt->cv); pthread_mutex_unlock(&mt->lock); for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread // free other data allocated on heap for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]); for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); pthread_cond_destroy(&mt->cv); pthread_mutex_destroy(&mt->lock); free(mt); } static void mt_queue(BGZF *fp) { mtaux_t *mt = (mtaux_t*)fp->mt; assert(mt->curr < mt->n_blks); // guaranteed by the caller memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); mt->len[mt->curr] = fp->block_offset; fp->block_offset = 0; ++mt->curr; } static int mt_flush(BGZF 
*fp) { int i; mtaux_t *mt = (mtaux_t*)fp->mt; if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail // signal all the workers to compress pthread_mutex_lock(&mt->lock); for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; mt->proc_cnt = 0; pthread_cond_broadcast(&mt->cv); pthread_mutex_unlock(&mt->lock); // worker 0 is doing things here worker_aux(&mt->w[0]); // wait for all the threads to complete while (mt->proc_cnt < mt->n_threads); // dump data to disk for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; for (i = 0; i < mt->curr; ++i) if (fwrite(mt->blk[i], 1, mt->len[i], (FILE*)fp->fp) != (size_t)mt->len[i]) fp->errcode |= BGZF_ERR_IO; mt->curr = 0; return 0; } static int mt_lazy_flush(BGZF *fp) { mtaux_t *mt = (mtaux_t*)fp->mt; if (fp->block_offset) mt_queue(fp); if (mt->curr == mt->n_blks) return mt_flush(fp); return -1; } static ssize_t mt_write(BGZF *fp, const void *data, size_t length) { const uint8_t *input = (const uint8_t*)data; ssize_t rest = length; while (rest) { int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? 
BGZF_BLOCK_SIZE - fp->block_offset : rest; memcpy((uint8_t*)fp->uncompressed_block + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; rest -= copy_length; if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp); } return length - rest; } #endif // ~ #ifdef BGZF_MT int bgzf_flush(BGZF *fp) { if (!fp->is_write) return 0; #ifdef BGZF_MT if (fp->mt) return mt_flush(fp); #endif while (fp->block_offset > 0) { int block_length; block_length = deflate_block(fp, fp->block_offset); if (block_length < 0) return -1; if (fwrite(fp->compressed_block, 1, block_length, (FILE*)fp->fp) != (size_t)block_length) { fp->errcode |= BGZF_ERR_IO; // possibly truncated file return -1; } fp->block_address += block_length; } return 0; } int bgzf_flush_try(BGZF *fp, ssize_t size) { if (fp->block_offset + size > BGZF_BLOCK_SIZE) { #ifdef BGZF_MT if (fp->mt) return mt_lazy_flush(fp); else return bgzf_flush(fp); #else return bgzf_flush(fp); #endif } return -1; } ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) { const uint8_t *input = (const uint8_t*)data; int block_length = BGZF_BLOCK_SIZE, bytes_written = 0; assert(fp->is_write); #ifdef BGZF_MT if (fp->mt) return mt_write(fp, data, length); #endif while (bytes_written < length) { uint8_t* buffer = (uint8_t*)fp->uncompressed_block; int copy_length = block_length - fp->block_offset < length - bytes_written? 
block_length - fp->block_offset : length - bytes_written; memcpy(buffer + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; bytes_written += copy_length; if (fp->block_offset == block_length && bgzf_flush(fp)) break; } return bytes_written; } int bgzf_close(BGZF* fp) { int ret, block_length; if (fp == 0) return -1; if (fp->is_write) { if (bgzf_flush(fp) != 0) return -1; fp->compress_level = -1; block_length = deflate_block(fp, 0); // write an empty block fwrite(fp->compressed_block, 1, block_length, (FILE*)fp->fp); if (fflush((FILE*)fp->fp) != 0) { fp->errcode |= BGZF_ERR_IO; return -1; } #ifdef BGZF_MT if (fp->mt) mt_destroy((mtaux_t*)fp->mt); #endif } ret = fp->is_write? fclose((FILE*)fp->fp) : _bgzf_close(fp->fp); if (ret != 0) return -1; free(fp->uncompressed_block); free(fp->compressed_block); free_cache(fp); free(fp); return 0; } void bgzf_set_cache_size(BGZF *fp, int cache_size) { if (fp) fp->cache_size = cache_size; } int bgzf_check_EOF(BGZF *fp) { uint8_t buf[28]; off_t offset; offset = _bgzf_tell((_bgzf_file_t)fp->fp); if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; _bgzf_read(fp->fp, buf, 28); _bgzf_seek(fp->fp, offset, SEEK_SET); return (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", buf, 28) == 0)? 
1 : 0; } int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { int block_offset; int64_t block_address; if (fp->is_write || where != SEEK_SET) { fp->errcode |= BGZF_ERR_MISUSE; return -1; } block_offset = pos & 0xFFFF; block_address = pos >> 16; if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { fp->errcode |= BGZF_ERR_IO; return -1; } fp->block_length = 0; // indicates current block has not been loaded fp->block_address = block_address; fp->block_offset = block_offset; return 0; } int bgzf_is_bgzf(const char *fn) { uint8_t buf[16]; int n; _bgzf_file_t fp; if ((fp = _bgzf_open(fn, "r")) == 0) return 0; n = _bgzf_read(fp, buf, 16); _bgzf_close(fp); if (n != 16) return 0; return memcmp(g_magic, buf, 16) == 0? 1 : 0; } int bgzf_getc(BGZF *fp) { int c; if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) return -2; /* error */ if (fp->block_length == 0) return -1; /* end-of-file */ } c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; if (fp->block_offset == fp->block_length) { fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } return c; } #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif int bgzf_getline(BGZF *fp, int delim, kstring_t *str) { int l, state = 0; unsigned char *buf = (unsigned char*)fp->uncompressed_block; str->l = 0; do { if (fp->block_offset >= fp->block_length) { if (bgzf_read_block(fp) != 0) { state = -2; break; } if (fp->block_length == 0) { state = -1; break; } } for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); if (l < fp->block_length) state = 1; l -= fp->block_offset; if (str->l + l + 1 >= str->m) { str->m = str->l + l + 2; kroundup32(str->m); str->s = (char*)realloc(str->s, str->m); } memcpy(str->s + str->l, buf + fp->block_offset, l); str->l += l; fp->block_offset += l + 1; if (fp->block_offset >= fp->block_length) { fp->block_address = 
_bgzf_tell((_bgzf_file_t)fp->fp); fp->block_offset = 0; fp->block_length = 0; } } while (state == 0); if (str->l == 0 && state < 0) return state; str->s[str->l] = 0; return str->l; } vcftools_0.1.11/cpp/variant_file.cpp0000644000000000000000000001226412156354766016166 0ustar rootroot/* * variant_file.cpp * * Created on: Dec 11, 2012 * Author: amarcketta */ #include "variant_file.h" variant_file::~variant_file() {} void variant_file::apply_filters(const parameters ¶ms) { LOG.printLOG("Applying Required Filters.\n"); // Apply all filters in turn. filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file); filter_sites_by_allele_type(params.keep_only_indels, params.remove_indels); filter_sites(params.snps_to_keep, params.snps_to_keep_file, params.snps_to_exclude_file); filter_sites_by_filter_status(params.site_filter_flags_to_exclude, params.site_filter_flags_to_keep, params.remove_all_filtered_sites); string chr_to_keep = ""; if (params.chrs_to_keep.size() == 1) chr_to_keep = *(params.chrs_to_keep.begin()); // Get first chromosome in list (there should only be one). 
filter_sites_by_position(chr_to_keep, params.start_pos, params.end_pos); filter_sites_by_positions(params.positions_file, params.exclude_positions_file); filter_sites_by_BED_file(params.BED_file, params.BED_exclude); filter_sites_by_number_of_alleles(params.min_alleles, params.max_alleles); filter_sites_by_INFO_flags(params.site_INFO_flags_to_remove, params.site_INFO_flags_to_keep); filter_sites_by_quality(params.min_quality); filter_sites_by_mean_depth(params.min_mean_depth, params.max_mean_depth); filter_sites_by_mask(params.mask_file, params.invert_mask, params.min_kept_mask_value); filter_individuals_by_mean_depth(params.min_indv_mean_depth, params.max_indv_mean_depth); if (params.phased_only == true) { filter_individuals_by_phase(); filter_sites_by_phase(); } filter_genotypes_by_quality(params.min_genotype_quality); filter_genotypes_by_depth(params.min_genotype_depth, params.max_genotype_depth); filter_genotypes_by_filter_flag(params.geno_filter_flags_to_exclude, params.remove_all_filtered_genotypes); filter_individuals_by_call_rate(params.min_indv_call_rate); filter_individuals_randomly(params.max_N_indv); filter_sites_by_frequency_and_call_rate(params.min_maf, params.max_maf, params.min_non_ref_af, params.max_non_ref_af, params.min_site_call_rate); filter_sites_by_allele_count(params.min_mac, params.max_mac, params.min_non_ref_ac, params.max_non_ref_ac, params.max_missing_call_count); filter_sites_by_HWE_pvalue(params.min_HWE_pvalue); filter_sites_by_thinning(params.min_interSNP_distance); } // Return the number of individuals that have not been filtered out int variant_file::N_kept_individuals() const { int N_kept = 0; for (unsigned int ui=0; ui &contig_vector) { contig_vector.resize(0); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); 
contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); contig_vector.push_back("##contig="); } vcftools_0.1.11/cpp/bcf_entry_setters.cpp0000644000000000000000000002536012156354766017250 0ustar rootroot/* * bcf_entry_setters.cpp * * Created on: Sep 20, 2012 * Author: Anthony Marcketta * ($Revision: 1 $) */ #include "bcf_entry.h" void bcf_entry::set_QUAL(const float &in) { QUAL = in; } void bcf_entry::set_ALT(const int n_allele) { ALT.resize(n_allele-1); unsigned int pos = ALT_pos; string allele; for (int ui=0; ui<(n_allele-1); ui++) { allele = get_typed_string( &pos, line ); std::transform(allele.begin(), allele.end(), allele.begin(), ::toupper); ALT[ui] = allele; } parsed_ALT = true; } void bcf_entry::set_ALT(const string &in) { istringstream ss(in); string tmpstr; ALT.resize(0); while(!ss.eof()) { getline(ss, tmpstr, ','); add_ALT_allele(tmpstr); } parsed_ALT = true; } void bcf_entry::set_INFO() { int key; unsigned int size, type, i = INFO_pos; string data_type; INFO.resize(N_info); bool miss = true; for (unsigned int ui=0; ui INFO_entry(INFO_map[key].ID, "."); data_type = INFO_map[key].Type_str; ostringstream ss(ostringstream::out); for (unsigned int uj=0; uj &in) { int8_t tmp, tmp2; char phased[2] = {'/', '|'}; ploidy.resize(N_indv); ploidy[indv] = 0; for (unsigned int ui=0; ui(in[ui]); if ( tmp == (int8_t)0x80 ) break; ploidy[indv]++; } if (ploidy[indv] == 0) { 
set_indv_GENOTYPE_alleles(indv, make_pair(-1, -1)); } else if (ploidy[indv] == 1) { set_indv_PHASE(indv, '|'); tmp = in[0]; if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; set_indv_GENOTYPE_alleles(indv, make_pair(tmp, -1)); } else if (ploidy[indv] == 2) { tmp = in[0]; tmp2 = in[1]; if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; if (tmp2 == (int8_t)0x80) { tmp2 = -1; set_indv_PHASE(indv, '/'); } else { char phase = phased[ tmp2 & (int8_t)1 ]; tmp2 = (tmp2 >> 1) - 1; set_indv_PHASE(indv, phase); } set_indv_GENOTYPE_alleles(indv, make_pair((int)tmp, (int)tmp2)); } else if (ploidy[indv] > 2) LOG.error("Polyploidy found, and not supported by vcftools: " + CHROM + ":" + int2str(POS)); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const unsigned int &pos, const unsigned int &size) { int8_t tmp, tmp2; unsigned int cur_pos = pos; char phased[2] = {'/', '|'}; ploidy.resize(N_indv); ploidy[indv] = 0; for (unsigned int ui=0; ui(&line[cur_pos]); if ( tmp == (int8_t)0x80 ) break; ploidy[indv]++; cur_pos += sizeof(int8_t); } if (ploidy[indv] == 0) { set_indv_GENOTYPE_alleles(indv, make_pair(-1, -1)); } else if (ploidy[indv] == 1) { set_indv_PHASE(indv, '|'); tmp = *reinterpret_cast(&line[pos]); if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; set_indv_GENOTYPE_alleles(indv, make_pair(tmp, -1)); } else if (ploidy[indv] == 2) { tmp = *reinterpret_cast(&line[pos]); tmp2 = *reinterpret_cast(&line[pos+sizeof(int8_t)]); if (tmp == (int8_t)0x80) tmp = -1; else tmp = (tmp >> 1) - 1; if (tmp2 == (int8_t)0x80) { tmp2 = -1; set_indv_PHASE(indv, '/'); } else { char phase = phased[ tmp2 & (int8_t)1 ]; tmp2 = (tmp2 >> 1) - 1; set_indv_PHASE(indv, phase); } set_indv_GENOTYPE_alleles(indv, make_pair((int)tmp, (int)tmp2)); } else if (ploidy[indv] > 2) LOG.error("Polyploidy found, and not supported by vcftools: " + CHROM + ":" + int2str(POS)); parsed_GT[indv] = true; } void 
bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { set_indv_GENOTYPE_ids(indv, genotype); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase) { pair a(-1,-1); if (genotype.first != ".") a.first = str2int(genotype.first); if (genotype.second != ".") a.second = str2int(genotype.second); set_indv_GENOTYPE_alleles(indv, a); set_indv_PHASE(indv, phase); parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (in.first != 0x80) a.first = in.first; if (in.second != 0x80) a.second = in.second; GENOTYPE[indv] = in; parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); pair a(-1,-1); if (a1 != '.') a.first = a1 - '0'; if (a2 != '.') a.second = a2 - '0'; GENOTYPE[indv] = a; parsed_GT[indv] = true; } void bcf_entry::set_indv_GENOTYPE_ids(unsigned int indv, const pair &in) { if (GENOTYPE.size() == 0) GENOTYPE.resize(N_indv, make_pair(-1,-1)); GENOTYPE[indv] = in; } void bcf_entry::set_indv_PHASE(unsigned int indv, char in) { if (PHASE.size() == 0) PHASE.resize(N_indv, '/'); PHASE[indv] = in; parsed_GT[indv] = true; } void bcf_entry::set_indv_GQUALITY(unsigned int indv, const vector &in) { float tmp; memcpy(&tmp, &in[0], sizeof(tmp)); parsed_GQ[indv] = true; if (tmp == 0x7F800001) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); if (tmp > 99.0) tmp = 99; GQUALITY[indv] = tmp; } void bcf_entry::set_indv_GQUALITY(unsigned int indv, const float &in) { parsed_GQ[indv] = true; if ( (in == -1) or (in == 0x7F800001) ) { if (GQUALITY.size() > 0) GQUALITY[indv] = -1; return; } if (GQUALITY.size() == 0) GQUALITY.resize(N_indv, -1); 
if (in > 99) GQUALITY[indv] = 99; else GQUALITY[indv] = in; } void bcf_entry::set_indv_DEPTH(unsigned int indv, int in) { parsed_DP[indv] = true; if (in == -1) { if (DEPTH.size() > 0) DEPTH[indv] = -1; return; } if (DEPTH.size() == 0) DEPTH.resize(N_indv, -1); DEPTH[indv] = in; } void bcf_entry::set_indv_GFILTER(unsigned int indv, const vector &in) { parsed_FT[indv] = true; if (GFILTER.size() == 0) GFILTER.resize(N_indv); GFILTER[indv].resize(0); if (in.empty()) return; else if ((in.size() == 1) and (in[0] == '\0') ) return; ostringstream ss; string ith_FILTER; ss.clear(); for (unsigned int ui=0; ui #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "parameters.h" #include "entry.h" #include "gamma.h" #include "vcf_entry.h" #include "bcf_entry.h" #include "header.h" #ifdef VCFTOOLS_PCA #include "dgeev.h" #endif extern output_log LOG; using namespace std; class variant_file { public: string filename; bool compressed; vector meta; vector include_indv; vector indv; unsigned int N_indv; unsigned int N_entries; bool bcf_format; header header_obj; deque entry_file_locations; deque include_entry; bool has_genotypes; bool has_body; bool has_file_format; bool has_header; bool has_meta; bool has_contigs; deque > include_genotype; virtual void scan_file(const set &chrs_to_keep, const set &exclude_chrs, bool force_write_index=false) = 0; int N_kept_individuals() const; int N_kept_sites() const; int N_total_sites() const; int N_total_indv() const; virtual void open() = 0; virtual void close() = 0; virtual bool eof() = 0; virtual inline void read_CHROM_only(string &CHROM) = 0; virtual void read_CHROM_and_POS_only(string &CHROM, int &POS) = 0; virtual inline int read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS) = 0; virtual streampos get_filepos() = 0; virtual void set_filepos(streampos &filepos) = 0; void 
apply_filters(const parameters ¶ms); virtual void get_entry(unsigned int entry_num, vector &out) = 0; virtual entry* get_entry_object(unsigned int N_indv) = 0; bool read_index_file(const string &index_filename); void write_index_file(const string &index_filename); void ByteSwap(unsigned char *b, int n) const; int idx_read(gzFile &in, void *buffer, unsigned int len, size_t size); void idx_write(gzFile &out, void *buffer, unsigned int len, size_t size); bool big_endian_machine; static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); }; void filter_sites(const set &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false); void filter_sites_to_keep(const set &snps_to_keep, const string &snps_to_keep_file); void filter_sites_to_exclude(const string &snps_to_exclude_file); void filter_sites_by_position(const string &chr, int start_pos, int end_pos); void filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file); void filter_sites_by_quality(double min_quality); void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth); void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate); void filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels); void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count); void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles); void filter_sites_by_HWE_pvalue(double min_HWE_pvalue); void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false); void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0); void filter_sites_by_filter_status(const set &filter_flags_to_remove, const set &filter_flags_to_keep, bool remove_all = false); void 
filter_sites_by_phase(); void filter_sites_by_thinning(int min_SNP_distance); void filter_sites_by_INFO_flags(const set &flags_to_remove, const set &flags_to_keep); void filter_individuals(const set &indv_to_keep, const set &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude=true); void filter_individuals_by_keep_list(const set &indv_to_keep, const string &indv_to_keep_filename); void filter_individuals_by_exclude_list(const set &indv_to_exclude, const string &indv_to_exclude_filename); void filter_individuals_by_call_rate(double min_call_rate); void filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth); void filter_individuals_by_phase(); void filter_individuals_randomly(int max_N_indv); void filter_genotypes_by_quality(double min_genotype_quality); void filter_genotypes_by_depth(int min_depth, int max_depth); void filter_genotypes_by_filter_flag(const set &filter_flags_to_remove, bool remove_all = false); void output_frequency(const string &output_file_prefix, bool output_counts=false, bool suppress_allele_output=false, bool derived=false); void output_individuals_by_mean_depth(const string &output_file_prefix); void output_site_depth(const string &output_file_prefix, bool output_mean=true); void output_genotype_depth(const string &output_file_prefix); void output_het(const string &output_file_prefix); void output_hwe(const string &output_file_prefix); void output_SNP_density(const string &output_file_prefix, int bin_size); void output_missingness(const string &output_file_prefix); void output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int snp_window_min, int bp_window_size, int bp_window_min, double min_r2); void output_genotype_r2(const string &output_file_prefix, int snp_window_size, int snp_window_min, int bp_window_size, int bp_window_min, double min_r2); void output_genotype_chisq(const string &output_file_prefix, int snp_window_size, int 
snp_window_min, int bp_window_size, int bp_window_min, double min_pval); void output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2=0.1); void output_interchromosomal_haplotype_r2(const string &output_file_prefix, double min_r2=0.1); void output_haplotype_r2_of_SNP_list_vs_all_others(const string &output_file_prefix, const string &positions_file, double min_r2); void output_genotype_r2_of_SNP_list_vs_all_others(const string &output_file_prefix, const string &positions_file, double min_r2); void output_singletons(const string &output_file_prefix); void output_TsTv(const string &output_file_prefix, int bin_size); void output_TsTv_by_count(const string &output_file_prefix); void output_TsTv_by_quality(const string &output_file_prefix); void output_per_site_nucleotide_diversity(const string &output_file_prefix); void output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size, int window_step); void output_Tajima_D(const string &output_file_prefix, int window_size); void output_site_quality(const string &output_file_prefix); void output_FILTER_summary(const string &output_file_prefix); void output_kept_and_removed_sites(const string &output_file_prefix); void output_LROH(const string &output_file_prefix); void output_indv_relatedness(const string &output_file_prefix); void output_PCA(const string &output_file_prefix, bool use_normalisation=true, int SNP_loadings_N_PCs=-1); void output_indel_hist(const string &output_file_prefix); void output_as_012_matrix(const string &output_file_prefix); void output_as_plink(const string &output_file_prefix); void output_as_plink_tped(const string &output_file_prefix); void output_BEAGLE_genotype_likelihoods(const string &output_file_prefix, int GL_or_PL=0); void output_as_IMPUTE(const string &output_file_prefix); void output_as_LDhat_phased(const string &output_file_prefix); void output_as_LDhat_unphased(const string &output_file_prefix); void output_LDhat_locs_file(const 
string &output_file_prefix, unsigned int &n_sites_out); void output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id); void output_hapmap_fst(const string &output_file_prefix, const vector &indv_files); void output_weir_and_cockerham_fst(const string &output_file_prefix, const vector &indv_files); void output_windowed_weir_and_cockerham_fst(const string &output_file_prefix, const vector &indv_files, int fst_window_size, int fst_window_step); void output_windowed_hapmap_fst(const string &output_file_prefix, const vector &indv_files, int fst_window_size, int fst_window_step); void output_sites_in_files(const string &output_file_prefix, variant_file &diff_vcf_file); void output_indv_in_files(const string &output_file_prefix, variant_file &diff_vcf_file, const string &indv_ID_map_file=""); void output_discordance_by_site(const string &output_file_prefix, variant_file &diff_vcf_file, const string &indv_ID_map_file=""); void output_discordance_matrix(const string &output_file_prefix, variant_file &diff_vcf_file, const string &indv_ID_map_file=""); void output_discordance_by_indv(const string &output_file_prefix, variant_file &diff_vcf_file, const string &indv_ID_map_file=""); void output_switch_error(const string &output_file_prefix, variant_file &diff_vcf_file, const string &indv_ID_map_file=""); void output_INFO_for_each_site(const string &output_file_prefix, const vector &INFO_to_extract); virtual void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO) = 0; virtual void print(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO=false) = 0; virtual void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO) = 0; virtual void print_bcf(const string &output_file_prefix, const set &INFO_to_keep, bool keep_all_INFO=false, bool stream=false) = 0; void calc_hap_r2(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &r2, double &D, double &Dprime, int &chr_count); 
void calc_geno_r2(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &r2, int &chr_count); void calc_r2_em(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &r2, int &indv_count); void calc_geno_chisq(entry *e, entry *e2, const vector &include_geno1, const vector &include_geno2, double &chisq, double &dof, double &pval, int &indv_count); void return_indv_union(variant_file &file2, map > &combined_individuals, const string &indv_ID_map_file=""); void return_site_union(variant_file &file2, map, pair > &out); void get_default_contigs(vector &contig_vector); virtual ~variant_file(); }; #endif /* VARIANT_FILE_H_ */ vcftools_0.1.11/cpp/entry.cpp0000644000000000000000000005326112156354766014666 0ustar rootroot/* * entry.cpp * * Created on: Dec 12, 2012 * Author: amarcketta */ #include "entry.h" /* // This function implements an exact SNP test of Hardy-Weinberg // Equilibrium as described in Wigginton, JE, Cutler, DJ, and // Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg // Equilibrium. American Journal of Human Genetics. 76: 000 - 000 // // Written by Jan Wigginton */ double entry::SNPHWE(int obs_hets, int obs_hom1, int obs_hom2) { if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1; if (obs_hom1 < 0 || obs_hom2 < 0 || obs_hets < 0) LOG.error("Internal error: negative count in HWE test", 91); int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; int obs_homr = obs_hom1 < obs_hom2 ? 
obs_hom1 : obs_hom2; int rare_copies = 2 * obs_homr + obs_hets; int genotypes = obs_hets + obs_homc + obs_homr; double * het_probs = (double *) malloc((size_t) (rare_copies + 1) * sizeof(double)); if (het_probs == NULL) LOG.error("Internal error: SNP-HWE: Unable to allocate array", 90); for (int i = 0; i <= rare_copies; i++) het_probs[i] = 0.0; /* start at midpoint */ int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); /* check to ensure that midpoint and rare alleles have same parity */ if ((rare_copies & 1) ^ (mid & 1)) mid++; int curr_hets = mid; int curr_homr = (rare_copies - mid) / 2; int curr_homc = genotypes - curr_hets - curr_homr; het_probs[mid] = 1.0; double sum = het_probs[mid]; for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) { het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); sum += het_probs[curr_hets - 2]; /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ curr_homr++; curr_homc++; } curr_hets = mid; curr_homr = (rare_copies - mid) / 2; curr_homc = genotypes - curr_hets - curr_homr; for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) { het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); sum += het_probs[curr_hets + 2]; /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ curr_homr--; curr_homc--; } for (int i = 0; i <= rare_copies; i++) het_probs[i] /= sum; /* alternate p-value calculation for p_hi/p_lo double p_hi = het_probs[obs_hets]; for (int i = obs_hets + 1; i <= rare_copies; i++) p_hi += het_probs[i]; double p_lo = het_probs[obs_hets]; for (int i = obs_hets - 1; i >= 0; i--) p_lo += het_probs[i]; double p_hi_lo = p_hi < p_lo ? 
2.0 * p_hi : 2.0 * p_lo; */ double p_hwe = 0.0; /* p-value calculation for p_hwe */ for (int i = 0; i <= rare_copies; i++) { if (het_probs[i] > het_probs[obs_hets]) continue; p_hwe += het_probs[i]; } p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; free(het_probs); return p_hwe; } int entry::str2int(const string &in, const int missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atoi(in.c_str()); } double entry::str2double(const string &in, const double missing_value) { if ((in.size() == 0) || (in == ".")) return missing_value; else return atof(in.c_str()); } string entry::int2str(const int in, const int missing_value) { if (in == missing_value) return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } } string entry::double2str(const double in, const double missing_value) { if (in == missing_value) return "."; else { static ostringstream out; out.str(""); out.clear(); out << in; return out.str(); } } void entry::tokenize(const string &in, char token, vector &out) { out.resize(0); istringstream ss(in); string tmp; while( getline(ss, tmp, token) ) { out.push_back(tmp); } } void entry::copy_object(vector &out, int &position, const vector &in) { memcpy(&out[position], &in, in.size() ); position += in.size(); } void entry::make_typed_string(vector &out, const string &in, bool typed) { vector tmp_vector; out.resize(0); if (in == "." 
or in == " " or in == "") { if (typed == false) return; int8_t tmp = (int8_t)0; tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); return; } if (typed == true) { if (in.length() >= 15) { int8_t tmp = (int8_t)15; tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); make_typed_int(tmp_vector, in.length(), typed); out.insert( out.end(), tmp_vector.begin(), tmp_vector.end() ); } else { int8_t tmp = (int8_t)in.length(); tmp = tmp << 4; tmp = tmp | (int8_t)7; out.push_back( tmp ); } } out.reserve(out.size()+in.size()); copy(in.begin(), in.end(), back_inserter(out)); } void entry::make_typed_int(vector &out, const int &in, bool typed) { vector tmp_char; out.resize(0); int type; int8_t size_type = (int8_t)1; if (in < 127 and in >-127) type = 1; else if (in < 32767 and in>-32767) type = 2; else type = 3; make_int(tmp_char, in, type); if (typed == true) { size_type = size_type << 4; size_type = size_type | type; out.push_back(size_type); } out.insert(out.end(), tmp_char.begin(), tmp_char.end()); } void entry::make_typed_string_vector( vector &out, const vector &in, int number ) { vector tmp_char; int max_val = 0; int8_t size_type; out.resize(0); if (number == -1) { for (unsigned int ui=0; ui max_val) max_val = in[ui].size(); } } else max_val = number; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)7; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)7; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.end() ); } for (unsigned int ui=0; ui &out, vector &in ) { vector tmp_vector; int8_t size_type; int max_ploidy = 0; out.resize(0); max_ploidy = *max_element(ploidy.begin(), ploidy.end()); if (max_ploidy < 15) { size_type = (int8_t)max_ploidy; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); } else { size_type = 
(int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); make_typed_int(tmp_vector, max_ploidy, true); out.insert( out.end(), tmp_vector.begin(), tmp_vector.end() ); tmp_vector.resize(0); } for (unsigned int ui=0; ui &out, string &in, int exp_size) { int8_t tmp_int; int8_t phased = 0; out.resize(0); for (unsigned int ui=0; ui &out, const string &in, int number ) { vector tmp_char; vector tmp_ints; vector split_string; int converted, type; int8_t size_type; unsigned int max = 0; unsigned int max_val = 0; out.resize(0); if (in == " " or in == "." or in == "") { size_type = (int8_t)0; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); return; } tokenize(in, ',', split_string); if (number == -1) { if (split_string.size() > max_val) max_val = split_string.size(); } else max_val = number; for (unsigned int ui=0; ui (int)max) and ( converted != (int)0x80000000)) max = abs(converted); } else converted = 0x80000000; tmp_ints.push_back( converted ); } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const vector &in, int number ) { vector tmp_char; vector tmp_ints; vector split_string; int converted, type; int8_t size_type; unsigned int max = 0; unsigned int max_val = 0; out.resize(0); if (number == -1) { unsigned int tmp_int = 0; for (unsigned int ui=0; ui max_val) max_val = tmp_int; } max_val++; } else max_val = number; for (unsigned int ui=0; ui (int)max) and (converted != (int)0x80000000)) max = abs(converted); } else converted = 0x80000000; tmp_ints.push_back( 
converted ); } } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (max_val < 15) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, max_val, true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const vector &in ) { vector tmp_char; int type; int8_t size_type; unsigned int max = 0; out.resize(0); for (unsigned int ui=0; ui (int)max) and ( (int8_t)in[ui] != (int8_t)0x80)) max = abs(in[ui]); } if (max < 127) type = 1; else if (max < 32767) type = 2; else type = 3; if (in.size() < 15) { size_type = (int8_t)in.size(); size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)type; out.push_back( size_type ); make_typed_int(tmp_char, in.size(), true); out.insert( out.end(), tmp_char.begin(), tmp_char.begin() ); } for (unsigned int ui=0; ui &out, const int &in, int type) { out.resize(0); if (type == 1) { int8_t tmp_int; if (in == (int)0x80000000 || in >= 128) tmp_int = (int8_t)0x80; else tmp_int = (int8_t)in; out.push_back( (int8_t)tmp_int); } else if (type == 2) { int16_t tmp_int; if (in == (int)0x80000000 || in >= 32768) tmp_int = 0x8000; else tmp_int = (int16_t)in; int8_t split; for(unsigned int ui=0; ui<2; ui++) { split = tmp_int & (int16_t)0x00FF;//0000000011111111 out.push_back(split); tmp_int = tmp_int >> 8; } } else { int32_t tmp_int; tmp_int = (int32_t)in; int8_t split; for(unsigned int ui=0; ui<4; ui++) { split = tmp_int & (int32_t)0x0000FF; out.push_back( (int8_t)split); tmp_int = tmp_int >> 8; } } } void entry::make_typed_float_vector(vector &out, const string &in, int number ) { vector split_string; int8_t size_type; int max_val = 0; 
out.resize(0); if (in == " " or in == "." or in == "") { size_type = (int8_t)0; size_type = size_type << 4; size_type = size_type | (int8_t)1; out.push_back( size_type ); return; } tokenize(in, ',', split_string); if (number == -1) max_val = split_string.size(); else max_val = number; if ( max_val < 15 ) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); vector size_vector; make_typed_int(size_vector, max_val, true ); out.insert(out.end(), size_vector.begin(), size_vector.end()); } float value; char missing[4] = {0x01, 0x00, 0x80, 0x7F}; for(unsigned int ui=0; (int)ui &out, const vector &in, int number ) { vector split_string; int8_t size_type; unsigned int max_val = 0; out.resize(0); if (number == -1) { unsigned int tmp_int = 0; for (unsigned int ui=0; ui max_val) max_val = tmp_int; } max_val++; } else max_val = number; if ( max_val < 15 ) { size_type = (int8_t)max_val; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); } else { size_type = (int8_t)15; size_type = size_type << 4; size_type = size_type | (int8_t)5; out.push_back( size_type ); vector size_vector; make_typed_int(size_vector, max_val, true ); out.insert(out.end(), size_vector.begin(), size_vector.end()); } float value; char missing[4] = {0x01, 0x00, 0x80, 0x7F}; for (unsigned int ui=0; ui &out, const unsigned int &type, const unsigned int &size) { uint8_t byte; vector tmp_vector; tmp_vector.resize(0); out.resize(0); if (size < 15) { byte = size; byte = byte << 4; } else { byte = (uint8_t)15; make_typed_int(tmp_vector, size, true); } byte = byte | (uint8_t)type; out.push_back(byte); out.insert(out.end(), tmp_vector.begin(), tmp_vector.end()); } float entry::get_typed_float(unsigned int * line_position, const vector& line) { unsigned int size, type; float out; get_type( 
line_position, line, type, size ); if (size > 1) { LOG.printLOG("Error: Float vector when expected only a single Float value.\n" ); exit(0); } if (type == 5) { memcpy(&out, &line[*line_position], sizeof(out)); *line_position += sizeof(out); } else { LOG.printLOG("Error: Float expected but found type " + int2str(type) + ".\n" ); exit(0); } return out; } vector entry::get_typed_float_vector(unsigned int * line_position, const vector& line) { unsigned int size, type; get_type( line_position, line, type, size ); vector out(size); if (type == 5) { float tmp; for (unsigned int ui=0; ui& line) { unsigned int size, type; string out; get_type( line_position, line, type, size ); if (type != 7) { LOG.printLOG("Error: Expected type 7 for string. Found type " + int2str(type) + ".\n"); } char * tmp = new char[size]; memcpy(tmp, &line[*line_position], size*sizeof(char)); *line_position += size; out = string( tmp, size ); if (out == "" or out == " ") out = "."; return out; } int entry::get_typed_int(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size) { int out; get_type( line_position, line, type, size ); if (size > 1) { LOG.printLOG("Error: Int vector when expected only a single Integer value.\n" ); exit(0); } if (type == 1) { int8_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else if (type == 2) { int16_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else if (type == 3) { int32_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); out = tmp; } else { LOG.printLOG("Error: Invalid type for integer size.\n"); exit(0); } return out; } vector entry::get_int_vector(unsigned int * line_position, const vector& line) { unsigned int size, type; get_type( line_position, line, type, size ); vector out(size); if (type == 1) { int8_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); 
out[ui] = tmp; } } else if (type == 2) { int16_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); out[ui] = tmp; } } else if (type == 3) { int32_t tmp; for (unsigned int ui=0; ui(&line[*line_position]); *line_position += sizeof(tmp); out[ui] = tmp; } } else { LOG.printLOG("Error: Invalid type for integer size.\n"); exit(0); } return out; } void entry::get_type(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size) { uint8_t byte = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(byte); size = byte >> 4; type = (byte & (uint8_t)15); if (size == 15) { int type2; byte = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(byte); type2 = (byte & (uint8_t)15); if (type2 == 1) { int8_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (unsigned int)tmp; } else if (type2 == 2) { int16_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (int)tmp; } else if (type2 == 3) { int32_t tmp; tmp = *reinterpret_cast(&line[*line_position]); *line_position += sizeof(tmp); size = (unsigned int)tmp; } else { LOG.printLOG("Error: Invalid type for integer size.\n"); exit(0); } } } void entry::skip_section(unsigned int *line_position, const vector &line) { unsigned int type, size; get_type(line_position, line, type, size); if ( (type == 1) || (type == 7) ) *line_position += sizeof(int8_t)*size; else if (type == 2) *line_position += sizeof(int16_t)*size; else if ( (type == 3) || (type == 5) ) *line_position += sizeof(int32_t)*size; } bool entry::check_missing(unsigned int line_position, const unsigned int type, const vector &line) { static char missing_float[4] = {0x01, 0x00, 0x80, 0x7F}; static char missing_int1 = 0x80; static char missing_int2[2] = {0x00, 0x80}; static char missing_int3[4] = {0x00, 0x00, 0x00, 0x80}; char test_char; bool missing = true; if (type==1) { test_char = 
*reinterpret_cast(&line[line_position]); missing = (test_char == missing_int1); } else if (type==2) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_int2[ui]) { missing = false; break; } line_position += sizeof(char); } } else if (type==3) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_int3[ui]) { missing = false; break; } line_position += sizeof(char); } } else if (type==5) { for (unsigned int ui=0; ui(&line[line_position]); if (test_char != missing_float[ui]) { missing = false; break; } line_position += sizeof(char); } } else if (type==7) missing = false; return missing; } void entry::decode_genotype(int8_t in, int >, bool &phased) { GT = (int)(in >> 1)-1; phased = (in & (int8_t)1); } void entry::get_number(uint32_t &out, unsigned int *line_position, const vector& line) { memcpy(&out, &line[*line_position], sizeof(out)); *line_position += sizeof(out); } vcftools_0.1.11/cpp/knetfile.c0000644000000000000000000004347512156354766014774 0ustar rootroot/* The MIT License Copyright (c) 2008 by Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Probably I will not do socket programming in the next few years and therefore I decide to heavily annotate this file, for Linux and Windows as well. -ac */ #include #include #include #include #include #include #include #include #ifndef _WIN32 #include #include #include #endif #include "knetfile.h" /* In winsock.h, the type of a socket is SOCKET, which is: "typedef * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed * integer -1. In knetfile.c, I use "int" for socket type * throughout. This should be improved to avoid confusion. * * In Linux/Mac, recv() and read() do almost the same thing. You can see * in the header file that netread() is simply an alias of read(). In * Windows, however, they are different and using recv() is mandatory. */ /* This function tests if the file handler is ready for reading (or * writing if is_read==0). */ static int socket_wait(int fd, int is_read) { fd_set fds, *fdr = 0, *fdw = 0; struct timeval tv; int ret; tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out FD_ZERO(&fds); FD_SET(fd, &fds); if (is_read) fdr = &fds; else fdw = &fds; ret = select(fd+1, fdr, fdw, 0, &tv); #ifndef _WIN32 if (ret == -1) perror("select"); #else if (ret == 0) fprintf(stderr, "select time-out\n"); else if (ret == SOCKET_ERROR) fprintf(stderr, "select: %d\n", WSAGetLastError()); #endif return ret; } #ifndef _WIN32 /* This function does not work with Windows due to the lack of * getaddrinfo() in winsock. It is addapted from an example in "Beej's * Guide to Network Programming" (http://beej.us/guide/bgnet/). 
*/ static int socket_connect(const char *host, const char *port) { #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) int on = 1, fd; struct linger lng = { 0, 0 }; struct addrinfo hints, *res = 0; memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; /* In Unix/Mac, getaddrinfo() is the most convenient way to get * server information. */ if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); /* The following two setsockopt() are used by ftplib * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they * necessary. */ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); freeaddrinfo(res); return fd; } #else /* MinGW's printf has problem with "%lld" */ char *int64tostr(char *buf, int64_t x) { int cnt; int i = 0; do { buf[i++] = '0' + x % 10; x /= 10; } while (x); buf[i] = 0; for (cnt = i, i = 0; i < cnt/2; ++i) { int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; } return buf; } int64_t strtoint64(const char *buf) { int64_t x; for (x = 0; *buf != '\0'; ++buf) x = x * 10 + ((int64_t) *buf - 48); return x; } /* In windows, the first thing is to establish the TCP connection. */ int knet_win32_init() { WSADATA wsaData; return WSAStartup(MAKEWORD(2, 2), &wsaData); } void knet_win32_destroy() { WSACleanup(); } /* A slightly modfied version of the following function also works on * Mac (and presummably Linux). However, this function is not stable on * my Mac. It sometimes works fine but sometimes does not. Therefore for * non-Windows OS, I do not use this one. 
*/ static SOCKET socket_connect(const char *host, const char *port) { #define __err_connect(func) \ do { \ fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ return -1; \ } while (0) int on = 1; SOCKET fd; struct linger lng = { 0, 0 }; struct sockaddr_in server; struct hostent *hp = 0; // open socket if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); // get host info if (isalpha(host[0])) hp = gethostbyname(host); else { struct in_addr addr; addr.s_addr = inet_addr(host); hp = gethostbyaddr((char*)&addr, 4, AF_INET); } if (hp == 0) __err_connect("gethost"); // connect server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); server.sin_family= AF_INET; server.sin_port = htons(atoi(port)); if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) return fd; } #endif static off_t my_netread(int fd, void *buf, off_t len) { off_t rest = len, curr, l = 0; /* recv() and read() may not read the required length of data with * one call. They have to be called repeatedly. */ while (rest) { if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading curr = netread(fd, (void*)((char*)buf + l), rest); /* According to the glibc manual, section 13.2, a zero returned * value indicates end-of-file (EOF), which should mean that * read() will not return zero if EOF has not been met but data * are not immediately available. 
*/ if (curr == 0) break; l += curr; rest -= curr; } return l; } /************************* * FTP specific routines * *************************/ static int kftp_get_response(knetFile *ftp) { #ifndef _WIN32 unsigned char c; #else char c; #endif int n = 0; char *p; if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O //fputc(c, stderr); if (n >= ftp->max_response) { ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; ftp->response = (char*)realloc(ftp->response, ftp->max_response); } ftp->response[n++] = c; if (c == '\n') { if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) && ftp->response[3] != '-') break; n = 0; continue; } } if (n < 2) return -1; ftp->response[n-2] = 0; return strtol(ftp->response, &p, 0); } static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) { if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); return is_get? 
kftp_get_response(ftp) : 0; } static int kftp_pasv_prep(knetFile *ftp) { char *p; int v[6]; kftp_send_cmd(ftp, "PASV\r\n", 1); for (p = ftp->response; *p && *p != '('; ++p); if (*p != '(') return -1; ++p; sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; return 0; } static int kftp_pasv_connect(knetFile *ftp) { char host[80], port[10]; if (ftp->pasv_port == 0) { fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); return -1; } sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); sprintf(port, "%d", ftp->pasv_port); ftp->fd = socket_connect(host, port); if (ftp->fd == -1) return -1; return 0; } int kftp_connect(knetFile *ftp) { ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); if (ftp->ctrl_fd == -1) return -1; kftp_get_response(ftp); kftp_send_cmd(ftp, "USER anonymous\r\n", 1); kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); kftp_send_cmd(ftp, "TYPE I\r\n", 1); return 0; } int kftp_reconnect(knetFile *ftp) { if (ftp->ctrl_fd != -1) { netclose(ftp->ctrl_fd); ftp->ctrl_fd = -1; } netclose(ftp->fd); ftp->fd = -1; return kftp_connect(ftp); } // initialize ->type, ->host, ->retr and ->size knetFile *kftp_parse_url(const char *fn, const char *mode) { knetFile *fp; char *p; int l; if (strstr(fn, "ftp://") != fn) return 0; for (p = (char*)fn + 6; *p && *p != '/'; ++p); if (*p != '/') return 0; l = p - fn - 6; fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_FTP; fp->fd = -1; /* the Linux/Mac version of socket_connect() also recognizes a port * like "ftp", but the Windows version does not. 
*/ fp->port = strdup("21"); fp->host = (char*)calloc(l + 1, 1); if (strchr(mode, 'c')) fp->no_reconnect = 1; strncpy(fp->host, fn + 6, l); fp->retr = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->retr, "RETR %s\r\n", p); fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); sprintf(fp->size_cmd, "SIZE %s\r\n", p); fp->seek_offset = 0; return fp; } // place ->fd at offset off int kftp_connect_file(knetFile *fp) { int ret; long long file_size; if (fp->fd != -1) { netclose(fp->fd); if (fp->no_reconnect) kftp_get_response(fp); } kftp_pasv_prep(fp); kftp_send_cmd(fp, fp->size_cmd, 1); #ifndef _WIN32 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) { fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); return -1; } #else const char *p = fp->response; while (*p != ' ') ++p; while (*p < '0' || *p > '9') ++p; file_size = strtoint64(p); #endif fp->file_size = file_size; if (fp->offset>=0) { char tmp[32]; #ifndef _WIN32 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); #else strcpy(tmp, "REST "); int64tostr(tmp + 5, fp->offset); strcat(tmp, "\r\n"); #endif kftp_send_cmd(fp, tmp, 1); } kftp_send_cmd(fp, fp->retr, 0); kftp_pasv_connect(fp); ret = kftp_get_response(fp); if (ret != 150) { fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); netclose(fp->fd); fp->fd = -1; return -1; } fp->is_ready = 1; return 0; } /************************** * HTTP specific routines * **************************/ knetFile *khttp_parse_url(const char *fn, const char *mode) { knetFile *fp; char *p, *proxy, *q; int l; if (strstr(fn, "http://") != fn) return 0; // set ->http_host for (p = (char*)fn + 7; *p && *p != '/'; ++p); l = p - fn - 7; fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->http_host = (char*)calloc(l + 1, 1); strncpy(fp->http_host, fn + 7, l); fp->http_host[l] = 0; for (q = fp->http_host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; // get http_proxy proxy = getenv("http_proxy"); // set ->host, ->port and ->path if (proxy == 0) { fp->host = strdup(fp->http_host); // 
when there is no proxy, server name is identical to http_host name. fp->port = strdup(*q? q : "80"); fp->path = strdup(*p? p : "/"); } else { fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); for (q = fp->host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; fp->port = strdup(*q? q : "80"); fp->path = strdup(fn); } fp->type = KNF_TYPE_HTTP; fp->ctrl_fd = fp->fd = -1; fp->seek_offset = 0; return fp; } int khttp_connect_file(knetFile *fp) { int ret, l = 0; char *buf, *p; if (fp->fd != -1) netclose(fp->fd); fp->fd = socket_connect(fp->host, fp->port); buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); l += sprintf(buf + l, "\r\n"); netwrite(fp->fd, buf, l); l = 0; while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency if (buf[l] == '\n' && l >= 3) if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; ++l; } buf[l] = 0; if (l < 14) { // prematured header netclose(fp->fd); fp->fd = -1; return -1; } ret = strtol(buf + 8, &p, 0); // HTTP return code if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file off_t rest = fp->offset; while (rest) { off_t l = rest < 0x10000? 
rest : 0x10000; rest -= my_netread(fp->fd, buf, l); } } else if (ret != 206 && ret != 200) { free(buf); fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); netclose(fp->fd); fp->fd = -1; return -1; } free(buf); fp->is_ready = 1; return 0; } /******************** * Generic routines * ********************/ knetFile *knet_open(const char *fn, const char *mode) { knetFile *fp = 0; if (mode[0] != 'r') { fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); return 0; } if (strstr(fn, "ftp://") == fn) { fp = kftp_parse_url(fn, mode); if (fp == 0) return 0; if (kftp_connect(fp) == -1) { knet_close(fp); return 0; } kftp_connect_file(fp); } else if (strstr(fn, "http://") == fn) { fp = khttp_parse_url(fn, mode); if (fp == 0) return 0; khttp_connect_file(fp); } else { // local file #ifdef _WIN32 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may * be undefined on some systems, although it is defined on my * Mac and the Linux I have tested on. */ int fd = open(fn, O_RDONLY | O_BINARY); #else int fd = open(fn, O_RDONLY); #endif if (fd == -1) { perror("open"); return 0; } fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; fp->ctrl_fd = -1; } if (fp && fp->fd == -1) { knet_close(fp); return 0; } return fp; } knetFile *knet_dopen(int fd, const char *mode) { knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); fp->type = KNF_TYPE_LOCAL; fp->fd = fd; return fp; } ssize_t knet_read(knetFile *fp, void *buf, size_t len) { off_t l = 0; if (fp->fd == -1) return 0; if (fp->type == KNF_TYPE_FTP) { if (fp->is_ready == 0) { if (!fp->no_reconnect) kftp_reconnect(fp); kftp_connect_file(fp); } } else if (fp->type == KNF_TYPE_HTTP) { if (fp->is_ready == 0) khttp_connect_file(fp); } if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX size_t rest = len; ssize_t curr; while (rest) { do { curr = read(fp->fd, (void*)((char*)buf + l), rest); } while (curr < 0 && EINTR == 
errno); if (curr < 0) return -1; if (curr == 0) break; l += curr; rest -= curr; } } else l = my_netread(fp->fd, buf, len); fp->offset += l; return l; } off_t knet_seek(knetFile *fp, off_t off, int whence) { if (whence == SEEK_SET && off == fp->offset) return 0; if (fp->type == KNF_TYPE_LOCAL) { /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */ off_t offset = lseek(fp->fd, off, whence); if (offset == -1) return -1; fp->offset = offset; return fp->offset; } else if (fp->type == KNF_TYPE_FTP) { if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else if (whence == SEEK_END) fp->offset = fp->file_size + off; else return -1; fp->is_ready = 0; return fp->offset; } else if (fp->type == KNF_TYPE_HTTP) { if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); errno = ESPIPE; return -1; } if (whence == SEEK_CUR) fp->offset += off; else if (whence == SEEK_SET) fp->offset = off; else return -1; fp->is_ready = 0; return fp->offset; } errno = EINVAL; fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); return -1; } int knet_close(knetFile *fp) { if (fp == 0) return 0; if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific if (fp->fd != -1) { /* On Linux/Mac, netclose() is an alias of close(), but on * Windows, it is an alias of closesocket(). 
*/ if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); else netclose(fp->fd); } free(fp->host); free(fp->port); free(fp->response); free(fp->retr); // FTP specific free(fp->path); free(fp->http_host); // HTTP specific free(fp); return 0; } #ifdef KNETFILE_MAIN int main(void) { char *buf; knetFile *fp; int type = 4, l; #ifdef _WIN32 knet_win32_init(); #endif buf = calloc(0x100000, 1); if (type == 0) { fp = knet_open("knetfile.c", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 1) { // NCBI FTP, large file fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); knet_seek(fp, 2500000000ll, SEEK_SET); l = knet_read(fp, buf, 255); } else if (type == 2) { fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 3) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); knet_seek(fp, 1000, SEEK_SET); } else if (type == 4) { fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); knet_read(fp, buf, 10000); knet_seek(fp, 20000, SEEK_SET); knet_seek(fp, 10000, SEEK_SET); l = knet_read(fp, buf+10000, 10000000) + 10000; } if (type != 4 && type != 1) { knet_read(fp, buf, 255); buf[255] = 0; printf("%s\n", buf); } else write(fileno(stdout), buf, l); knet_close(fp); free(buf); return 0; } #endif vcftools_0.1.11/cpp/output_log.cpp0000644000000000000000000000341412156354766015721 0ustar rootroot/* * log.cpp * * Created on: Nov 11, 2009 * Author: Adam Auton * ($Revision: 66 $) */ #include "output_log.h" output_log::output_log() : output_to_screen(true) { } void output_log::open(const string &filename_prefix ) { LOG.open((filename_prefix + ".log").c_str()); } void output_log::close() { LOG.close(); } void output_log::set_screen_output(bool do_screen_output) { output_to_screen = do_screen_output; } void output_log::printLOG(string s) { LOG << s; LOG.flush(); if (output_to_screen) { cout << s; cout.flush(); } } void 
output_log::error(string err_msg, int error_code) { printLOG("Error:" + err_msg + "\n"); exit(error_code); } void output_log::error(string err_msg, double value1, double value2, int error_code) { printLOG("Error:" + err_msg + "\n"); stringstream ss; ss << "Value1=" << value1 << " Value2=" << value2 << endl; printLOG(ss.str()); exit(error_code); } void output_log::warning(string err_msg) { printLOG(err_msg + "\n"); } void output_log::one_off_warning(string err_msg) { static set previous_warnings; if (previous_warnings.find(err_msg) == previous_warnings.end()) { printLOG(err_msg + "\n"); previous_warnings.insert(err_msg); } } string output_log::int2str(int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } string output_log::longint2str(long int n) { std::ostringstream s2( std::stringstream::out ); s2 << n; return s2.str(); } string output_log::dbl2str(double n, int prc) { std::ostringstream s2; if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } string output_log::dbl2str_fixed(double n, int prc) { std::ostringstream s2; s2 << setiosflags( ios::fixed ); if ( prc > 0 ) s2.precision(prc); s2 << n; return s2.str(); } vcftools_0.1.11/cpp/bcf_entry.h0000644000000000000000000001044012156354766015135 0ustar rootroot/* * bcf_entry.h * * Created on: Sep 20, 2012 * Author: Anthony Marcketta * ($Revision: 1 $) */ #include #include #include #include #include #include "output_log.h" #include "entry.h" #include "header.h" extern output_log LOG; class bcf_entry : public entry { public: bcf_entry(const unsigned int N_indv, const header &header_obj, const vector &line); bcf_entry(const unsigned int N_indv, const header &header_obj); ~bcf_entry(); map INFO_map; map FILTER_map; map FORMAT_map; map CONTIG_map; map CONTIG_reverse_map; map FILTER_reverse_map; map INFO_reverse_map; map FORMAT_reverse_map; header entry_header; unsigned int N_samples; unsigned int N_info; unsigned int N_format; unsigned int L_shared; unsigned int L_indiv; unsigned 
int line_pos; void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false); void parse_full_entry(bool parse_FORMAT=true); void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false); void set_ALT(const int n_allele); void set_ALT(const string &in); void set_QUAL(const float &in); void set_FILTER(); void set_FORMAT(); void set_INFO(); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const vector &in); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const pair &genotype, char phase); void set_indv_GENOTYPE_and_PHASE(unsigned int indv, const unsigned int &pos, const unsigned int &size); void set_indv_GENOTYPE_ids(unsigned int indv, const pair &in); void set_indv_GQUALITY(unsigned int indv, const vector &in); void set_indv_GQUALITY(unsigned int indv, const float &in); void set_indv_DEPTH(unsigned int indv, const vector &in); void set_indv_DEPTH(unsigned int indv, int in); void set_indv_GFILTER(unsigned int indv, const string &in); void set_indv_GFILTER(unsigned int indv, const vector &in); void set_indv_PHASE(unsigned int indv, char in); void set_indv_GENOTYPE_alleles(unsigned int indv, char a1, char a2); void set_indv_GENOTYPE_alleles(unsigned int indv, const pair &in); void reset(const vector &data_line); void add_FORMAT_entry(const string &in, const unsigned int &fmt_key, const unsigned int &pos, const unsigned int &line_pos, const unsigned int &type, const unsigned int &size); void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out); void read_indv_generic_entry(unsigned int indv, const int &idx, string &out); void write_out( const char * filename, const bool stream ); void read_all_entries(string &out, const vector &include_indv, const vector &include_genotype); void 
filter_genotypes_by_quality(vector &include_genotype_out, double min_genotype_quality); void filter_genotypes_by_depth(vector &include_genotype_out, int min_depth, int max_depth); void filter_genotypes_by_filter_status(vector &include_genotype_out, const set &filter_flags_to_remove, bool remove_all = false); void print(ostream &out); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false); void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype); void print_bcf(BGZF* out); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false); void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype); static int add_INFO_descriptor(const string &in, int index); static int add_FILTER_descriptor(const string &in, int index); static int add_FORMAT_descriptor(const string &in, int index); static void add_CONTIG_descriptor(const string &in, int index); private: vector line; vector INFO_str, QUAL_str; vector FILTER_str; vector ALT_str; unsigned int INFO_pos, FILTER_pos, ALT_pos, FORMAT_pos; }; vcftools_0.1.11/cpp/entry.h0000644000000000000000000002002312156354766014321 0ustar rootroot/* * entry.h * * Created on: Dec 12, 2012 * Author: amarcketta */ #ifndef ENTRY_H_ #define ENTRY_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include "bgzf.h" #include "output_log.h" using namespace std; extern output_log LOG; enum Type_enum {Integer=0, Float=1, Character=2, String=3, Flag=4}; class Field_description { public: string ID; int N_entries; string N_entries_str; string Type_str; Type_enum Type; string Description; string Length; string Assembly; Field_description() : ID(""), N_entries(0), Type(Integer), Description("") {}; ~Field_description() {}; }; class entry { public: unsigned int N_indv; virtual void parse_basic_entry(bool parse_ALT=false, 
bool parse_FILTER=false, bool parse_INFO=false) = 0; virtual void parse_full_entry(bool parse_FORMAT=true) = 0; virtual void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; virtual void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; virtual void reset(const vector &data_line) = 0; string get_CHROM() const; void get_CHROM(string &out) const; int get_POS() const; string get_ID() const; string get_REF() const; string get_ALT() const; string get_ALT_allele(int allele_num) const; void get_allele(int allele_num, string &out) const; string get_allele(int allele_num) const; void get_alleles_vector(vector &out) const; string get_FILTER() const; void get_FILTER_vector(vector &out) const; double get_QUAL() const; string get_INFO(const set &INFO_to_keep, bool keep_all_INFO=false) const; string get_INFO_value(const string &key) const; string get_FORMAT() const; void get_indv_GENOTYPE_ids(unsigned int indv, pair &out) const; void get_indv_GENOTYPE_strings(unsigned int indv, pair &out) const; char get_indv_PHASE(unsigned int indv) const; double get_indv_GQUALITY(unsigned int indv) const; int get_indv_DEPTH(unsigned int indv) const; void get_indv_GFILTER(unsigned int indv, string &out) const; void get_indv_GFILTER_vector(unsigned int indv, vector &out) const; int get_indv_ploidy(unsigned int indv) const; bool is_SNP() const; bool is_biallelic_SNP() const; bool is_diploid(const vector &include_indv, const vector &include_genotype) const; virtual void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out) = 0; bool FORMAT_id_exists(const string &FORMAT_id); void get_allele_counts(vector &out, unsigned int &N_non_missing_chr_out, const vector &include_indv, const vector &include_genotype) const; void get_genotype_counts(const vector &include_indv, const vector &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const; 
unsigned int get_N_alleles() const; unsigned int get_N_chr(const vector &include_indv, const vector &include_genotype) const; void get_POS_binary(vector &out) const; void get_ID_binary(vector &out); void get_rlen(vector &out) const; void get_QUAL_binary(vector &out) const; void get_n_allele_info(vector &out) const; void get_n_fmt_sample(vector &out) const; void get_ALLELES_binary(vector &out); vector > get_INFO_vector(const set &INFO_to_keep, bool keep_all_INFO=false) const; void get_FORMAT_binary(vector &out) const; string get_typed_string( unsigned int * line_position, const vector& line ); void get_type(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size); vector get_int_vector(unsigned int * line_position, const vector& line); int get_typed_int(unsigned int * line_position, const vector& line, unsigned int &type, unsigned int &size); float get_typed_float(unsigned int * line_position, const vector& line); vector get_typed_float_vector(unsigned int * line_position, const vector& line); void get_number(uint32_t &out, unsigned int * line_position, const vector& line); void decode_genotype(int8_t in, int >, bool &phased); void make_typed_string(vector &out, const string &in, bool typed); void make_typed_int(vector &out, const int &in, bool typed); void make_int(vector &out, const int &in, int type); void make_typed_int_vector(vector &out, const vector &in, int number = -1); void make_typed_int_vector(vector &out, const string &in, int number = -1); void make_typed_int_vector(vector &out, const vector &in); void make_typed_float_vector(vector &out, const string &in, int number = -1); void make_typed_float_vector(vector &out, const vector &in, int number = -1); void make_typed_string_vector(vector &out, const vector &in, int number = -1); void make_typed_GT_vector(vector &out, vector &in); void make_type_size(vector &out, const unsigned int &type, const unsigned int &size); void encode_genotype(vector &out, string &in, int 
exp_size); void copy_object(vector &out, int &position, const vector &in); void skip_section(unsigned int *line_position, const vector &line); bool check_missing(unsigned int line_position, const unsigned int type, const vector &line); void set_CHROM(const string &in); void set_POS(const int in); void set_ID(const string &in); void set_REF(const string &in); void add_ALT_allele(const string &in); void add_FILTER_entry(const string &in); virtual void print(ostream &out) = 0; virtual void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO=false) = 0; virtual void print(ostream &out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) = 0; virtual void print_bcf(BGZF* out) = 0; virtual void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO=false) = 0; virtual void print_bcf(BGZF* out, const set &INFO_to_keep, bool keep_all_INFO, const vector &include_indv, const vector &include_genotype) = 0; virtual void filter_genotypes_by_depth(vector &include_genotype_out, int min_depth, int max_depth) = 0; virtual void filter_genotypes_by_quality(vector &include_genotype_out, double min_genotype_quality) = 0; virtual void filter_genotypes_by_filter_status(vector &include_genotype_out, const set &filter_flags_to_remove, bool remove_all = false) = 0; static double SNPHWE(int obs_hets, int obs_hom1, int obs_hom2); static void tokenize(const string &in, char token, vector &out); static int str2int(const string &in, const int missing_value=-1); static double str2double(const string &in, const double missing_value=-1.0); static string int2str(const int in, const int missing_value=-1); static string double2str(const double in, const double missing_value=-1.0); static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); }; protected: istringstream data_stream; bool basic_parsed; bool fully_parsed; bool parsed_ALT; bool parsed_FILTER; bool parsed_INFO; bool parsed_FORMAT; bool 
parsed_FORMAT_binary; string CHROM; int POS; string ID; string REF; vector ALT; double QUAL; vector FILTER; bool passed_filters; vector > INFO; vector FORMAT; vector FORMAT_binary; int N_INFO_removed; int N_FORMAT_removed; vector< pair > GENOTYPE; vector ploidy; vector PHASE; vector GQUALITY; vector DEPTH; vector< vector > GFILTER; vector parsed_GT; vector parsed_GQ; vector parsed_DP; vector parsed_FT; map FORMAT_to_idx; int GT_idx; int GQ_idx; int DP_idx; int FT_idx; vector FORMAT_positions, FORMAT_types, FORMAT_sizes, FORMAT_skip, FORMAT_keys; }; #endif /* ENTRY_H_ */ vcftools_0.1.11/.cproject0000644000000000000000000012074312156354770014044 0ustar rootroot vcftools_0.1.11/.project0000644000000000000000000000460512156354770013677 0ustar rootroot vcftools org.eclipse.cdt.managedbuilder.core.genmakebuilder clean,full,incremental, ?name? org.eclipse.cdt.make.core.append_environment true org.eclipse.cdt.make.core.autoBuildTarget all org.eclipse.cdt.make.core.buildArguments org.eclipse.cdt.make.core.buildCommand make org.eclipse.cdt.make.core.buildLocation ${workspace_loc:/vcftools/Debug} org.eclipse.cdt.make.core.cleanBuildTarget clean org.eclipse.cdt.make.core.contents org.eclipse.cdt.make.core.activeConfigSettings org.eclipse.cdt.make.core.enableAutoBuild false org.eclipse.cdt.make.core.enableCleanBuild true org.eclipse.cdt.make.core.enableFullBuild true org.eclipse.cdt.make.core.fullBuildTarget all org.eclipse.cdt.make.core.stopOnError true org.eclipse.cdt.make.core.useDefaultBuildCmd true org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder org.eclipse.cdt.core.cnature org.eclipse.cdt.core.ccnature org.eclipse.cdt.managedbuilder.core.managedBuildNature org.eclipse.cdt.managedbuilder.core.ScannerConfigNature vcftools_0.1.11/.settings/0000755000000000000000000000000012163074506014133 5ustar rootrootvcftools_0.1.11/.settings/org.eclipse.ltk.core.refactoring.prefs0000644000000000000000000000020612156354770023434 0ustar rootroot#Sat Oct 03 16:15:16 BST 2009 
eclipse.preferences.version=1 org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false vcftools_0.1.11/.settings/org.eclipse.cdt.managedbuilder.core.prefs0000644000000000000000000000261212156354770024057 0ustar rootrooteclipse.preferences.version=1 environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/CPATH/delimiter=\: environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/CPATH/operation=remove environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/CPLUS_INCLUDE_PATH/delimiter=\: environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/CPLUS_INCLUDE_PATH/operation=remove environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/C_INCLUDE_PATH/delimiter=\: environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/C_INCLUDE_PATH/operation=remove environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/append=true environment/buildEnvironmentInclude/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/appendContributed=true environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/LIBRARY_PATH/delimiter=\: environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/LIBRARY_PATH/operation=remove environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/append=true environment/buildEnvironmentLibrary/cdt.managedbuild.config.gnu.macosx.exe.debug.621972300/appendContributed=true