pax_global_header00006660000000000000000000000064130174732260014517gustar00rootroot0000000000000052 comment=bce105b318c432cc642a2e2f2d3de245bba1bee3 bamclipper-1.0.0/000077500000000000000000000000001301747322600136335ustar00rootroot00000000000000bamclipper-1.0.0/LICENSE000066400000000000000000000020551301747322600146420ustar00rootroot00000000000000MIT License Copyright (c) 2016 Chun Hang Au Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
bamclipper-1.0.0/README.md000066400000000000000000000046541301747322600151230ustar00rootroot00000000000000BAMClipper ========== Remove gene-specific primer sequences from SAM/BAM alignments of PCR amplicons by soft-clipping [Download latest version in a ZIP package](https://github.com/tommyau/bamclipper/zipball/master) ### Dependencies, as tested on 64-bit CentOS 5.5 * [SAMtools](http://www.htslib.org/download/) (version 1.3 tested) * [GNU Parallel](http://www.gnu.org/software/parallel/) (version 20130522 tested) ### Usage `bamclipper.sh` soft-clips gene-specific primers from a BAM alignment file based on the *genomic coordinates* of primer pairs in BEDPE format. >./bamclipper.sh -b _BAM_ -p _BEDPE_ [-n _NTHREAD_] [-s _SAMTOOLS_] [-g _GNUPARALLEL_] [-u _UPSTREAM_] [-d _DOWNSTREAM_] Given a BAM file called **_NAME_.bam**, a new BAM file (**_NAME_.primerclipped.bam**) and its associated index (**_NAME_.primerclipped.bam.bai**) will be generated in the current working directory. _Notes_: For the sake of performance and simplicity, soft-clipping is performed solely based on genomic coordinates without involving the underlying sequence. Reference sequence names and coordinates of BAM and BEDPE are assumed to be derived from identical reference sequences (e.g. hg19). *Required arguments* - **-b** _FILE_: indexed BAM alignment file - **-p** _FILE_: [BEDPE](http://bedtools.readthedocs.io/en/latest/content/general-usage.html#bedpe-format) file of primer pair locations *Options* - **-n** _INT_: number of threads for clipprimer.pl (the workhorse Perl script of BAMClipper) and two samtools sort instances [1] - **-s** _FILE_: path to samtools executable [samtools] - **-g** _FILE_: path to GNU Parallel executable [parallel] - **-u** _INT_: number of nucleotides upstream of the 5'-most nucleotide of the primer (in addition to the 5'-most nucleotide itself) for assigning alignments to primers based on the alignment start position.
[1] - **-d** _INT_: number of nucleotides downstream of the 5'-most nucleotide of the primer (in addition to the 5'-most nucleotide itself) for assigning alignments to primers based on the alignment start position. [5] ### Example ```bash # Clip primers by BAMClipper >./bamclipper.sh -b sample1.bam -p trusight_myeloid.bedpe -n 12 # done! # sample1.primerclipped.bam and its index sample1.primerclipped.bam.bai will be generated. # Show an example line of primer pair BEDPE file >head -1 trusight_myeloid.bedpe chr1 36931667 36931695 chr1 36931911 36931937 ``` Citation -------- Au CH, Ho DN, Kwong A, Chan TL and Ma ESK, 2016. _(submitted)_ bamclipper-1.0.0/bamclipper.sh000077500000000000000000000030551301747322600163130ustar00rootroot00000000000000#!/usr/bin/env bash
# bamclipper.sh - soft-clip primer sequences from an indexed BAM file.
#
# Pipeline: name-sort the BAM, stream it as SAM, insert a "__" record
# separator whenever the read name changes (injectseparator.pl), distribute
# the records to clipprimer.pl through GNU Parallel, then coordinate-sort the
# result into NAME.primerclipped.bam (current working directory) and index it.

# Print the command-line synopsis to stderr and exit with status 1.
usage() {
    echo "Usage: $0 -b BAM -p BEDPE [-n NTHREAD] [-s SAMTOOLS] [-g GNUPARALLEL] [-u UPSTREAM] [-d DOWNSTREAM]" 1>&2
    exit 1
}

# Succeed only when $1 is a plain non-negative decimal integer.
# [[ x -ge y ]] alone would evaluate arbitrary text as an arithmetic
# expression, so numeric options are validated textually first.
is_uint() {
    [[ "$1" =~ ^[0-9]+$ ]]
}

NTHREAD=1
SAMTOOLS="samtools"
PARALLEL="parallel"
UPSTREAM=1
DOWNSTREAM=5

# Leading ':' selects silent error reporting; every option takes one argument.
while getopts ":b:p:n:s:g:u:d:" o; do
    case "${o}" in
        b)
            BAM=${OPTARG}
            BAMbn="$(basename -- "$BAM")"
            [[ -f "$BAM" && -f "$BAM.bai" ]] || usage
            ;;
        p)
            BEDPE=${OPTARG}
            [[ -f "$BEDPE" ]] || usage
            ;;
        n)
            NTHREAD=${OPTARG}
            # 10# forces base 10 so that a value such as "08" is not read as octal.
            is_uint "$NTHREAD" && (( 10#$NTHREAD >= 1 )) || usage
            ;;
        s)
            SAMTOOLS=${OPTARG}
            ;;
        g)
            PARALLEL=${OPTARG}
            ;;
        u)
            UPSTREAM=${OPTARG}
            is_uint "$UPSTREAM" || usage
            ;;
        d)
            DOWNSTREAM=${OPTARG}
            is_uint "$DOWNSTREAM" || usage
            ;;
        *)
            usage
            ;;
    esac
done
shift $((OPTIND-1))

if [ -z "$BAM" ] || [ -z "$BEDPE" ]; then
    usage
fi

SCRIPT_PATH="$(readlink -f -- "$0")"
SCRIPT_DIR="$(dirname -- "$SCRIPT_PATH")"

# GNU Parallel passes this string to a shell, so every word is shell-quoted
# with printf %q; paths containing spaces or metacharacters stay intact.
CLIP_CMD="$(printf '%q ' "$SCRIPT_DIR/clipprimer.pl" --in "$BEDPE" --upstream "$UPSTREAM" --downstream "$DOWNSTREAM")"

# Without pipefail only the exit status of the last pipeline stage is seen,
# so a failing sort/view/clipprimer.pl would still lead to indexing a
# truncated output file.
set -o pipefail

# NOTE: the file argument of the final "index" command is the first token of
# the line that follows this script in the dump.
"$SAMTOOLS" sort -n -T "${BAMbn}.sort1" -@ "$NTHREAD" "$BAM" \
    | "$SAMTOOLS" view -h \
    | "$SCRIPT_DIR"/injectseparator.pl \
    | "$PARALLEL" -j "$NTHREAD" --keep-order --remove-rec-sep --pipe --recend '__\n' --block 1m "$CLIP_CMD" \
    | "$SAMTOOLS" sort -T "${BAMbn}.sort2" -l 0 -@ "$NTHREAD" > "${BAMbn%.bam}.primerclipped.bam" \
    && "$SAMTOOLS" index 
${BAMbn%.bam}.primerclipped.bam bamclipper-1.0.0/clipprimer.pl000077500000000000000000000263471301747322600163550ustar00rootroot00000000000000#!/usr/bin/env perl
# clipprimer.pl - the workhorse of BAMClipper
#
# Reads SAM records grouped by read name (STDIN or files named on the command
# line), assigns each primary mapped alignment to a primer pair from the BEDPE
# file by its start coordinate, soft-clips the primer bases by rewriting POS
# and CIGAR, and prints the resulting SAM records to STDOUT.
#
# Options:
#   --in FILE          BEDPE file of primer pairs (required)
#   --upstream INT     window upstream of the primer 5' end   [1]
#   --downstream INT   window downstream of the primer 5' end [5]
#   --phredoffset INT  accepted for compatibility; not used by the code below
#   --debug            print diagnostic lines (to STDOUT, interleaved with SAM)
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use IO::File;
use List::Util qw(min max);

##
# internal logic below
my $phredoffset = 33;
my $bedpe;
my $debug = 0;
my $window_upstream = 1;
my $window_downstream = 5;
GetOptions(
    "in=s"          => \$bedpe,
    "phredoffset=i" => \$phredoffset,
    "upstream=i"    => \$window_upstream,
    "downstream=i"  => \$window_downstream,
    "debug"         => \$debug,
) or die "ERROR: invalid command-line options";
die "ERROR: --in <BEDPE file> is required" unless defined $bedpe && length $bedpe;

# Lookup tables: chromosome -> 1-based alignment start position -> primer pair.
# "positive" is keyed by the leftmost position of forward-strand alignments,
# "negative" by the rightmost position of reverse-strand alignments.
my %position2amplicon_positive;
my %position2amplicon_negative;

# An explicit read mode makes IO::File open the name literally instead of
# interpreting leading/trailing mode characters in it.
my $bedpe_fh = IO::File->new($bedpe, "r")
    or die "ERROR: failed to open .bedpe panel description file '$bedpe': $!";
BEDPE: while (my $bedpe_line = <$bedpe_fh>) {
    chomp $bedpe_line;
    $bedpe_line =~ s/\r\z//;    # tolerate CRLF line endings
    next BEDPE if length($bedpe_line) == 0 || substr($bedpe_line, 0, 1) eq "#";
    my @fields = split("\t", $bedpe_line);
    die "ERROR: unrecognized format of the .bedpe file provided" if scalar @fields < 6;
    my $name = scalar @fields >= 7 ? $fields[6] : join("-", @fields[0..5]);
    # lookup table from position to amplicon
    # BEDPE starts are 0-based; store 1-based start/end of left and right primers
    my $primers = [$fields[0], $fields[1]+1, $fields[2], $fields[3], $fields[4]+1, $fields[5], $name];
    # forward reads: window around the 1-based first base of the left primer
    foreach my $offset ($window_upstream * -1 .. $window_downstream) {
        $position2amplicon_positive{$fields[0]}{$fields[1]+1+$offset} = $primers;
    }
    # reverse reads: mirrored window around the 1-based last base of the right primer
    foreach my $offset ($window_downstream * -1 .. $window_upstream) {
        $position2amplicon_negative{$fields[3]}{$fields[5]+$offset} = $primers;
    }
}
$bedpe_fh->close;

# All alignments of the current read name are buffered so that mate positions
# (PNEXT) can be fixed once every record of the read has been clipped.
# Each entry is [ \@sam_fields, [ $state ] ]; see process_line_buffer().
my $current_readname = "";
my @line_buffer;
LINE: while (my $original_line = <>) {
    my $line = $original_line;
    chomp($line);
    if (length($line) == 0 || substr($line, 0, 1) eq "@") {
        # SAM header lines (and blank lines) pass through untouched
        print $original_line;
        next LINE;
    }
    my @fields = split("\t", $line);
    if ($fields[0] ne $current_readname && $current_readname ne "") {
        process_line_buffer();
    }
    $current_readname = $fields[0];
    if ($fields[1] & 0x4 || $fields[1] & 0x0100) {
        # unmapped, or not primary alignment
        push @line_buffer, [\@fields, [0]];
        next LINE;
    }
    my ($cigar_total_length, @cigar) = parse_cigar($fields[5]);
    my $rawrefseq = "N" x $cigar_total_length;
    my $readseq = $fields[9];
    my $readqual = $fields[10];
    my $direction = $fields[1] & 0x0010 ? "-" : "+";
    my @reconstruct_alignmentoutput = reconstruct_alignment($cigar_total_length, \@cigar, uc($readseq), $readqual, uc($rawrefseq), $fields[3], $fields[2], $direction);
    my @new_fields = @fields;
    if ($reconstruct_alignmentoutput[6] == 0) {
        # no M in CIGAR at all
        print "CIGAR $reconstruct_alignmentoutput[5] becomes *\n" if $debug;
        $new_fields[5] = "*";
        push @line_buffer, [\@new_fields, [1]];
    }
    elsif ($reconstruct_alignmentoutput[7] == 1) {
        # not trimmed, do nothing at the moment
        push @line_buffer, [\@fields, [0]];
    }
    else {
        # alignment updated
        $new_fields[3] = $reconstruct_alignmentoutput[3];
        $new_fields[5] = $reconstruct_alignmentoutput[5];
        if ($debug) {
            print "to remove NM and MD from updated alignments: ".join(" ", grep {$_ =~ /^(?:NM|MD):/} @new_fields)."\n";
        }
        # NM and MD describe the pre-clipping alignment, so drop them from the
        # optional fields (index 11 onwards); mandatory fields are always kept
        @new_fields = map {$new_fields[$_]} grep {$_ <= 10 || (substr($new_fields[$_],0,5) ne "NM:i:" && substr($new_fields[$_],0,5) ne "MD:Z:")} (0..$#new_fields);
        push @line_buffer, [\@new_fields, [2]];
    }
    if ($debug) {
        print "\@fields\n";
        print join("\t", @fields). "\n";
        print "\@new_fields\n";
        print join("\t", @new_fields). "\n";
        print "\n";
    }
}
if (scalar @line_buffer >= 1) {
    process_line_buffer();
}

# process_line_buffer()
#
# Flushes @line_buffer (all buffered records of one read name) to STDOUT and
# empties it. The per-record state in ->[1]->[0] is:
#   0: print original line
#   1: unmapped after clipping
#   2: clipped
# Records in state 1 get the unmapped flag bit (0x4) set. If at least one
# record was clipped and exactly two paired, mapped, primary alignments are
# buffered, PNEXT of every first-in-pair record is set to POS of the
# second-in-pair primary alignment and vice versa.
sub process_line_buffer {
    foreach my $i (0..$#line_buffer) {
        # unmapped after clipping
        if ($line_buffer[$i]->[1]->[0] == 1 && $line_buffer[$i]->[0]->[5] eq "*") {
            if ($debug) {
                print "unmapped after clipping for read: ".$line_buffer[$i]->[0]->[0]."\n";
                print "original flag: ".$line_buffer[$i]->[0]->[1]."\n";
            }
            $line_buffer[$i]->[0]->[1] = $line_buffer[$i]->[0]->[1] | 4; # turn on unmapped bit 4 (0x4)
            if ($debug) {
                print "modified flag: ".$line_buffer[$i]->[0]->[1]."\n";
            }
        }
    }
    # if >= 1 line is clipped
    #   identify read paired, mapped && primary alignments
    #   save pos for first in pair as X, pos for second in pair as Y
    #   assign PNEXT of every first in pair alignment as Y
    #   assign PNEXT of every second in pair alignment as X
    my @id_of_clipped_lines = grep {$line_buffer[$_]->[1]->[0] == 2} (0..$#line_buffer);
    if (scalar @id_of_clipped_lines >= 1) {
        my @id_of_primary_alignments = grep {($line_buffer[$_]->[0]->[1] & 0x1) && !($line_buffer[$_]->[0]->[1] & 0x4) && !($line_buffer[$_]->[0]->[1] & 0x100)} (0..$#line_buffer);
        if (scalar @id_of_primary_alignments == 2) {
            my $pos_of_first_in_pair;
            my $pos_of_second_in_pair;
            my ($id_a, $id_b) = @id_of_primary_alignments;
            if (($line_buffer[$id_a]->[0]->[1] & 0x40) && ($line_buffer[$id_b]->[0]->[1] & 0x80)) {
                # first in pair, then second in pair
                $pos_of_first_in_pair = $line_buffer[$id_a]->[0]->[3];
                $pos_of_second_in_pair = $line_buffer[$id_b]->[0]->[3];
            }
            elsif (($line_buffer[$id_b]->[0]->[1] & 0x40) && ($line_buffer[$id_a]->[0]->[1] & 0x80)) {
                # second in pair, then first in pair
                $pos_of_first_in_pair = $line_buffer[$id_b]->[0]->[3];
                $pos_of_second_in_pair = $line_buffer[$id_a]->[0]->[3];
            }
            my @id_of_first_in_pair_alignments = grep {$line_buffer[$_]->[0]->[1] & 0x40} (0..$#line_buffer);
            my @id_of_second_in_pair_alignments = grep {$line_buffer[$_]->[0]->[1] & 0x80} (0..$#line_buffer);
            if (defined $pos_of_second_in_pair) {
                # assign PNEXT of every first in pair alignments as $pos_of_second_in_pair
                foreach my $i (@id_of_first_in_pair_alignments) {
                    print "original PNEXT is ".$line_buffer[$i]->[0]->[7]."\n" if $debug;
                    $line_buffer[$i]->[0]->[7] = $pos_of_second_in_pair;
                    print "modified PNEXT is ".$line_buffer[$i]->[0]->[7]."\n" if $debug;
                }
            }
            if (defined $pos_of_first_in_pair) {
                # assign PNEXT of every second in pair alignments as $pos_of_first_in_pair
                foreach my $i (@id_of_second_in_pair_alignments) {
                    print "original PNEXT is ".$line_buffer[$i]->[0]->[7]."\n" if $debug;
                    $line_buffer[$i]->[0]->[7] = $pos_of_first_in_pair;
                    print "modified PNEXT is ".$line_buffer[$i]->[0]->[7]."\n" if $debug;
                }
            }
        }
    }
    print "\@line_buffer\n" if $debug;
    foreach my $entry (@line_buffer) {
        print join("\t", @{$entry->[0]})."\n";
    }
    @line_buffer = ();
    return;
}

# _lookup_primers($table, $chrom, $pos)
#
# Returns the primer-pair record stored in the given position table for
# ($chrom, $pos), or undef when there is none. The chromosome is tested with
# exists first so that lookups never add entries to the table, and an
# undefined position (alignment without reference-consuming bases) is
# rejected instead of being used as a hash key.
sub _lookup_primers {
    my ($table, $chrom, $pos) = @_;
    return unless defined $chrom && defined $pos && exists $table->{$chrom};
    return $table->{$chrom}{$pos};
}

# reconstruct_alignment($cigar_total_length, $cigar_arrayref, $readseq,
#                       $readqual, $refseq, $refpos_start, $chrom,
#                       $alignmentdirection)
#
# Expands the parsed CIGAR into one operation and one reference position per
# alignment column, looks up the primer pair by the alignment's leftmost ("+")
# or rightmost ("-") reference position, and converts columns covered by the
# left primer (5' side) and right primer (3' side) into soft clips.
# $readseq, $readqual and $refseq are accepted but not used.
#
# Returns an 8-element list:
#   0 amplicon name, or the string "undef" when no primer pair matched
#   1 original leftmost reference position   2 original rightmost position
#   3 new leftmost reference position        4 new rightmost position
#   5 collapsed CIGAR string after clipping
#   6 1 if that CIGAR still contains M, = or X, else 0
#   7 1 if no primer pair matched (alignment left as is), else 0
sub reconstruct_alignment {
    my ($cigar_total_length, $cigar_arrayref, $readseq, $readqual, $refseq, $refpos_start, $chrom, $alignmentdirection) = @_;
    my @cigar_op;        # 1-based, first element is not used
    my @cigar_refpos;    # 1-based; "*" for columns that consume no reference (I, S)
    my $cigar_pos_offset = 1;
    my $refpos_pos_offset = $refpos_start;
    foreach my $cigarop (@$cigar_arrayref) {
        my ($len, $op) = @$cigarop;
        my $consumes_reference = ($op ne "I" && $op ne "S");
        foreach my $i ($cigar_pos_offset .. $cigar_pos_offset + $len - 1) {
            $cigar_op[$i] = $op;
            $cigar_refpos[$i] = $consumes_reference ? $refpos_pos_offset + $i - $cigar_pos_offset : "*";
        }
        $refpos_pos_offset += $len if $consumes_reference;
        $cigar_pos_offset += $len;
    }
    if ($debug) {
        print join("\t", map {defined $_ ? $_ : "_"} @cigar_refpos[1..$cigar_total_length])."\n";
        print join("\t", map {defined $_ ? $_ : "_"} @cigar_op[1..$cigar_total_length])."\n";
    }
    my $print_original_line = 0;
    my ($left_alignpos, $right_alignpos) = min_max(grep {defined $_ && $_ ne "*"} @cigar_refpos[1..$cigar_total_length]);
    my $primers;
    if ($alignmentdirection eq "+") {
        $primers = _lookup_primers(\%position2amplicon_positive, $chrom, $left_alignpos);
    }
    elsif ($alignmentdirection eq "-") {
        $primers = _lookup_primers(\%position2amplicon_negative, $chrom, $right_alignpos);
    }
    if (!defined $primers) {
        $print_original_line = 1;
    }
    else {
        my $trim_5prime_up_to;
        my $trim_3prime_up_to;
        # 5' trim up to end position of left primer
        POS: foreach my $i (1..$cigar_total_length) {
            if ($cigar_refpos[$i] ne "*") {
                if ($cigar_refpos[$i] <= $primers->[2]) {
                    # keep last base of primer if there is deletion right after primer
                    $trim_5prime_up_to = $i unless ($i != $cigar_total_length && $cigar_op[$i] eq "M" && $cigar_op[$i+1] eq "D");
                }
                else {
                    last POS;
                }
            }
        }
        if (defined $trim_5prime_up_to) {
            print "trim_5prime_up_to: $trim_5prime_up_to\n" if $debug;
            # deleted columns vanish ("*"); every other column becomes a soft clip
            foreach my $i (1..$trim_5prime_up_to) {
                $cigar_op[$i] = $cigar_op[$i] eq "D" ? "*" : "S";
                $cigar_refpos[$i] = "*";
            }
        }
        # 3' trim up to start position of right primer
        POS: foreach my $i (reverse(1..$cigar_total_length)) {
            if ($cigar_refpos[$i] ne "*") {
                if ($cigar_refpos[$i] >= $primers->[4]) {
                    # keep first base of primer if there is deletion right before primer
                    $trim_3prime_up_to = $i unless ($i != 1 && $cigar_op[$i-1] eq "D" && $cigar_op[$i] eq "M");
                }
                else {
                    last POS;
                }
            }
        }
        if (defined $trim_3prime_up_to) {
            print "trim_3prime_up_to: $trim_3prime_up_to\n" if $debug;
            foreach my $i ($trim_3prime_up_to..$cigar_total_length) {
                $cigar_op[$i] = $cigar_op[$i] eq "D" ? "*" : "S";
                $cigar_refpos[$i] = "*";
            }
        }
        if ($debug) {
            print join("\t", map {defined $_ ? $_ : "_"} @cigar_refpos[1..$cigar_total_length])."\n";
            print join("\t", map {defined $_ ? $_ : "_"} @cigar_op[1..$cigar_total_length])."\n";
        }
    }
    my ($collapsed_cigar, $containM) = collapse_cigar(grep {$_ ne "*"} @cigar_op[1..$cigar_total_length]);
    my ($new_left_alignpos, $new_right_alignpos) = min_max(grep {defined $_ && $_ ne "*"} @cigar_refpos[1..$cigar_total_length]);
    return (defined $primers ? $primers->[6] : "undef", $left_alignpos, $right_alignpos, $new_left_alignpos, $new_right_alignpos, $collapsed_cigar, $containM, $print_original_line);
}

# min_max(@numbers)
#
# Returns (smallest, largest) of the given numbers in a single pass each;
# (undef, undef) for an empty list.
sub min_max {
    return (min(@_), max(@_));
}

# parse_cigar($cigar_string)
#
# Returns ($total_length, [len, op], ...) for the CIGAR string. Hard clips (H)
# are skipped and not counted; N and P operations are not supported and abort
# the program. A string without operations (e.g. "*") yields (0).
sub parse_cigar {
    my ($cigar_string) = @_;
    my @cigar;
    my $total_length = 0;
    while ($cigar_string =~ m/([0-9]+)([MIDNSHPX=])/g) {
        my ($len, $op) = ($1, $2);
        die "ERROR: Unexpected CIGAR operation $op" if $op eq "N" || $op eq "P";
        # skip hard-clipping
        next if $op eq "H";
        push @cigar, [$len, $op];
        $total_length += $len;
    }
    return ($total_length, @cigar);
}

# collapse_cigar(@ops)
#
# Run-length encodes a list of single-column CIGAR operations into a CIGAR
# string. Returns ($cigar_string, $containM) where $containM is 1 when the
# list contains at least one M, = or X operation and 0 otherwise. An empty
# list yields ("", 0).
sub collapse_cigar {
    my $output = "";
    my $current_op = "";
    my $current_len = 0;
    my $containM = 0;
    foreach my $op (@_) {
        if ($op ne $current_op && $current_op ne "") {
            # the run of $current_op has ended: emit it and start counting anew
            $output .= sprintf("%d%s", $current_len, $current_op);
            $current_len = 0;
            $containM = 1 if $current_op eq "M" || $current_op eq "=" || $current_op eq "X";
        }
        $current_op = $op;
        $current_len++;
    }
    if ($current_op ne "") {
        $output .= sprintf("%d%s", $current_len, $current_op);
        $containM = 1 if $current_op eq "M" || $current_op eq "=" || $current_op eq "X";
    }
    return ($output, $containM);
}
bamclipper-1.0.0/injectseparator.pl000077500000000000000000000005741301747322600173760ustar00rootroot00000000000000#!/usr/bin/env perl
# inject separator after all alignment of a read pair
#
# Copies SAM text from STDIN (or files named on the command line) to STDOUT
# and prints a "__" line before the first record of every new read name, so
# that a downstream splitter never separates the alignments of one read name.
# Header lines (starting with "@") are copied as they are.
use strict;
use warnings;
my $readname;
while (my $original_line = <>) {
    if (substr($original_line, 0, 1) ne "@") {
        # only the first column (read name) is needed, so stop splitting after it
        my ($this_readname) = split("\t", $original_line, 2);
        print "__\n" if defined $readname && $readname ne $this_readname;
        $readname = $this_readname;
    }
    print $original_line;
}