pax_global_header00006660000000000000000000000064123176637320014524gustar00rootroot0000000000000052 comment=49aa199775e0d8bc1fee79aec7117a7fde8cb2bf DWGSIM-dwgsim.0.1.11/000077500000000000000000000000001231766373200140675ustar00rootroot00000000000000DWGSIM-dwgsim.0.1.11/.gitmodules000066400000000000000000000001271231766373200162440ustar00rootroot00000000000000[submodule "samtools"] path = samtools url = http://github.com/samtools/samtools.git DWGSIM-dwgsim.0.1.11/INSTALL000066400000000000000000000016031231766373200151200ustar00rootroot00000000000000Welcome to DWGSIM === Prerequisites === If you have just cloned this repository using GIT, you must initialize the SAMtools sub-module: git submodule init git submodule update You should now have the SAMtools-compatible version checked out in the samtools sub-directory. === Building DWGSIM === To build DWGSIM, execute the following commands: make To install DWGSIM, after building, copy the following binaries to their appropriate installation directories: dwgsim dwgsim_eval dwgims_pileup_eval.pl === DWGSIM with Galaxy === DWGSIM can be incorporated into galaxy. NB: this is out-of-date. 1. Copy over the wrapper scripts: cp scripts/galaxy/dwgsim_wrapper.* galaxy-dist/tools/sr_mapping 2. Add to the list of available tools. 2a. Open galaxy-dist/tool_conf.xml 2b. At the end of the NGS: mapping section, add the following: DWGSIM-dwgsim.0.1.11/LICENSE000066400000000000000000000431031231766373200150750ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. DWGSIM-dwgsim.0.1.11/Makefile000066400000000000000000000050401231766373200155260ustar00rootroot00000000000000PACKAGE_VERSION="0.1.11" CC= gcc CFLAGS= -g -Wall -O3 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -DPACKAGE_VERSION=\\\"${PACKAGE_VERSION}\\\" DWGSIM_AOBJS = src/dwgsim_opt.o src/mut.o src/contigs.o src/regions_bed.o \ src/mut_txt.o src/mut_bed.o src/mut_vcf.o src/mut_input.o src/dwgsim.o DWGSIM_EVAL_AOBJS = src/dwgsim_eval.o \ samtools/knetfile.o \ samtools/bgzf.o samtools/kstring.o samtools/bam_aux.o samtools/bam.o samtools/bam_import.o samtools/sam.o samtools/bam_index.o \ samtools/bam_pileup.o samtools/bam_lpileup.o samtools/bam_md.o samtools/razf.o samtools/faidx.o samtools/bedidx.o \ samtools/bam_sort.o samtools/sam_header.o samtools/bam_reheader.o samtools/kprobaln.o samtools/bam_cat.o PROG= dwgsim dwgsim_eval INCLUDES= -I. SUBDIRS= samtools . CLEAN_SUBDIRS= samtools src LIBPATH= .SUFFIXES:.c .o .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all-recur lib-recur clean-recur cleanlocal-recur install-recur: @target=`echo $@ | sed s/-recur//`; \ wdir=`pwd`; \ list='$(SUBDIRS)'; for subdir in $$list; do \ cd $$subdir; \ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ cd $$wdir; \ done; all:$(PROG) .PHONY:all lib clean cleanlocal .PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur dwgsim:lib-recur $(DWGSIM_AOBJS) $(CC) $(CFLAGS) -o $@ $(DWGSIM_AOBJS) -lm -lz -lpthread dwgsim_eval:lib-recur $(DWGSIM_EVAL_AOBJS) $(CC) $(CFLAGS) -o $@ $(DWGSIM_EVAL_AOBJS) -Lsamtools -lm -lz -lpthread cleanlocal: rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \ wdir=`pwd`; \ list='$(CLEAN_SUBDIRS)'; for subdir in $$list; do \ if [ -d $$subdir ]; then \ cd $$subdir; \ pwd; \ rm -vfr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib; \ cd $$wdir; \ fi; \ done; clean:cleanlocal-recur dist:clean if [ -f dwgsim-${PACKAGE_VERSION}.tar.gz ]; then \ rm -rv dwgsim-${PACKAGE_VERSION}.tar.gz; \ fi; \ if [ -f dwgsim-${PACKAGE_VERSION}.tar ]; then \ rm -rv dwgsim-${PACKAGE_VERSION}.tar; \ fi; \ if [ -d dwgsim-${PACKAGE_VERSION} ]; then \ rm -rv dwgsim-${PACKAGE_VERSION}; \ fi; \ mkdir dwgsim-${PACKAGE_VERSION}; \ cp -r INSTALL LICENSE Makefile README scripts src dwgsim-${PACKAGE_VERSION}/.; \ tar -vcf dwgsim-${PACKAGE_VERSION}.tar dwgsim-${PACKAGE_VERSION}; \ gzip -9 dwgsim-${PACKAGE_VERSION}.tar; \ rm -rv dwgsim-${PACKAGE_VERSION}; DWGSIM-dwgsim.0.1.11/README000066400000000000000000000005131231766373200147460ustar00rootroot00000000000000Welcome to DWGSIM. Please see the file LICENSE for details. Please see the file INSTALL for installation instructions; This software package has limited support since it is in active development. Please see the following page for more details: https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation DWGSIM-dwgsim.0.1.11/samtools/000077500000000000000000000000001231766373200157305ustar00rootroot00000000000000DWGSIM-dwgsim.0.1.11/scripts/000077500000000000000000000000001231766373200155565ustar00rootroot00000000000000DWGSIM-dwgsim.0.1.11/scripts/dwgsim_eval_plot.py000066400000000000000000000105041231766373200214670ustar00rootroot00000000000000#!/usr/bin/env python import os import sys from optparse import OptionParser import re import string import pylab class Table: def __init__(self, fn, min_mapq): ncol = -1 # read in the data data = list() fh = open(fn, 'r') for line in fh: line = line.rstrip("\r\n") if '#' == line[0]: continue else: table = self.__parse(line) if table[0] < min_mapq: continue data.append(table) if -1 == ncol: ncol = len(table) elif ncol != len(table): sys.stderr.write("Error: table was misformatted\n") sys.exit(1) fh.close() # re-arrange the data self.matrix = [[data[i][j] for j in xrange(ncol)] for i in xrange(len(data))] # NB: assumes sorted descending def __parse(self, line): tokens = re.findall(r'[\w+-.]+', line) for i in range(0, 13): tokens[i] = int(tokens[i]) for i in range(13, len(tokens)): tokens[i] = float(tokens[i]) return tokens def main(options): fmts = ['-', '--', '-.', ':', '.'] if None == options.fns or 0 == len(options.fns): return # read in through the files fmts_i = 0 for fn in options.fns: table = Table(fn, options.min_mapq) # extract sensitivity x = [table.matrix[i][17] for i in range(len(table.matrix))] # extract specificity y = [table.matrix[i][16] for i in range(len(table.matrix))] pylab.plot(x, y, fmts[fmts_i]) fmts_i = fmts_i + 1 if len(fmts) <= fmts_i: fmts_i = 0 # plot values pylab.title('DWGSIM ROC') pylab.xlabel('specificity') pylab.ylabel('sensitivity') # xlim if None != options.xlim: nums = re.findall(r'\d+\.?\d*', options.xlim) if 2 == len(nums): pylab.xlim(xmin=float(nums[0]), xmax=float(nums[1])) # ylim if None != options.ylim: nums = re.findall(r'\d+\.?\d*', options.ylim) if 2 == len(nums): pylab.ylim(ymin=float(nums[0]), ymax=float(nums[1])) # legend if not options.no_legend: if None != options.ids and len(options.fns) == len(options.ids): pylab.legend(options.ids, title='IDs', loc='lower left') elif options.infer_ids: ids = list() for fn in options.fns: i = re.sub(r'^.*\/', '', fn) i = re.sub(r'\.sam\.txt$', '', i) i = re.sub(r'^out', '', i) i = re.sub(r'^dwgsim_eval', '', i) i = re.sub(r'^\.', '', i) ids.append(i) pylab.legend(ids, title='IDs', loc='lower left') # show the plot if None == options.out: pylab.show() else: # check if the file extension is present, if not, add it if len(options.out) <= len(options.outtype) or options.outtype != options.out[len(options.out)-len(options.outtype):]: pylab.savefig("%s.%s" % (options.out, options.outtype), format=options.outtype) else: pylab.savefig(options.out, format=options.outtype) pylab.close() if __name__ == '__main__': parser = OptionParser() parser.add_option('--fn', help="an output file from dwgsim_eval", action="append", dest="fns") parser.add_option('--id', help="the corresponding ID to add to the plot legened", action="append", dest="ids") parser.add_option('--infer-ids', help="infer the ids from the file names", action="store_true", default=True, dest="infer_ids") parser.add_option('--xlim', help="the x-axis range", dest="xlim") parser.add_option('--ylim', help="the y-axis range", dest="ylim") parser.add_option('--out', help="the output file", dest="out") parser.add_option('--outtype', help="the output file type (ex. pdf, png)", dest="outtype", default="pdf") parser.add_option('--min-mapq', help="minimum mapping quality (inclusive)", dest="min_mapq", type=int, default=0) parser.add_option('--no-legend', help="do not add a legend to the plot", dest="no_legend", action="store_true", default=False) if len(sys.argv[1:]) < 1: parser.print_help() else: options, args = parser.parse_args() main(options) DWGSIM-dwgsim.0.1.11/scripts/dwgsim_mut_to_vcf.pl000066400000000000000000000046301231766373200216350ustar00rootroot00000000000000#!/bin/perl use strict; use warnings; # TODO: # getopt # better messaging # usage # pod... my $fn = shift || die "No input file"; my %to_alt = ( "AR" => "G", "GR" => "A", "CY" => "T", "TY" => "C", "CS" => "G", "GS" => "C", "AW" => "T", "TW" => "A", "GK" => "T", "TK" => "G", "AM" => "C", "CM" => "A" ); open(FH, "$fn") || die "Could not open '$fn' for reading.\n"; print STDOUT "##fileformat=VCFv4.1\n"; print STDOUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample\n"; my $type = -1; # 1 - SUB, 2 - INS, 3 - DEL my $prev_name = -1; my $prev_pos = -1; my $del_n = 0; my $del_name = ""; my $del_pos = -1; my $del_ref = ""; my $del_s = -1; my $line_n = 0; while(defined(my $line = )) { if($line =~ m/^(.+)\t(\d+)\t(.+)\t(.+)\t(\d+)$/) { $line_n++; my $name = $1; my $pos = $2; my $ref = $3; my $alt = $4; my $s = $5; if($prev_name ne $name) { if(0 < $del_n) { printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n", $del_name, $del_pos, $del_ref, $del_s); } # reset $prev_pos = -1; $del_n = 0; $del_s = -1; $del_name = ""; $del_pos = -1; $del_ref = ""; } if("-" eq $alt) { # DEL if(0 < $del_n && $del_pos + $del_n == $pos && $s == $del_s) { $del_n++; $del_ref .= $ref; } else { if(0 < $del_n) { printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n", $del_name, $del_pos, $del_ref, $del_s); } $del_n = 1; $del_name = $name; $del_pos = $pos; $del_ref = $ref; $del_s = $s; } } else { # print deletion, if it exists if(0 < $del_n) { printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n", $del_name, $del_pos, $del_ref, $s); # reset $prev_pos = -1; $del_n = 0; $del_s = -1; $del_name = ""; $del_pos = -1; $del_ref = ""; } if("-" eq $ref) { # INS printf(STDOUT "%s\t%d\t.\t.\t%s\t100\tPASS\tpl=%d\n", $name, $pos, $alt, $s); } else { # SUB if(3 != $s) { if(!defined($to_alt{"$ref"."$alt"})) { die "Could not convert $ref$alt"." on $line_n.\n"; } $alt = $to_alt{"$ref"."$alt"}; } printf(STDOUT "%s\t%d\t.\t%s\t%s\t100\tPASS\tpl=%d\n", $name, $pos, $ref, $alt, $s); } } $prev_name = $name; $prev_pos = $pos; } else { die "Input file is not in the proper format.\n"; } } # print deletion, if it exists if(0 < $del_n) { printf(STDOUT "%s\t%d\t.\t%s\t.\t100\tPASS\tpl=%d\n", $del_name, $del_pos, $del_ref, $del_s); } close(FH); DWGSIM-dwgsim.0.1.11/scripts/dwgsim_pileup_eval.pl000066400000000000000000000125721231766373200220010ustar00rootroot00000000000000#!/usr/bin/perl -w # Copied from SAMtools (http://samtools.sourceforge.net) # This perl code is very cryptic. Evaluates the fraction # of reads correctly aligned at various mapping quality # thresholds. use strict; use warnings; use Getopt::Std; select STDERR; $| = 1; # make STDERR unbuffered &dwgsim_eval; exit; sub dwgsim_eval { my %opts = (g=>5, n=>0); getopts('bpc:gn:', \%opts); print_usage() if (@ARGV == 0 && -t STDIN); my ($flag, $m, $ctr) = (0, 0, 0); my $gap = $opts{g}; my $bwa = (defined $opts{b}) ? 1 : 0; $flag |= 1 if (defined $opts{p}); $flag |= 2 if (defined $opts{c}); my $n = 2*$opts{n}; my @a = (); my $prev_read_name = "ZZZ"; my @reads = (); my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0); print STDERR "Analyzing...\nCurrently on:\n0"; while (<>) { $ctr++; if(0 == ($ctr % 10000)) { print STDERR "\r$ctr"; } next if (/^\@/); my @t = split("\t"); next if (@t < 11); my $sam = $_; my $cur_read_name = $t[0]; if($prev_read_name ne $cur_read_name) { # process the read(s) die unless (0 == length($prev_read_name) || 0 < scalar(@reads)); # go through each end # want to know # - is the correct location represented in each end? # - if so, does the correct location have the best alignment score? # - what's the combined alignment score of the best pair? @a = process_reads(\@reads, $flag, $gap, $bwa); $found_correct_1 += $a[0]; $found_correct_2 += $a[1]; $is_correct_1 += $a[2]; $is_correct_2 += $a[3]; $found_pair_correct += $a[4]; $is_pair_correct += $a[5]; $prev_read_name = $cur_read_name; @reads = (); $m++; } # store the read push(@reads, $sam); #print STDERR $line if (($flag&1) && !$is_correct && $q > 0); } @a = process_reads(\@reads, $flag, $gap, $bwa); $found_correct_1 += $a[0]; $found_correct_2 += $a[1]; $is_correct_1 += $a[2]; $is_correct_2 += $a[3]; $found_pair_correct += $a[4]; $is_pair_correct += $a[5]; print STDERR "\r$ctr\n"; printf(STDERR "%d\t%d\t%d\t%d\t%d\t%d\t%d\n", $m, $found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct); print STDERR "Analysis complete.\n"; } sub print_usage { print STDERR "Usage: dwgsim_eval.pl [-pcnd] [-g GAP] \n"; print STDERR "\t\t-p\t\tprint incorrect alignments\n"; print STDERR "\t\t-c\t\tcolor space alignments\n"; print STDERR "\t\t-n\tINT\tnumber of raw input paired-end reads\n"; print STDERR "\t\t-d\tINT\tdivide quality by this factor\n"; print STDERR "\t\t-i\t\tprint only alignments with indels\n"; print STDERR "\t\t-b\t\talignments are from BWA\n"; exit(1); } sub swap { my ($a, $b) = @_; return ($b, $a); } sub process_reads { my ($reads, $flag, $gap, $bwa) = @_; my ($found_correct_1, $found_correct_2, $is_correct_1, $is_correct_2, $found_pair_correct, $is_pair_correct) = (0, 0, 0, 0, 0, 0); my ($correct_index_1, $correct_index_2, $max_AS_index_1, $max_AS_index_2, $max_AS_1, $max_AS_2) = (-1, -1, -1, -1, -1000000, -1000000); # go through reads for(my $i=0;$i[$i]; my @t = split(/\t/, $line); die if (@t < 11); my ($q, $chr, $left, $rght) = ($t[4], $t[2], $t[3], $t[3]); # right coordinate $_ = $t[5]; s/(\d+)[MDN]/$rght+=$1,'x'/eg; --$rght; # skip unmapped reads next if (($t[1]&0x4) || $chr eq '*'); # parse read name if($t[0] =~ m/^(\S+)_(\d+)_(\d+)_(\d)_(\d)_(\d+):(\d+):(\d+)_(\d+):(\d+):(\d+)_(\S+)/) { my ($o_chr, $pos_1, $pos_2, $str_1, $str_2) = ($1, $2, $3, $4, $5); my ($n_err_1, $n_sub_1, $n_indel_1, $n_err_2, $n_sub_2, $n_indel_2) = ($6, $7, $8, $9, $10, $11); my $end = 2; if(($t[1]&0x40)) { # read #1 $end = 1; } # check alignment score for(my $j=11;$j= 0: stdout = line.strip() break if stdout: sys.stdout.write( '%s\n' % stdout ) else: raise Exception except: sys.stdout.write( 'Could not determine DWGSIM_EVAL version\n' ) buffsize = 1048576 # make temp directory for dwgsim, requires trailing slash tmp_dir = '%s/' % tempfile.mkdtemp() #'generic' options used in all dwgsim commands here try: tmp_dir = '%s/' % tempfile.mkdtemp() dwgsim_eval_cmd = 'dwgsim_eval' if True == options.alignmentScore: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -a' if True == options.bwa: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -b' if True == options.colorSpace: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -c' use_p = False if 0 <= options.numSnps: use_p = True dwgsim_eval_cmd = dwgsim_eval_cmd + (' -s %s' % options.numSnps) if 0 <= options.numErrors: use_p = True dwgsim_eval_cmd = dwgsim_eval_cmd + (' -e %s' % options.numErrors) if True == options.indels: use_p = True dwgsim_eval_cmd = dwgsim_eval_cmd + ' -i' if True == use_p or True == options.printIncorrect: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -p' if True == options.singleEnd: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -z' dwgsim_eval_cmd = '%s -d %s -g %s -n %s -q %s' % (dwgsim_eval_cmd, \ options.scoreFactor, \ options.wiggle, \ options.numReads, \ options.minMapq) if None != options.sam: dwgsim_eval_cmd = dwgsim_eval_cmd + ' -S ' + options.sam elif None != options.bam: dwgsim_eval_cmd = dwgsim_eval_cmd + ' ' + options.bam else: raise Exception, 'Input file was neither a SAM nor BAM' dwgsim_eval_cmd = dwgsim_eval_cmd + ' > ' + options.output # need to nest try-except in try-finally to handle 2.4 try: # dwgsim run_process ( dwgsim_eval_cmd, 'dwgsim', tmp_dir, buffsize ) # check that there are results in the output file check_output ( options.output, False ) sys.stdout.write( 'DWGSIM_EVAL successful' ) except Exception, e: stop_err( 'DWGSIM_EVAL failed.\n' + str( e ) ) finally: # clean up temp dir if os.path.exists( tmp_dir ): shutil.rmtree( tmp_dir ) if __name__=="__main__": __main__() DWGSIM-dwgsim.0.1.11/scripts/galaxy/dwgsim_eval_wrapper.xml000066400000000000000000000053721231766373200236350ustar00rootroot00000000000000 from a SAM/BAM file using dwgsim_eval if 'sam' == input.ext: dwgsim_wrapper.py \ $alignmentScore \ $bwa \ $colorSpace \ -d $scoreFactor \ -g $wiggle \ -n $numReads \ -q $minMapq \ $singleEnd \ $printIncorrect \ -s $numSnps \ -e $numErrors \ $indels \ -s $input \ -o $output else: dwgsim_wrapper.py \ $alignmentScore \ $bwa \ $colorSpace \ -d $scoreFactor \ -g $wiggle \ -n $numReads \ -q $minMapq \ $singleEnd \ $printIncorrect \ -s $numSnps \ -e $numErrors \ $indels \ -b $input \ -o $output This tool evaluates simulated reads from DWGSIM. For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation#Evaluating_mapping_-_dwgsim_eval DWGSIM-dwgsim.0.1.11/scripts/galaxy/dwgsim_wrapper.py000066400000000000000000000224131231766373200224510ustar00rootroot00000000000000#!/usr/bin/env python """ Runs DWGSIM usage: dwgsim_wrapper.py [options] -e,--eRateFirstread=e: base/color error rate of the first read [0.020] -E,--eRateSecondread=E: base/color error rate of the second read [0.020] -d,--innerDist=d: inner distance between the two ends [500] -s,--stdev=s: standard deviation [50] -N,--numReads=N: number of read pairs [1000000] -1,--loFirstread=1: length of the first read [70] -2,--loSecondread=2: length of the second read [70] -r,--mutRate=r: rate of mutations [0.0010] -R,--fracIndels=R: fraction of mutations that are indels [0.10] -X,--indelExt=X: probability an indel is extended [0.30] -y,--randProb=y: probability of a random DNA read [0.10] -n,--maxN=n: maximum number of Ns allowed in a given read [0] -c,--platformType=c: generate reads for Illumina (nuc space), for SOLiD (in color space) or Ion Torrent -S,--strand=S: strand 0: default, 1: same strand, 2: opposite strand -f,--flowOrder=f: the flow order for Ion Torrent data [(null)] -O,--splitOutput=O: which output needs to be reported, one: one single; two: two files; all: all output options -H,--haploid=H: haploid mode -i,--input=i: the reference genome FASTA -3,--outputBFAST=3: the BFAST output FASTQ -4,--outputBWA1=4: the first BWA output FASTQ -5,--outputBWA2=5: the second BWA output FASTQ -6,--outputMutations=6: the output mutations TXT """ import optparse, os, shutil, subprocess, sys, tempfile def stop_err( msg ): sys.stderr.write( '%s\n' % msg ) sys.exit() def run_process ( cmd, name, tmp_dir, buffsize ): try: tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name tmp_stderr = open( tmp, 'wb' ) proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) returncode = proc.wait() tmp_stderr.close() # get stderr, allowing for case where it's very large tmp_stderr = open( tmp, 'rb' ) stderr = '' try: while True: stderr += tmp_stderr.read( buffsize ) if not stderr or len( stderr ) % buffsize != 0: break except OverflowError: pass tmp_stderr.close() if returncode != 0: raise Exception, stderr except Exception, e: raise Exception, 'Error in \'' + name + '\'. \n' + str( e ) def check_output ( output, canBeEmpty ): if 0 < os.path.getsize( output ): return True elif False == canBeEmpty: raise Exception, 'The output file is empty:' + output def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option( '-e', '--eRateFirstread', dest='eRateFirstread', type='float', help='base/color error rate of the first read' ) parser.add_option( '-E', '--eRateSecondread', dest='eRateSecondread', type='float', help='base/color error rate of the second read' ) parser.add_option( '-d', '--innerDist', dest='innerDist', type='int', help='inner distance between the two ends' ) parser.add_option( '-s', '--stdev', dest='stdev', type='float', help='standard deviation' ) parser.add_option( '-N', '--numReads', dest='numReads', type='int', help='number of read pairs' ) parser.add_option( '-r', '--mutRate', dest='mutRate', type='float', help='rate of mutations' ) parser.add_option( '-R', '--fracIndels', dest='fracIndels', type='float', help='fraction of mutations that are indels' ) parser.add_option( '-X', '--indelExt', dest='indelExt', type='float', help='probability an indel is extended' ) parser.add_option( '-y', '--randProb', dest='randProb', type='float', help='probability of a random DNA read' ) parser.add_option( '-n', '--maxN', dest='maxN', type='int', help='maximum number of Ns allowed in a given read' ) parser.add_option( '-c', '--platformType', dest='platformType', type='int', help='Platform to simulate: 0: Illumina; 1: solid; 2: Ion Torrent' ) parser.add_option( '-S', '--strand', dest='strand', type='choice', default='0', choices=('0', '1', '2'), help='strand 0: default, 1: same strand, 2: opposite strand' ) parser.add_option( '-f', '--flowOrder', dest='flowOrder', type='int', default='2') parser.add_option( '-H', '--haploid', action='store_true', dest='haploid', default=False, help='haploid mode' ) parser.add_option( '-i', '--input', dest='input', help='The reference genome fasta' ) parser.add_option( '-O', dest='splitoutput', type='choice', default='two',choices=('two', 'one', 'all'), help='one: one single; two: two files; all: all output options' ) parser.add_option( '-1', '--loFirstread', dest='loFirstread', type='int', help='length of the first read' ) parser.add_option( '-2', '--loSecondread', dest='loSecondread', type='int', help='length of the second read' ) parser.add_option( '-6', '--outputMutations', dest='outputMutations', help='the output mutations TXT' ) parser.add_option( '-4', '--outputBWA1', dest='outputBWA1', help='the first BWA output FASTQ' ) parser.add_option( '-5', '--outputBWA2', dest='outputBWA2', help='the second BWA output FASTQ' ) parser.add_option( '-3', '--outputBFAST', dest='outputBFAST', help='the BFAST output FASTQ' ) (options, args) = parser.parse_args() # output version # of tool try: tmp = tempfile.NamedTemporaryFile().name tmp_stdout = open( tmp, 'wb' ) proc = subprocess.Popen( args='dwgsim 2>&1', shell=True, stdout=tmp_stdout ) tmp_stdout.close() returncode = proc.wait() stdout = None for line in open( tmp_stdout.name, 'rb' ): if line.lower().find( 'version' ) >= 0: stdout = line.strip() break if stdout: sys.stdout.write( '%s\n' % stdout ) else: raise Exception except: sys.stdout.write( 'Could not determine DWGSIM version\n' ) buffsize = 1048576 # make temp directory for dwgsim, requires trailing slash tmp_dir = '%s/' % tempfile.mkdtemp() #'generic' options used in all dwgsim commands here try: reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name os.symlink( options.input, reference_filepath ) assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.' tmp_dir = '%s/' % tempfile.mkdtemp() dwgsim_output_prefix = "dwgsim_output_prefix" dwgsim_cmd = 'dwgsim -e %s -E %s -d %s -s %s -N %s -1 %s -2 %s -r %s -R %s -X %s -y %s -c %s -n %s' % \ (options.eRateFirstread, \ options.eRateSecondread, \ options.innerDist, \ options.stdev, \ options.numReads, \ options.loFirstread, \ options.loSecondread, \ options.mutRate, \ options.fracIndels, \ options.indelExt, \ options.randProb, \ options.platformType, \ options.maxN) if options.haploid: dwgsim_cmd = dwgsim_cmd + ' -H' if options.platformType == "2": dwgsim_cmd = dwgsim_cmd + options.flowOrder dwgsim_cmd = dwgsim_cmd + ' ' + options.input dwgsim_cmd = dwgsim_cmd + ' ' + tmp_dir + dwgsim_output_prefix # temporary file to check the command being launched tmpfile = open('/tmp/dwgsim.log', 'w') tmpfile.write("%s" % dwgsim_cmd) # need to nest try-except in try-finally to handle 2.4 try: # launch dwgsim + accept mutations.txt output run_process ( dwgsim_cmd, 'dwgsim', tmp_dir, buffsize ) cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.mutations.txt' + ' ' + options.outputMutations run_process ( cmd, 'mv #1', tmp_dir, buffsize ) check_output ( options.outputMutations, True ) # check which output required and move files accordingly tmpfile.write("\nsplitoutput: %s \ncondition one: %s \ncondition two: %s \ncondition all: %s\n" % (options.splitoutput, str((options.splitoutput == 'one')), str((options.splitoutput == 'two')), str((options.splitoutput == 'all')))) if any( [options.splitoutput == 'two' , options.splitoutput == 'all'] ) : # two files cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read1.fastq' + ' ' + options.outputBWA1 run_process ( cmd, 'mv #3', tmp_dir, buffsize ) cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bwa.read2.fastq' + ' ' + options.outputBWA2 run_process ( cmd, 'mv #4', tmp_dir, buffsize ) tmpfile.write("\nmoved!!: %s" % options.splitoutput) # check that there are results in the output file check_output ( options.outputBWA1, False ) check_output ( options.outputBWA2, False ) if any( [options.splitoutput == 'one' , options.splitoutput == 'all'] ): cmd = 'mv ' + tmp_dir + '/' + dwgsim_output_prefix + '.bfast.fastq' + ' ' + options.outputBFAST run_process ( cmd, 'mv #2', tmp_dir, buffsize ) # check that there are results in the output file check_output ( options.outputBFAST, False ) sys.stdout.write( 'DWGSIM successful' ) except Exception, e: stop_err( 'DWGSIM failed.\n' + str( e ) ) finally: # clean up temp dir if os.path.exists( tmp_dir ): shutil.rmtree( tmp_dir ) if __name__=="__main__": __main__() DWGSIM-dwgsim.0.1.11/scripts/galaxy/dwgsim_wrapper.xml000066400000000000000000000114751231766373200226270ustar00rootroot00000000000000 from a fasta sequence using dwgsim #if $outputFormat.splitOutput == "two" #dwgsim_wrapper.py \ -4 $outputPEFirst \ -5 $outputPESecond \ #else if $outputFormat.splitOutput == "one" #dwgsim_wrapper.py \ -3 $outputSingleReads \ #else #dwgsim_wrapper.py \ -3 $outputSingleReads \ -4 $outputPEFirst \ -5 $outputPESecond \ #end if -6 $outputMutations \ -e $eRateFirstread \ -E $eRateSecondread \ -d $innerDist \ -s $stDev \ -N $noReadpairs \ -1 $loFirstread \ -2 $loSecondread \ -r $rateOfMut \ -R $rateOfIndels \ -X $probExtRead \ -y $probRandRead \ -n $maxN \ -S $strandDirection \ -c $platform.platformType \ #if $platform.platformType == "2" # -f $platform.flowOrder #end if $haploMode \ -O $outputFormat.splitOutput \ -i $input \ (outputFormat['splitOutput'] == 'one') | (outputFormat['splitOutput'] == 'all') (outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all') (outputFormat['splitOutput'] == 'two') | (outputFormat['splitOutput'] == 'all') This tool simulate reads from a given fasta file. For more information, please see https://sourceforge.net/apps/mediawiki/dnaa/index.php?title=Whole_Genome_Simulation DWGSIM-dwgsim.0.1.11/src/000077500000000000000000000000001231766373200146565ustar00rootroot00000000000000DWGSIM-dwgsim.0.1.11/src/contigs.c000066400000000000000000000032361231766373200164740ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include "contigs.h" contigs_t* contigs_init() { return calloc(1, sizeof(contigs_t)); } void contigs_add(contigs_t *c, char *name, uint32_t len) { c->n++; c->contigs = realloc(c->contigs, c->n * sizeof(contig_t)); c->contigs[c->n-1].name = strdup(name); c->contigs[c->n-1].len = len; } void contigs_destroy(contigs_t *c) { int32_t i; for(i=0;in;i++) { free(c->contigs[i].name); } free(c->contigs); free(c); } DWGSIM-dwgsim.0.1.11/src/contigs.h000066400000000000000000000004611231766373200164760ustar00rootroot00000000000000#ifndef CONTIGS_H_ #define CONTIGS_H_ typedef struct { char *name; int32_t len; } contig_t; typedef struct { contig_t *contigs; int32_t n; } contigs_t; contigs_t* contigs_init(); void contigs_add(contigs_t *c, char *name, uint32_t len); void contigs_destroy(contigs_t *c); #endif DWGSIM-dwgsim.0.1.11/src/dwgsim.c000066400000000000000000001273231231766373200163240ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Original Contact: Heng Li */ /* This program is separated from maq's read simulator with Colin * Hercus' modification to allow longer indels. Colin is the chief * developer of novoalign. */ /* This program was modified by Nils Homer for inclusion in the * DNAA package */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "mut.h" #include "mut_txt.h" #include "mut_bed.h" #include "regions_bed.h" #include "dwgsim_opt.h" #include "dwgsim.h" //#include uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; #define __gen_read(x, start, iter) do { \ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ mut_t c = currseq->s[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ ext_coor[x] = i; \ if(1 == strand[x]) ext_coor[x] -= s[x]-1; \ } \ if (mut_type == DELETE) { \ ++n_indel[x]; \ if(1 == strand[x]) ext_coor[x]--; \ if(0 == k) n_indel_first[x]++; \ } \ else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ tmp_seq[x][k++] = c & 0xf; \ if (mut_type == SUBSTITUTE) { \ ++n_sub[x]; \ if(0 == k) n_sub_first[x]++; \ } \ } else { \ mut_t n, ins; \ assert(mut_type == INSERT); \ ++n_indel[x]; \ n_indel_first[x]++; \ if(1 == mut_get_ins(currseq, i, &n, &ins)) { \ if(0 == strand[x]) { \ while(n > 0 && k < s[x]) { \ tmp_seq[x][k++] = ins & 0x3; \ --n, ins >>= 2; \ } \ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \ } else { \ tmp_seq[x][k++] = c & 0xf; \ while(n > 0 && k < s[x]) { \ ext_coor[x]++; \ tmp_seq[x][k++] = (ins >> ((n-1) << 1) & 0x3); \ --n; \ } \ } \ } else { \ int32_t byte_index, bit_index; \ uint32_t num_ins; \ uint8_t *insertion = NULL; \ insertion = mut_get_ins_long_n(currseq->ins[ins], &num_ins); \ if(0 == strand[x]) { \ byte_index = mut_packed_len(num_ins) - 1; bit_index = 3 - (num_ins & 3); \ while(num_ins > 0 && k < s[x]) { \ assert(0 <= byte_index); \ tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3; \ --num_ins; \ bit_index--; \ if (bit_index < 0) { \ bit_index = 3; \ byte_index--; \ } \ } \ if(k < s[x]) tmp_seq[x][k++] = c & 0xf; \ } else { \ tmp_seq[x][k++] = c & 0xf; \ byte_index = 0; bit_index = 0; \ while(num_ins > 0 && k < s[x]) { \ ext_coor[x]++; \ tmp_seq[x][k++] = (insertion[byte_index] >> (bit_index << 1)) & 0x3; \ --num_ins; \ bit_index++; \ if (4 == bit_index) { \ bit_index = 0; \ byte_index++; \ } \ } \ } \ } \ } \ } \ if (k != s[x]) ext_coor[x] = -10; \ if (1 == strand[x]) { \ for (k = 0; k < s[x]; ++k) tmp_seq[x][k] = tmp_seq[x][k] < 4? 3 - tmp_seq[x][k] : 4; \ } \ } while (0) /* Simple normal random number generator, copied from genran.c */ double ran_normal() { static int iset = 0; static double gset; double fac, rsq, v1, v2; if (iset == 0) { do { v1 = 2.0 * drand48() - 1.0; v2 = 2.0 * drand48() - 1.0; rsq = v1 * v1 + v2 * v2; } while (rsq >= 1.0 || rsq == 0.0); fac = sqrt(-2.0 * log(rsq) / rsq); gset = v1 * fac; iset = 1; return v2 * fac; } else { iset = 0; return gset; } } int32_t ran_num(double prob, int32_t n) { int32_t i, r; for(i = r = 0; i < n; i++) { if(drand48() < prob) r++; } return r; } int32_t get_muttype(char *str) { int32_t i; for(i=0;i= 4) c = 4; \ else if(drand48() < e[_j]->start + e[_j]->by*i) { \ c = (c + (mut_t)(drand48() * 3.0 + 1)) & 3; \ ++n_err[_j]; \ if(0 == i) ++n_err_first[_j]; \ } \ _cur_seq[_j][i] = c; \ } \ } while(0) int32_t generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err) { int32_t i, j, k, hp_l, flow_i, n_err; uint8_t prev_c, c; // remove Ns for(i=0;i>1;i++) { c = (*seq)[i]; (*seq)[i] = (*seq)[len-i-1]; (*seq)[len-i-1] = c; } } // skip forward to the first flow for(i=0;iflow_order_len;i++) { c = (4 <= (*seq)[0]) ? 0 : (*seq)[0]; if(c == opt->flow_order[i]) { break; } (*mask)[i] = 0; } assert(opt->flow_order_len != i); flow_i = i; prev_c = 4; for(i=0;iflow_order[flow_i]) { // skip paste empty flows (*mask)[flow_i] = 0; flow_i = (flow_i + 1) % opt->flow_order_len; } if(prev_c != c) { // new hp (*mask)[flow_i] = 0; n_err = 0; while(drand48() < e) { // how many bases should we insert/delete n_err++; } if(0 < n_err) { if(drand48() < 0.5) { // insert // more memory while((*mem) <= len + n_err) { (*mem) <<= 1; // double (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem)); (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem)); } // shift up for(j=len-1;i<=j;j--) { (*seq)[j+n_err] = (*seq)[j]; } for(j=i;jflow_order[(flow_i + j) % opt->flow_order_len]) { // skip paste empty flows j++; } assert(0 < j); // pick one to fill in k = (int)(drand48() * j); // shift up for(j=len-1;i<=j;j--) { (*seq)[j+1] = (*seq)[j]; } // add the filled in base (*seq)[i] = opt->flow_order[(flow_i + k) % opt->flow_order_len]; len++; } } (*_n_err) += n_err; } prev_c = c; } } // second pass: add empty flows prev_c = 4; for(i=0;iflow_order[flow_i]) { n_err = 0; while(drand48() < e) { n_err++; } if(0 == (*mask)[flow_i] && 0 < n_err) { // insert // more memory while((*mem) <= len + n_err) { (*mem) <<= 1; // double (*seq) = realloc((*seq), sizeof(uint8_t) * (*mem)); (*mask) = realloc((*mask), sizeof(uint8_t) * (*mem)); } // shift up for(j=len-1;i<=j;j--) { (*seq)[j+n_err] = (*seq)[j]; } // copy for(j=i;jflow_order[flow_i]; } len += n_err; (*_n_err) += n_err; } flow_i = (flow_i + 1) % opt->flow_order_len; } } if(1 == strand) { for(i=0;i>1;i++) { c = (*seq)[i]; (*seq)[i] = (*seq)[len-i-1]; (*seq)[len-i-1] = c; } } return len; } void dwgsim_core(dwgsim_opt_t * opt) { seq_t seq; mutseq_t *mutseq[2]={NULL,NULL}; uint64_t tot_len, ii=0, ctr=0; int i, l, m, n_ref, contig_i; char name[1024], *qstr; int32_t name_len_max=0; int size[2], prev_skip=0, qstr_l=0; int num_n[2]; uint8_t *tmp_seq[2]={NULL,NULL}; uint8_t *tmp_seq_flow_mask[2]={NULL,NULL}; int32_t tmp_seq_mem[2]={0,0}; int64_t n_sim = 0; error_t *e[2]={NULL,NULL}; FILE *fp_muts_input = NULL; FILE *fp_regions_bed = NULL; muts_input_t *muts_input = NULL; regions_bed_txt *regions_bed = NULL; contigs_t *contigs = NULL; e[0] = &opt->e[0]; e[1] = &opt->e[1]; INIT_SEQ(seq); seq_set_block_size(0x1000000); l = opt->length[0] > opt->length[1]? opt->length[0] : opt->length[1]; qstr_l = l; qstr = (char*)calloc(qstr_l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); if(IONTORRENT == opt->data_type) { tmp_seq_flow_mask[0] = (uint8_t*)calloc(l+2, 1); tmp_seq_flow_mask[1] = (uint8_t*)calloc(l+2, 1); } tmp_seq_mem[0] = tmp_seq_mem[1] = l+2; size[0] = opt->length[0]; size[1] = opt->length[1]; if(0 <= opt->fn_muts_input_type) { contigs = contigs_init(); } if(NULL != opt->fn_regions_bed) { fp_regions_bed = xopen(opt->fn_regions_bed, "r"); if(NULL == contigs) contigs = contigs_init(); } tot_len = n_ref = 0; mut_print_header_pre(opt->fp_vcf); if(NULL != opt->fp_fai) { int dummy_int[3]; while(0 < fscanf(opt->fp_fai, "%s\t%d\t%d\t%d\t%d", name, &l, &dummy_int[0], &dummy_int[1], &dummy_int[2])) { fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l); tot_len += l; ++n_ref; mut_print_header_contig(opt->fp_vcf, name, l); if(NULL != contigs) { contigs_add(contigs, name, l); } } } else { while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) { fprintf(stderr, "[dwgsim_core] %s length: %d\n", name, l); tot_len += l; ++n_ref; mut_print_header_contig(opt->fp_vcf, name, l); if(NULL != contigs) { contigs_add(contigs, name, l); } } } fprintf(stderr, "[dwgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); rewind(opt->fp_fa); mut_print_header_post(opt->fp_vcf); if(0 <= opt->fn_muts_input_type) { fp_muts_input = xopen(opt->fn_muts_input, "r"); muts_input = muts_input_init(fp_muts_input, contigs, opt->fn_muts_input_type); // read in the mutation file } if(NULL != opt->fn_regions_bed) { regions_bed = regions_bed_init(fp_regions_bed, contigs); // recalculate the total length tot_len = 0; for(i=0;in;i++) { tot_len += regions_bed->end[i] - regions_bed->start[i] + 1; } } if(NULL != contigs) { contigs_destroy(contigs); contigs = NULL; } if(0 == opt->muts_only) { fprintf(stderr, "[dwgsim_core] Currently on: \n0"); } else { fprintf(stderr, "[dwgsim_core] Currently on:"); } contig_i = 0; while ((l = seq_read_fasta(opt->fp_fa, &seq, name, 0)) >= 0) { int64_t n_pairs = 0; n_ref--; if(1 == opt->muts_only) { fprintf(stderr, "\r[dwgsim_core] Currently on: %s", name); if(name_len_max < strlen(name)) { name_len_max = strlen(name); } else { for(i=0;imuts_only) { if(0 == n_ref && opt->C < 0) { n_pairs = opt->N - n_sim; } else { if(NULL != regions_bed) { // recalculate l m = 0; for(i=0;in;i++) { if(contig_i == regions_bed->contig[i]) { m += regions_bed->end[i] - regions_bed->start[i] + 1; } } if(0 == m) { fprintf(stderr, "[dwgsim_core] #0 skip sequence '%s' as it is not in the targeted region\n", name); contig_i++; continue; // skip this region } l = m; int num_n = 0; for(i=0;in;i++) { if(contig_i == regions_bed->contig[i]) { int m; for(m=regions_bed->start[i];m<=regions_bed->end[i];m++) { switch (seq.s[m-1]) { case 'a': case 'A': case 'c': case 'C': case 'g': case 'G': case 't': case 'T': break; default: num_n++; break; } } } } if(0.95 < num_n / (double)l) { // TODO: arbitrary cutoff fprintf(stderr, "[dwgsim_core] #1 skip sequence '%s' as %d out of %d bases are non-ACGT\n", name, num_n, l); contig_i++; continue; } } if(0 < opt->N) { // based on -N n_pairs = (uint64_t)((long double)l / tot_len * opt->N + 0.5); if(opt->N - n_sim < n_pairs) n_pairs = opt->N - n_sim; // make sure we don't simulate too many reads } else { // based on coverage, with added random reads n_pairs = (uint64_t)(l * opt->C / ((long double)(size[0] + size[1])) / (1.0 - opt->rand_read) + 0.5); } } // for paired end/mate pair, make sure we have enough bases in this // sequence if (0 < opt->length[1] && l < opt->dist + 3 * opt->std_dev) { if(0 == prev_skip) fprintf(stderr, "\n"); prev_skip = 1; fprintf(stderr, "[dwgsim_core] #2 skip sequence '%s' as it is shorter than %f!\n", name, opt->dist + 3 * opt->std_dev); contig_i++; continue; } else if (l < opt->length[0] || (0 < opt->length[1] && l < opt->length[1])) { if(0 == prev_skip) fprintf(stderr, "\n"); prev_skip = 1; fprintf(stderr, "[dwgsim_core] #3 skip sequence '%s' as it is shorter than %d!\n", name, (l < opt->length[0]) ? opt->length[0] : opt->length[1]); contig_i++; continue; } else if (n_pairs < 0) { // NB: this should not happen // not enough pairs continue; } prev_skip = 0; } // generate mutations and print them out mutseq[0] = mutseq_init(); mutseq[1] = mutseq_init(); mut_diref(opt, &seq, mutseq[0], mutseq[1], contig_i, muts_input); mut_print(name, &seq, mutseq[0], mutseq[1], opt->fp_mut, opt->fp_vcf); if(0 == opt->muts_only) { for (ii = 0; ii != n_pairs; ++ii, ++ctr) { // the core loop if(0 == (ctr % 10000)) { fprintf(stderr, "\r[dwgsim_core] %llu", (unsigned long long int)ctr); } double ran; int d, pos, s[2], strand[2]; int n_sub[2], n_indel[2], n_err[2], ext_coor[2]={0,0}, j, k; int n_sub_first[2], n_indel_first[2], n_err_first[2]; // need this for SOLID data int c1, c2, c; s[0] = size[0]; s[1] = size[1]; if(opt->rand_read < drand48()) { if(NULL == regions_bed) { do { // avoid boundary failure if(0 < s[1]) { // paired end/mate pair ran = ran_normal(); ran = ran * opt->std_dev + opt->dist; d = (int)(ran + 0.5); } else { d = 0; } pos = (int)((l - d + 1) * drand48()); } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1])))); } else { do { // avoid boundary failure if(0 < s[1]) { ran = ran_normal(); ran = ran * opt->std_dev + opt->dist; d = (int)(ran + 0.5); } else { d = 0; } pos = (int)((l - d + 1) * drand48()); // convert in the bed file for(i=0;in;i++) { // TODO: regions are in sorted order... so optimize if(contig_i == regions_bed->contig[i]) { j = regions_bed->end[i] - regions_bed->start[i] + 1; if(pos < j) { pos = regions_bed->start[i] + pos - 1; // zero-based break; } else { pos -= j; } } } } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l || (0 < s[1] && 0 == opt->is_inner && ((0 < s[0] && d <= s[1]) || (d <= s[0] && 0 < s[1]))) || 0 == regions_bed_query(regions_bed, contig_i, pos, pos + s[0] + s[1] + d - 1)); } // generate the read sequences mutseq_t *currseq = mutseq[drand48()mut_freq?0:1]; // haplotype from which the reads are generated n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; n_sub_first[0] = n_sub_first[1] = n_indel_first[0] = n_indel_first[1] = n_err_first[0] = n_err_first[1] = 0; num_n[0]=num_n[1]=0; // strand if(2 == opt->strandedness || (0 == opt->strandedness && ILLUMINA == opt->data_type)) { // opposite strand by default for Illumina strand[0] = 0; strand[1] = 1; } else if(1 == opt->strandedness || (0 == opt->strandedness && (SOLID == opt->data_type || IONTORRENT == opt->data_type))) { // same strands by default for SOLiD strand[0] = 0; strand[1] = 0; } else { // should not reach here assert(1 == 0); } if (drand48() < 0.5) { // which strand ? // Flip strands strand[0] = (1 + strand[0]) % 2; strand[1] = (1 + strand[1]) % 2; } // generate the reads in base space if(0 < s[1]) { // paired end or mate pair if(strand[0] == strand[1]) { // same strand if(0 == strand[0]) { // + strand /* * 5' E2 -----> .... E1 -----> 3' * 3' .... 5' */ if(0 == opt->is_inner) { __gen_read(0, pos + d - s[0], ++i); } else { __gen_read(0, pos + s[1] + d, ++i); } __gen_read(1, pos, ++i); } else { // - strand /* * 3' .... 5' * 5' <----- E1 .... <----- E2 3' */ __gen_read(0, pos + s[0], --i); if(0 == opt->is_inner) { __gen_read(1, pos + d, --i); } else { __gen_read(1, pos + s[0] + d + s[1], --i); } } } else { // opposite strand if(0 == strand[0]) { // + strand /* * 5' E1 -----> .... 3' * 3' .... <----- E2 5' */ __gen_read(0, pos, ++i); if(0 == opt->is_inner) { __gen_read(1, pos + d, --i); } else { __gen_read(1, pos + s[0] + d + s[1], --i); } } else { // - strand /* * 5' E2 -----> .... 3' * 3' .... <----- E1 5' */ if(0 == opt->is_inner) { __gen_read(0, pos + d, --i); } else { __gen_read(0, pos + s[1] + d + s[0], --i); } __gen_read(1, pos, i++); } } } else { // fragment if(0 == strand[0]) { __gen_read(0, pos, ++i); // + strand } else { __gen_read(0, pos + s[0] - 1, --i); // - strand } } // Count # of Ns for (j = 0; j < 2; ++j) { num_n[j]=0; if(0 < s[j]) { for (i = 0; i < s[j]; ++i) { if(tmp_seq[j][i] == 4) num_n[j]++; } } } if (ext_coor[0] < 0 || ext_coor[1] < 0 || opt->max_n < num_n[0] || opt->max_n < num_n[1]) { // fail to generate the read(s) --ii; --ctr; continue; } if(SOLID == opt->data_type) { // Convert to color sequence, use the first base as the adaptor for (j = 0; j < 2; ++j) { if(0 < s[j]) { c1 = 0; // adaptor for (i = 0; i < s[j]; ++i) { c2 = tmp_seq[j][i]; // current base c = __gf_add(c1, c2); tmp_seq[j][i] = c; c1 = c2; // save previous base } } } } // generate sequencing errors if(IONTORRENT == opt->data_type) { s[0] = generate_errors_flows(opt, &tmp_seq[0], &tmp_seq_flow_mask[0], &tmp_seq_mem[0], s[0], strand[0], e[0]->start, &n_err[0]); s[1] = generate_errors_flows(opt, &tmp_seq[1], &tmp_seq_flow_mask[1], &tmp_seq_mem[1], s[1], strand[1], e[1]->start, &n_err[1]); } else { // Illumina/SOLiD if(0 < s[0]) { if(0 == strand[0]) { __gen_errors_mismatches(tmp_seq, 0, 0, ++i, s[0]); } else { __gen_errors_mismatches(tmp_seq, 0, s[0]-1, --i, s[0]); } } if(0 < s[1]) { if(0 == strand[1]) { __gen_errors_mismatches(tmp_seq, 1, 0, ++i, s[1]); } else { __gen_errors_mismatches(tmp_seq, 1, s[1]-1, --i, s[1]); } } } // print for (j = 0; j < 2; ++j) { if(s[j] <= 0) { continue; } if(IONTORRENT == opt->data_type && qstr_l < s[j]) { qstr_l = s[j]; qstr = realloc(qstr, (1+qstr_l) * sizeof(char)); } if(NULL != opt->fixed_quality) { for (i = 0; i < s[j]; ++i) { qstr[i] = opt->fixed_quality[0]; } } else { for (i = 0; i < s[j]; ++i) { qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33; if(0 < opt->quality_std) { qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5); if(qstr[i] < 33) qstr[i] = 33; if(73 < qstr[i]) qstr[i] = 73; } } } qstr[i] = 0; // BWA FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2; if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) { fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0, n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1],n_indel[1], (long long)ii, j+1); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo); fprintf(fpo, "\n+\n%s\n", qstr); } else { // Note: BWA ignores the adapter and the first color, so this is a misrepresentation // in samtools. We must first skip the first color. Basically, a 50 color read is a // 49 color read for BWA. // // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2, // annotated as read "1". fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0, n_err[0] - n_err_first[0], n_sub[0] - n_sub_first[0], n_indel[0] - n_indel_first[0], n_err[1] - n_err_first[1], n_sub[1] - n_sub_first[1], n_indel[1] - n_indel_first[1], (long long)ii, 2 - j); //fputc('A', fpo); for (i = 1; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo); fprintf(fpo, "\n+\n"); for (i = 1; i < s[j]; ++i) fputc(qstr[i], fpo); fprintf(fpo, "\n"); } // BFAST output fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", name, ext_coor[0]+1, ext_coor[1]+1, strand[0], strand[1], 0, 0, n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], (long long)ii); if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) { for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast); fprintf(opt->fp_bfast, "\n+\n%s\n", qstr); } else { fputc('A', opt->fp_bfast); for (i = 0; i < s[j]; ++i) fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast); fprintf(opt->fp_bfast, "\n+\n"); for (i = 0; i < s[j]; ++i) fputc(qstr[i], opt->fp_bfast); fprintf(opt->fp_bfast, "\n"); } } n_sim++; } else { // random DNA read for(j=0;j<2;j++) { if(s[j] <= 0) { continue; } if(IONTORRENT == opt->data_type && qstr_l < s[j]) { qstr_l = s[j]; qstr = realloc(qstr, (1+qstr_l) * sizeof(char)); } // get random sequence for (i = 0; i < s[j]; ++i) { tmp_seq[j][i] = (int)(drand48() * 4.0) & 3; } if(NULL != opt->fixed_quality) { for (i = 0; i < s[j]; ++i) { qstr[i] = opt->fixed_quality[0]; } } else { for (i = 0; i < s[j]; ++i) { qstr[i] = (int)(-10.0 * log(e[j]->start + e[j]->by*i) / log(10.0) + 0.499) + 33; if(0 < opt->quality_std) { qstr[i] += (int)((ran_normal() * opt->quality_std) + 0.5); if(qstr[i] < 33) qstr[i] = 33; if(73 < qstr[i]) qstr[i] = 73; } } } qstr[i] = 0; if(SOLID == opt->data_type) { // convert to color space if(0 < s[j]) { c1 = 0; // adaptor for (i = 0; i < s[j]; ++i) { c2 = tmp_seq[j][i]; // current base c = __gf_add(c1, c2); tmp_seq[j][i] = c; c1 = c2; // save previous base } } } // BWA FILE *fpo = (0 == j) ? opt->fp_bwa1: opt->fp_bwa2; if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) { fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", "rand", 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, (long long)ii, j+1); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo); fprintf(fpo, "\n+\n%s\n", qstr); } else { // Note: BWA ignores the adapter and the first color, so this is a misrepresentation // in samtools. We must first skip the first color. Basically, a 50 color read is a // 49 color read for BWA. // // Note: BWA outputs F3 to read1, annotated as read "2", and outputs R3 to read2, // annotated as read "1". fprintf(fpo, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx/%d\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", "rand", 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, (long long)ii, 2 - j); //fputc('A', fpo); for (i = 1; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo); fprintf(fpo, "\n+\n"); for (i = 1; i < s[j]; ++i) fputc(qstr[i], fpo); fprintf(fpo, "\n"); } // BFAST output fprintf(opt->fp_bfast, "@%s%s%s_%u_%u_%1u_%1u_%1u_%1u_%d:%d:%d_%d:%d:%d_%llx\n", (NULL == opt->read_prefix) ? "" : opt->read_prefix, (NULL == opt->read_prefix) ? "" : "_", "rand", 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, (long long)ii); if(ILLUMINA == opt->data_type || IONTORRENT == opt->data_type) { for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], opt->fp_bfast); fprintf(opt->fp_bfast, "\n+\n%s\n", qstr); } else { fputc('A', opt->fp_bfast); for (i = 0; i < s[j]; ++i) fputc("01234"[(int)tmp_seq[j][i]], opt->fp_bfast); fprintf(opt->fp_bfast, "\n+\n"); for (i = 0; i < s[j]; ++i) fputc(qstr[i], opt->fp_bfast); fprintf(opt->fp_bfast, "\n"); } } n_sim++; } } fprintf(stderr, "\r[dwgsim_core] %llu", (unsigned long long int)ctr); } mutseq_destroy(mutseq[0]); mutseq_destroy(mutseq[1]); contig_i++; } fprintf(stderr, "\n[dwgsim_core] Complete!\n"); free(seq.s); free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); if(IONTORRENT == opt->data_type) { free(tmp_seq_flow_mask[0]); free(tmp_seq_flow_mask[1]); } if(0 <= opt->fn_muts_input_type) { muts_input_destroy(muts_input); } if(NULL != opt->fn_regions_bed) { regions_bed_destroy(regions_bed); fclose(fp_regions_bed); } } int main(int argc, char *argv[]) { dwgsim_opt_t *opt = NULL; // update the mutant sequence bounds mutseq_init_bounds(); opt = dwgsim_opt_init(); char fn_fai[1024]="\0"; char fn_tmp[1024]="\0"; if(0 == dwgsim_opt_parse(opt, argc, argv)) { return dwgsim_opt_usage(opt); } // Open files opt->fp_fa = xopen(argv[optind+0], "r"); strcpy(fn_fai, argv[optind+0]); strcat(fn_fai, ".fai"); opt->fp_fai = fopen(fn_fai, "r"); // NB: depends on returning NULL; strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.txt"); opt->fp_mut = xopen(fn_tmp, "w"); strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".mutations.vcf"); opt->fp_vcf = xopen(fn_tmp, "w"); if(0 == opt->muts_only) { strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bfast.fastq"); opt->fp_bfast = xopen(fn_tmp, "w"); strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read1.fastq"); opt->fp_bwa1 = xopen(fn_tmp, "w"); strcpy(fn_tmp, argv[optind+1]); strcat(fn_tmp, ".bwa.read2.fastq"); opt->fp_bwa2 = xopen(fn_tmp, "w"); } // Run simulation dwgsim_core(opt); // Close files if(0 == opt->muts_only) { fclose(opt->fp_fa); fclose(opt->fp_bfast); fclose(opt->fp_bwa1); fclose(opt->fp_bwa2); } if(NULL != opt->fp_fai) fclose(opt->fp_fai); fclose(opt->fp_mut); fclose(opt->fp_vcf); dwgsim_opt_destroy(opt); return 0; } DWGSIM-dwgsim.0.1.11/src/dwgsim.h000066400000000000000000000010731231766373200163220ustar00rootroot00000000000000#ifndef DWGSIM_H #define DWGSIM_H #include "dwgsim_opt.h" #define __gf_add(_x, _y) ((_x >= 4 || _y >= 4) ? 4 : (_x ^ _y)) #define __IS_TRUE(_val) ((_val == 1) ? "True" : "False") extern uint8_t nst_nt4_table[256]; enum data_type_t { ILLUMINA=0, SOLID=1, IONTORRENT=2 }; char iupac_and_base_to_mut(char iupac, char base); char bases_to_iupac(char b1, char b2); int32_t get_muttype(char *str); int32_t generate_errors_flows(dwgsim_opt_t *opt, uint8_t **seq, uint8_t **mask, int32_t *mem, int32_t len, uint8_t strand, double e, int32_t *_n_err); #endif DWGSIM-dwgsim.0.1.11/src/dwgsim_eval.c000077500000000000000000000612531231766373200173350ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "samtools/bam.h" #include "samtools/sam.h" #include "dwgsim_eval.h" #define __IS_TRUE(_val) ((_val == 1) ? "True" : "False") int print_usage(dwgsim_eval_args_t *args) { fprintf(stderr, "\n"); fprintf(stderr, "Program: dwgsim_eval (short read simulation evaluator)\n"); fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); fprintf(stderr, "Contact: Nils Homer \n\n"); fprintf(stderr, "Usage: dwgsim_eval [options] \n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, "\t-a\tINT\tsplit by [%d]:\n", args->a); fprintf(stderr, "\t\t\t\t\t0: by mapping quality\n"); fprintf(stderr, "\t\t\t\t\t1: by alignment score\n"); fprintf(stderr, "\t\t\t\t\t2: by suboptimal alignment score\n"); fprintf(stderr, "\t\t\t\t\t3: by alignment score - suboptimal alignment score\n"); fprintf(stderr, "\t-b\t\talignments are from BWA (SOLiD only) [%s]\n", __IS_TRUE(args->b)); fprintf(stderr, "\t-c\t\tcolor space alignments [%s]\n", __IS_TRUE(args->c)); fprintf(stderr, "\t-d\tINT\tdivide quality/alignment score by this factor [%d]\n", args->d); fprintf(stderr, "\t-g\t\tgap \"wiggle\" [%d]\n", args->g); fprintf(stderr, "\t-m\t\tconsecutive alignments with the same name (and end for multi-ends) should be treated as multi-mapped reads [%s]\n", __IS_TRUE(args->m)); fprintf(stderr, "\t-n\tINT\tnumber of raw input paired-end reads (otherwise, inferred from all SAM records present) [%d]\n", args->n); fprintf(stderr, "\t-q\tINT\tconsider only alignments with this mapping quality or greater [%d]\n", args->q); fprintf(stderr, "\t-z\t\tinput contains only single end reads [%s]\n", __IS_TRUE(args->z)); fprintf(stderr, "\t-S\t\tinput is SAM [%s]\n", __IS_TRUE(args->S)); fprintf(stderr, "\t-p\t\tprint incorrect alignments [%s]\n", __IS_TRUE(args->p)); fprintf(stderr, "\t-s\tINT\tconsider only alignments with the number of specified SNPs [%d]\n", args->s); fprintf(stderr, "\t-e\tINT\tconsider only alignments with the number of specified errors [%d]\n", args->e); fprintf(stderr, "\t-i\t\tconsider only alignments with indels [%s]\n", __IS_TRUE(args->i)); fprintf(stderr, "\t-P\tSTRING\ta read prefix that was prepended to each read name [%s]\n", (NULL == args->P) ? "not using" : args->P); fprintf(stderr, "\t-h\t\tprint this help message\n"); return 1; } /* Action */ enum {Exit, Warn, LastActionType}; /* Type */ enum { Dummy, OutOfRange, /* e.g. command line args */ InputArguments, IllegalFileName, IllegalPath, OpenFileError, EndOfFile, ReallocMemory, MallocMemory, ThreadError, ReadFileError, WriteFileError, DeleteFileError, LastErrorType, }; #define BREAK_LINE "************************************************************\n" void dwgsim_eval_print_error(char* FunctionName, char *VariableName, char* Message, int Action, int type) { static char ErrorString[][20]= { "\0", "OutOfRange", "InputArguments", "IllegalFileName", "IllegalPath", "OpenFileError", "EndOfFile", "ReallocMemory", "MallocMemory", "ThreadError", "ReadFileError", "WriteFileError", "DeleteFileError"}; static char ActionType[][20]={"Fatal Error", "Warning"}; fprintf(stderr, "%s\rIn function \"%s\": %s[%s]. ", BREAK_LINE, FunctionName, ActionType[Action], ErrorString[type]); /* Only print variable name if is available */ if(VariableName) { fprintf(stderr, "Variable/Value: %s.\n", VariableName); } /* Only print message name if is available */ if(Message) { fprintf(stderr, "Message: %s.\n", Message); } if(type == ReadFileError || type == OpenFileError || type == WriteFileError) { perror("The file stream error was:"); } switch(Action) { case Exit: fprintf(stderr, " ***** Exiting due to errors *****\n"); fprintf(stderr, "%s", BREAK_LINE); exit(EXIT_FAILURE); break; /* Not necessary actually! */ case Warn: fprintf(stderr, " ***** Warning *****\n"); fprintf(stderr, "%s", BREAK_LINE); break; default: fprintf(stderr, "Trouble!!!\n"); fprintf(stderr, "%s", BREAK_LINE); } } int main(int argc, char *argv[]) { char c; dwgsim_eval_args_t args; args.a = args.b = args.c = args.i = args.m = args.n = args.p = args.q = args.z = 0; args.d = 1; args.e = -1; args.g = 5; args.s = -1; args.S = 0; args.P = NULL; while(0 <= (c = getopt(argc, argv, "a:d:e:g:m:n:q:s:bchimpzSP:"))) { switch(c) { case 'a': args.a = atoi(optarg); break; case 'b': args.b = 1; break; case 'c': args.c = 1; break; case 'd': args.d = atoi(optarg); break; case 'g': args.g = atoi(optarg); break; case 'm': args.m = 1; break; case 'h': return print_usage(&args); break; case 'n': args.n = atoi(optarg); break; case 'q': args.q = atoi(optarg); break; case 'z': args.z = 1; break; case 'S': args.S = 1; break; case 'p': args.p = 1; break; case 's': args.s = atoi(optarg); break; case 'e': args.e = atoi(optarg); break; case 'i': args.i = 1; break; case 'P': free(args.P); args.P = strdup(optarg); break; default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 1; } } if(argc == optind) { return print_usage(&args); } run(&args, argc - optind, argv + optind); free(args.P); return 0; } void run(dwgsim_eval_args_t *args, int32_t num_files, char *files[]) { char *FnName="run"; int32_t i, n = 0; samfile_t *fp_in = NULL; samfile_t *fp_out = NULL; bam1_t *b=NULL; dwgsim_eval_counts_t *counts; char *prev_qname=NULL; int32_t prev_end=-1; // initialize counts counts = dwgsim_eval_counts_init(); fprintf(stderr, "Analyzing...\nCurrently on:\n0"); for(i=0;iS) ? "r" : "rb", 0); if(NULL == fp_in) { dwgsim_eval_print_error(FnName, files[i], "Could not open file for reading", Exit, OpenFileError); } if(0 == i && 1 == args->p) { fp_out = samopen("-", "wh", fp_in->header); if(NULL == fp_out) { dwgsim_eval_print_error(FnName, "stdout", "Could not open file stream for writing", Exit, OpenFileError); } } b = bam_init1(); while(0 < samread(fp_in, b)) { if(1 == args->m && NULL != prev_qname && prev_end == (BAM_FREAD1 & b->core.flag) && 0 == strcmp(prev_qname, bam1_qname(b))) { // do nothing } else { if(1 == args->m) { free(prev_qname); prev_qname = strdup(bam1_qname(b)); prev_end = (BAM_FREAD1 & b->core.flag); } process_bam(counts, args, fp_in->header, b, fp_out); if((BAM_FPAIRED & b->core.flag)) { // paired end if(1 == args->z) { // expect single end dwgsim_eval_print_error(FnName, NULL, "Found a read that was paired end", Exit, OutOfRange); } if((BAM_FREAD1 & b->core.flag)) { // count # of pairs n++; } } else { // single end if(0 == args->z) { // expect paired end dwgsim_eval_print_error(FnName, NULL, "Found a read that was not paired", Exit, OutOfRange); } n++; } if(0 == (n % 10000)) { fprintf(stderr, "\r%lld", (long long int)n); } } bam_destroy1(b); b = bam_init1(); } bam_destroy1(b); // Close the file samclose(fp_in); } free(prev_qname); if(1 == args->p) samclose(fp_out); fprintf(stderr, "\r%lld\n", (long long int)n); if(0 < args->n) { if(n != args->n) { fprintf(stderr, "(-n)=%d\tn=%d\n", args->n, n); dwgsim_eval_print_error(FnName, NULL, "Number of reads found differs from the number specified (-n)", Warn, OutOfRange); } } if(0 == args->z) { dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? 2*args->n : 2*n); } else { dwgsim_eval_counts_print(counts, args->a, args->d, (0 < args->n) ? args->n : n); } dwgsim_eval_counts_destroy(counts); fprintf(stderr, "Analysis complete.\n"); } uint32_t bam_calclip(const bam1_t *b) { const bam1_core_t *c = &b->core; const uint32_t *cigar = bam1_cigar(b); uint32_t k, end = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; if(op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) { end += cigar[k] >> BAM_CIGAR_SHIFT; } else { break; } } return end; } void process_bam(dwgsim_eval_counts_t *counts, dwgsim_eval_args_t *args, bam_header_t *header, bam1_t *b, samfile_t *fp_out) { char *FnName="process_bam"; int32_t left, metric=INT_MIN; char *chr=NULL; char *name=NULL, *ptr=NULL; char chr_name[1028]="\0"; char read_num[1028]="\0"; int32_t pos_1, pos_2, str_1, str_2, rand_1, rand2; int32_t n_err_1, n_sub_1, n_indel_1, n_err_2, n_sub_2, n_indel_2; int32_t pos, str, rand; int32_t i, j, tmp; int32_t predicted_value, actual_value; int32_t clip; // mapping quality threshold if(b->core.qual < args->q) return; // parse read name name = strdup(bam1_qname(b)); ptr = name; // save to be freed char *to_rm="_::_::_______"; // to remove for(i=b->core.l_qname-1,j=0;0<=i && j<13;i--) { // replace with spaces if(name[i] == to_rm[j]) { name[i] = ' '; j++; } } // check for the prefix if(NULL != args->P) { j = strlen(name); tmp = strlen(args->P); if(j < tmp || 0 != strncmp(args->P, name, tmp)) { dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] could not match read name with given read name prefix (-P)", Exit, OutOfRange); free(ptr); return; } name += tmp + 1; } if(14 != sscanf(name, "%s %d %d %1d %1d %1d %1d %d %d %d %d %d %d %s", chr_name, &pos_1, &pos_2, &str_1, &str_2, &rand_1, &rand2, &n_err_1, &n_sub_1, &n_indel_1, &n_err_2, &n_sub_2, &n_indel_2, read_num)) { dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] read was not generated by dwgsim?", Exit, OutOfRange); free(ptr); return; } // check for a prefix, and make sure it was removed correctly if(1 == args->z || (b->core.flag & BAM_FREAD1)) { rand = rand_1; } else { rand = rand2; } if(0 == rand) { for(j=0;jn_targets;j++) { i = strlen(name); tmp = strlen(header->target_name[j]); i = (i < tmp) ? i : tmp; if(0 == strncmp(name, header->target_name[j], i)) { break; } } if(j == header->n_targets) { dwgsim_eval_print_error(FnName, name, "[dwgsim_eval] the mapped contig does not exist in the SAM header; perhaps you have a read name prefix?", Exit, OutOfRange); } } free(ptr); ptr = name = NULL; // get metric value if(0 == args->a) { metric = (b->core.qual / args->d); if(DWGSIM_EVAL_MAXQ < metric) metric = DWGSIM_EVAL_MAXQ; } else if((BAM_FUNMAP & b->core.flag) || 0 == b->core.qual) { // unmapped or zero quality metric = DWGSIM_EVAL_MINAS; } else { uint8_t *aux_AS=NULL, *aux_XS=NULL; metric = DWGSIM_EVAL_MINAS+1; if(1 == args->a || 3 == args->a) { aux_AS = bam_aux_get(b, "AS"); if(NULL == aux_AS) { metric = DWGSIM_EVAL_MINAS; } } if(2 == args->a || 3 == args->a) { aux_XS = bam_aux_get(b, "XS"); if(NULL == aux_XS) { metric = DWGSIM_EVAL_MINAS; } } if(metric != DWGSIM_EVAL_MINAS) { switch(args->a) { case 1: metric = bam_aux2i(aux_AS); break; case 2: metric = bam_aux2i(aux_XS); break; case 3: metric = bam_aux2i(aux_AS) - bam_aux2i(aux_XS); break; default: metric = DWGSIM_EVAL_MINAS; break; } } } metric /= args->d; if(metric < DWGSIM_EVAL_MINAS) metric = DWGSIM_EVAL_MINAS; if(1 == args->i) { // indels only if(1 == args->z || (b->core.flag & BAM_FREAD1)) { if(0 == n_indel_1) return; } else { if(0 == n_indel_2) return; } } else if(0 <= args->e && n_err_1 != args->e) { // # of errors return; } else if(0 <= args->s && n_sub_1 != args->s) { // # of snps return; } if(1 == args->c && 1 == args->b) { // SOLiD and BWA // Swap 1 and 2 tmp=n_err_1; n_err_1=n_err_2; n_err_2=tmp; tmp=n_sub_1; n_sub_1=n_sub_2; n_sub_2=tmp; tmp=n_indel_1; n_indel_1=n_indel_2; n_indel_2=tmp; } // copy data if(1 == args->z || (b->core.flag & BAM_FREAD1)) { pos = pos_1; str = str_1; rand = rand_1; } else { pos = pos_2; str = str_2; rand = rand2; } // get the actual value if(1 == rand) { actual_value = DWGSIM_EVAL_UNMAPPABLE; } else { actual_value = DWGSIM_EVAL_MAPPABLE; } // get the predicted value if((BAM_FUNMAP & b->core.flag)) { // unmapped predicted_value = DWGSIM_EVAL_UNMAPPED; } else { // mapped (correctly?) clip = bam_calclip(b); chr = header->target_name[b->core.tid]; left = b->core.pos - clip; if(1 == rand // should not map || str != bam1_strand(b) // different strand || 0 != strcmp(chr, chr_name) // different chromosome || args->g < fabs(pos - left)) { // out of bounds (positionally) predicted_value = DWGSIM_EVAL_MAPPED_INCORRECTLY; } else { predicted_value = DWGSIM_EVAL_MAPPED_CORRECTLY; } } dwgsim_eval_counts_add(counts, metric, actual_value, predicted_value); // print incorrect alignments if(1 == args->p && DWGSIM_EVAL_MAPPED_INCORRECTLY == predicted_value) { if(samwrite(fp_out, b) <= 0) { dwgsim_eval_print_error(FnName, "stdout", "Could not write to stream", Exit, WriteFileError); } } } dwgsim_eval_counts_t * dwgsim_eval_counts_init() { dwgsim_eval_counts_t *counts; counts = malloc(sizeof(dwgsim_eval_counts_t)); counts->min_score = counts->max_score = 0; counts->mc = malloc(sizeof(int32_t)); assert(NULL != counts->mc); counts->mi = malloc(sizeof(int32_t)); assert(NULL != counts->mi); counts->mu = malloc(sizeof(int32_t)); assert(NULL != counts->mu); counts->um = malloc(sizeof(int32_t)); assert(NULL != counts->um); counts->uu = malloc(sizeof(int32_t)); assert(NULL != counts->uu); counts->mc[0] = counts->mi[0] = counts->mu[0] = 0; counts->um[0] = counts->uu[0] = 0; return counts; } void dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts) { free(counts->mc); free(counts->mi); free(counts->mu); free(counts->um); free(counts->uu); free(counts); } void dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value) { char *FnName="dwgsim_eval_counts_add"; int32_t i, m, n; if(counts->max_score < score) { m = score - counts->min_score + 1; n = counts->max_score - counts->min_score + 1; counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc); counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi); counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu); counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um); counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu); // initialize to zero for(i=n;imc[i] = counts->mi[i] = counts->mu[i] = 0; counts->um[i] = counts->uu[i] = 0; } counts->max_score = score; } else if(score < counts->min_score) { m = counts->max_score - score + 1; n = counts->max_score - counts->min_score + 1; counts->mc = realloc(counts->mc, sizeof(int32_t)*m); assert(NULL != counts->mc); counts->mi = realloc(counts->mi, sizeof(int32_t)*m); assert(NULL != counts->mi); counts->mu = realloc(counts->mu, sizeof(int32_t)*m); assert(NULL != counts->mu); counts->um = realloc(counts->um, sizeof(int32_t)*m); assert(NULL != counts->um); counts->uu = realloc(counts->uu, sizeof(int32_t)*m); assert(NULL != counts->uu); // shift up for(i=m-1;m-n<=i;i--) { counts->mc[i] = counts->mc[i-(m-n)]; counts->mi[i] = counts->mi[i-(m-n)]; counts->mu[i] = counts->mu[i-(m-n)]; counts->um[i] = counts->um[i-(m-n)]; counts->uu[i] = counts->uu[i-(m-n)]; } // initialize to zero for(i=0;imc[i] = counts->mi[i] = counts->mu[i] = 0; counts->um[i] = counts->uu[i] = 0; } counts->min_score = score; } // check actual value switch(actual_value) { case DWGSIM_EVAL_MAPPABLE: case DWGSIM_EVAL_UNMAPPABLE: break; default: dwgsim_eval_print_error(FnName, "actual_value", "Could not understand actual value", Exit, OutOfRange); } // check predicted value switch(predicted_value) { case DWGSIM_EVAL_MAPPED_CORRECTLY: case DWGSIM_EVAL_MAPPED_INCORRECTLY: case DWGSIM_EVAL_UNMAPPED: break; default: dwgsim_eval_print_error(FnName, "predicted_value", "Could not understand predicted value", Exit, OutOfRange); } switch(actual_value) { case DWGSIM_EVAL_MAPPABLE: switch(predicted_value) { case DWGSIM_EVAL_MAPPED_CORRECTLY: counts->mc[score-counts->min_score]++; break; case DWGSIM_EVAL_MAPPED_INCORRECTLY: counts->mi[score-counts->min_score]++; break; case DWGSIM_EVAL_UNMAPPED: counts->mu[score-counts->min_score]++; break; default: break; // should not reach here } break; case DWGSIM_EVAL_UNMAPPABLE: switch(predicted_value) { case DWGSIM_EVAL_MAPPED_CORRECTLY: dwgsim_eval_print_error(FnName, "predicted_value", "predicted value cannot be mapped correctly when the read is unmappable", Exit, OutOfRange); break; case DWGSIM_EVAL_MAPPED_INCORRECTLY: counts->um[score-counts->min_score]++; break; case DWGSIM_EVAL_UNMAPPED: counts->uu[score-counts->min_score]++; break; default: break; // should not reach here } break; default: break; // should not reach here } } void dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n) { int32_t i; int32_t max = 0; int32_t mc_sum, mi_sum, mu_sum, um_sum, uu_sum; int32_t m_total, mm_total, u_total; char format[1024]="\0"; mc_sum = mi_sum = mu_sum = um_sum = uu_sum = 0; m_total = mm_total = u_total = 0; // create the format string for(i=counts->max_score - counts->min_score;0<=i;i--) { m_total += counts->mc[i] + counts->mi[i] + counts->mu[i]; u_total += counts->um[i] + counts->uu[i]; max += counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i]; } max = 1 + log10(max); strcat(format, "%.2d "); for(i=0;i<12;i++) { sprintf(format + (int)strlen(format), "%%%dd ", max); } strcat(format + (int)strlen(format), "%.3e %.3e %.3e %.3e %.3e %.3e\n"); // header fprintf(stdout, "# thr | the minimum %s threshold\n", (0 == a) ? "mapping quality" : "alignment score"); fprintf(stdout, "# mc | the number of reads mapped correctly that should be mapped at the threshold\n"); fprintf(stdout, "# mi | the number of reads mapped incorrectly that should be mapped be mapped at the threshold\n"); fprintf(stdout, "# mu | the number of reads unmapped that should be mapped be mapped at the threshold\n"); fprintf(stdout, "# um | the number of reads mapped that should be unmapped be mapped at the threshold\n"); fprintf(stdout, "# uu | the number of reads unmapped that should be unmapped be mapped at the threshold\n"); fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at the threshold\n"); fprintf(stdout, "# mc' | the number of reads mapped correctly that should be mapped at or greater than that threshold\n"); fprintf(stdout, "# mi' | the number of reads mapped incorrectly that should be mapped be mapped at or greater than that threshold\n"); fprintf(stdout, "# mu' | the number of reads unmapped that should be mapped be mapped at or greater than that threshold\n"); fprintf(stdout, "# um' | the number of reads mapped that should be unmapped be mapped at or greater than that threshold\n"); fprintf(stdout, "# uu' | the number of reads unmapped that should be unmapped be mapped at or greater than that threshold\n"); fprintf(stdout, "# mc' + mi' + mu' + um' + uu' | the total number of reads mapped at or greater than the threshold\n"); fprintf(stdout, "# (mc / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at the threshold\n"); fprintf(stdout, "# (mc / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at the threshold\n"); fprintf(stdout, "# (um / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at the threshold\n"); fprintf(stdout, "# (mc' / (mc' + mi' + mu')) | sensitivity: the fraction of reads that should be mapped that are mapped correctly at or greater than the threshold\n"); fprintf(stdout, "# (mc' / mc' + mi') | positive predictive value: the fraction of mapped reads that are mapped correctly at or greater than the threshold\n"); fprintf(stdout, "# (um' / (um' + uu')) | false discovery rate: the fraction of random reads that are mapped at or greater than the threshold\n"); // print for(i=counts->max_score - counts->min_score;0<=i;i--) { double num, den; mc_sum += counts->mc[i]; mi_sum += counts->mi[i]; mu_sum += counts->mu[i]; um_sum += counts->um[i]; uu_sum += counts->uu[i]; mm_total += counts->mc[i] + counts->mi[i]; /* Notes: * notice that the denominator for sensitivity (and fdr) for the "ge" * (greater than or equal) threshold is the "total", while the denominator * for ppv is "@ >= Q". The reasoning behind this is ppv is a measure of * the quality of mappings that will be returned when using a Q threshold * while the sensitivity want to measure the fraction of mappings that * will be returned compared to the maximum. Basically, if we accept * only mappings at a given threshold, and call the rest unmapped, what * happens? * - sensitivity tells us the # of correct mappings out of the total possible * mappings. * - ppv tells us the # of correct mappings out of the total mappings. * - fdr tells us the # of random mappings out of the total unmappable. */ // "at" sensitivity: mapped correctly @ Q / mappable @ Q num = counts->mc[i]; den = counts->mc[i] + counts->mi[i] + counts->mu[i]; double sens_at_thr = (0 == den) ? 0. : (num / (double)den); // "ge" sensitivity: mapped correctly @ >= Q / total mappable double sens_ge_thr = (0 == m_total) ? 0. : (mc_sum / (double)m_total); // "at" positive predictive value: mapped correctly @ Q / mappable and mapped @ Q num = counts->mc[i]; den = counts->mc[i] + counts->mi[i]; double ppv_at_thr = (0 == den) ? 0. : (num / (double)den); // "ge" positive predictive value: mapped correctly @ >= Q / mappable and mapped @ >= Q double ppv_ge_thr = (0 == mm_total) ? 0. : (mc_sum / (double)mm_total); // "at" false discovery rate: unmappable and mapped @ Q / unmappable @ Q num = counts->um[i]; den = counts->um[i] + counts->uu[i]; double fdr_at_thr = (0 == den) ? 0. : (num / (double)den); // "ge" false discovery rate: unmappable and mapped @ >= Q / unmappable @ >= Q double fdr_ge_thr = (0 == u_total) ? 0. : (um_sum / (double)u_total); fprintf(stdout, format, (i + counts->min_score)*d, counts->mc[i], counts->mi[i], counts->mu[i], counts->um[i], counts->uu[i], counts->mc[i] + counts->mi[i] + counts->mu[i] + counts->um[i] + counts->uu[i], mc_sum, mi_sum, mu_sum, um_sum, uu_sum, mc_sum + mi_sum + mu_sum + um_sum + uu_sum, sens_at_thr, ppv_at_thr, fdr_at_thr, sens_ge_thr, ppv_ge_thr, fdr_ge_thr); } } DWGSIM-dwgsim.0.1.11/src/dwgsim_eval.h000066400000000000000000000036461231766373200173410ustar00rootroot00000000000000#ifndef DWGSIM_EVAL_H_ #define DWGSIM_EVAL_H_ #define DWGSIM_EVAL_MAXQ 255 #define DWGSIM_EVAL_MINAS -5000 typedef struct { int32_t a; // alignment score or not int32_t b; // bwa or not int32_t c; // color space or not int32_t d; // divide by factor int32_t e; // print only alignments with # of errors int32_t g; // gap "wiggle" int32_t i; // indel only int32_t m; // multi-mapped int32_t n; // # of pe alignments int32_t p; // print incorrect alignments or not int32_t q; // consider only alignments with this mapping quality or greater int32_t s; // print only alignments with # of SNPs int32_t z; // input reads are single end int32_t S; // input reads are in text SAM format char *P; // read name prefix } dwgsim_eval_args_t; // Actual value enum { DWGSIM_EVAL_MAPPABLE =0, DWGSIM_EVAL_UNMAPPABLE =1 }; // Prediction enum { DWGSIM_EVAL_MAPPED_CORRECTLY =0, DWGSIM_EVAL_MAPPED_INCORRECTLY =1, DWGSIM_EVAL_UNMAPPED =2 }; typedef struct { int32_t *mc; // mappable && mapped correctly int32_t *mi; // mappable && mapped incorrectly int32_t *mu; // mappable && unmapped int32_t *um; // unmappable && mapped int32_t *uu; // unmappable && unmapped int32_t min_score, max_score; } dwgsim_eval_counts_t; void run(dwgsim_eval_args_t *args, int32_t num_files, char *files[]); void process_bam(dwgsim_eval_counts_t *counts, dwgsim_eval_args_t *args, bam_header_t *header, bam1_t *b, samfile_t *fp_out); dwgsim_eval_counts_t * dwgsim_eval_counts_init(); void dwgsim_eval_counts_destroy(dwgsim_eval_counts_t *counts); void dwgsim_eval_counts_add(dwgsim_eval_counts_t *counts, int32_t score, int32_t actual_value, int32_t predicted_value); void dwgsim_eval_counts_print(dwgsim_eval_counts_t *counts, int32_t a, int32_t d, int32_t n); #endif DWGSIM-dwgsim.0.1.11/src/dwgsim_opt.c000066400000000000000000000412741231766373200172060ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "mut.h" #include "dwgsim.h" #include "dwgsim_opt.h" dwgsim_opt_t* dwgsim_opt_init() { dwgsim_opt_t *opt; opt = calloc(1, sizeof(dwgsim_opt_t)); opt->e[0].start = opt->e[0].end = opt->e[1].start = opt->e[1].end = 0.02; opt->e[0].by = opt->e[1].by = 0; opt->is_inner = 0; opt->dist = 500; opt->std_dev = 50; opt->N = -1; opt->C = 100; opt->length[0] = opt->length[1] = 70; opt->mut_rate = 0.001; opt->mut_freq = 0.5; opt->indel_frac = 0.1; opt->indel_extend = 0.3; opt->indel_min = 1; opt->rand_read = 0.05; opt->data_type = ILLUMINA; opt->strandedness = 0; opt->max_n = 0; opt->flow_order = NULL; opt->flow_order_len = 0; opt->use_base_error = 0; opt->seed = -1; opt->muts_only = 0; opt->fixed_quality = NULL; opt->quality_std = 2.0; opt->fn_muts_input = NULL; opt->fn_muts_input_type = -1; opt->fn_regions_bed = NULL; opt->fp_mut = opt->fp_bfast = opt->fp_bwa1 = opt->fp_bwa2 = NULL; opt->fp_fa = opt->fp_fai = NULL; opt->read_prefix = NULL; return opt; } void dwgsim_opt_destroy(dwgsim_opt_t *opt) { free(opt->fixed_quality); free(opt->fn_muts_input); free(opt->fn_regions_bed); free(opt->flow_order); free(opt->read_prefix); free(opt); } int dwgsim_opt_usage(dwgsim_opt_t *opt) { mutseq_init_bounds(); fprintf(stderr, "\n"); fprintf(stderr, "Program: dwgsim (short read simulator)\n"); fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); fprintf(stderr, "Contact: Nils Homer \n\n"); fprintf(stderr, "Usage: dwgsim [options] \n\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -e FLOAT per base/color/flow error rate of the first read [from %.3f to %.3f by %.3f]\n", opt->e[0].start, opt->e[0].end, opt->e[0].by); fprintf(stderr, " -E FLOAT per base/color/flow error rate of the second read [from %.3f to %.3f by %.3f]\n", opt->e[1].start, opt->e[1].end, opt->e[1].by); fprintf(stderr, " -i use the inner distance instead of the outer distance for pairs [%s]\n", __IS_TRUE(opt->is_inner)); fprintf(stderr, " -d INT %s distance between the two ends for pairs [%d]\n", (0 == opt->is_inner) ? "outer" : "inner", opt->dist); fprintf(stderr, " -s INT standard deviation of the distance for pairs [%.3f]\n", opt->std_dev); fprintf(stderr, " -N INT number of read pairs (-1 to disable) [%lld]\n", (signed long long int)opt->N); fprintf(stderr, " -C FLOAT mean coverage across available positions (-1 to disable) [%.2lf]\n", opt->C); fprintf(stderr, " -1 INT length of the first read [%d]\n", opt->length[0]); fprintf(stderr, " -2 INT length of the second read [%d]\n", opt->length[1]); fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", opt->mut_rate); fprintf(stderr, " -F FLOAT frequency of given mutation to simulate low fequency somatic mutations [%.4f]\n", opt->mut_freq); fprintf(stderr, " NB: freqeuncy F refers to the first strand of mutation, therefore mutations \n"); fprintf(stderr, " on the second strand occour with a frequency of 1-F \n"); fprintf(stderr, " -R FLOAT fraction of mutations that are indels [%.2f]\n", opt->indel_frac); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", opt->indel_extend); fprintf(stderr, " -I INT the minimum length indel [%d]\n", opt->indel_min); fprintf(stderr, " -y FLOAT probability of a random DNA read [%.2f]\n", opt->rand_read); fprintf(stderr, " -n INT maximum number of Ns allowed in a given read [%d]\n", opt->max_n); fprintf(stderr, " -c INT generate reads for [%d]:\n", opt->data_type); fprintf(stderr, " 0: Illumina\n"); fprintf(stderr, " 1: SOLiD\n"); fprintf(stderr, " 2: Ion Torrent\n"); fprintf(stderr, " -S INT generate reads [%d]:\n", opt->strandedness); fprintf(stderr, " 0: default (opposite strand for Illumina, same strand for SOLiD/Ion Torrent)\n"); fprintf(stderr, " 1: same strand (mate pair)\n"); fprintf(stderr, " 2: opposite strand (paired end)\n"); fprintf(stderr, " -f STRING the flow order for Ion Torrent data [%s]\n", (char*)opt->flow_order); fprintf(stderr, " -B use a per-base error rate for Ion Torrent data [%s]\n", __IS_TRUE(opt->use_base_error)); fprintf(stderr, " -H haploid mode [%s]\n", __IS_TRUE(opt->is_hap)); fprintf(stderr, " -z INT random seed (-1 uses the current time) [%d]\n", opt->seed); fprintf(stderr, " -M generate a mutations file only [%s]\n", __IS_TRUE(opt->muts_only)); fprintf(stderr, " -m FILE the mutations txt file to re-create [%s]\n", (MUT_INPUT_TXT != opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input); fprintf(stderr, " -b FILE the bed-like file set of candidate mutations [%s]\n", (MUT_INPUT_BED == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input); fprintf(stderr, " -v FILE the vcf file set of candidate mutations (use pl tag for strand) [%s]\n", (MUT_INPUT_VCF == opt->fn_muts_input_type) ? "not using" : opt->fn_muts_input); fprintf(stderr, " -x FILE the bed of regions to cover [%s]\n", (NULL == opt->fn_regions_bed) ? "not using" : opt->fn_regions_bed); fprintf(stderr, " -P STRING a read prefix to prepend to each read name [%s]\n", (NULL == opt->read_prefix) ? "not using" : opt->read_prefix); fprintf(stderr, " -q STRING a fixed base quality to apply (single character) [%s]\n", (NULL == opt->fixed_quality) ? "not using" : opt->fixed_quality); fprintf(stderr, " -Q FLOAT standard deviation of the base quality scores [%.2lf]\n", (NULL == opt->fixed_quality) ? opt->quality_std : 0.0); fprintf(stderr, " -s INT standard deviation of the distance for pairs [%.3f]\n", opt->std_dev); fprintf(stderr, " -h print this message\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: For SOLiD mate pair reads and BFAST, the first read is F3 and the second is R3. For SOLiD mate pair reads\n"); fprintf(stderr, "and BWA, the reads in the first file are R3 the reads annotated as the first read etc.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: The longest supported insertion is %u.\n", UINT32_MAX); fprintf(stderr, "\n"); return 1; } static void get_error_rate(const char *str, error_t *e) { int32_t i; e->start = atof(str); for(i=0;iend = atof(str+i); } else { e->end = e->start; } } int32_t dwgsim_opt_is_int(char *optarg) { int32_t i; for(i=0;i= 0) { switch (c) { case 'i': opt->is_inner = 1; break; case 'd': opt->dist = dwgsim_atoi(optarg, 'd'); break; case 's': opt->std_dev = atof(optarg); break; case 'N': opt->N = dwgsim_atoi(optarg, 'N'); opt->C = -1; break; case 'C': opt->C = atof(optarg); opt->N = -1; break; case '1': opt->length[0] = dwgsim_atoi(optarg, '1'); break; case '2': opt->length[1] = dwgsim_atoi(optarg, '2'); break; case 'e': get_error_rate(optarg, &opt->e[0]); break; case 'E': get_error_rate(optarg, &opt->e[1]); break; case 'r': opt->mut_rate = atof(optarg); break; case 'F': opt->mut_freq = atof(optarg); break; case 'R': opt->indel_frac = atof(optarg); break; case 'X': opt->indel_extend = atof(optarg); break; case 'I': opt->indel_min = dwgsim_atoi(optarg, 'I'); break; case 'c': opt->data_type = dwgsim_atoi(optarg, 'c'); break; case 'S': opt->strandedness = dwgsim_atoi(optarg, 'S'); break; case 'n': opt->max_n = dwgsim_atoi(optarg, 'n'); break; case 'y': opt->rand_read = atof(optarg); break; case 'f': if(NULL != opt->flow_order) free(opt->flow_order); opt->flow_order = (int8_t*)strdup(optarg); break; case 'B': opt->use_base_error = 1; break; case 'H': opt->is_hap = 1; break; case 'h': return 0; case 'z': opt->seed = dwgsim_atoi(optarg, 'n'); break; case 'M': opt->muts_only = 1; break; case 'm': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_TXT; muts_input_type |= 0x1; break; case 'b': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_BED; muts_input_type |= 0x2; break; case 'v': free(opt->fn_muts_input); opt->fn_muts_input = strdup(optarg); opt->fn_muts_input_type = MUT_INPUT_VCF; muts_input_type |= 0x4; break; case 'x': free(opt->fn_regions_bed); opt->fn_regions_bed = strdup(optarg); break; case 'P': free(opt->read_prefix); opt->read_prefix = strdup(optarg); break; case 'q': opt->fixed_quality = strdup(optarg); break; case 'Q': opt->quality_std = atof(optarg); break; default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 0; } } if (argc - optind < 2) return 0; __check_option(opt->is_inner, 0, 1, "-i"); __check_option(opt->dist, 0, INT32_MAX, "-d"); __check_option(opt->std_dev, 0, INT32_MAX, "-s"); if(opt->N < 0 && opt->C < 0) { fprintf(stderr, "Must use one of -N or -C"); return 0; } else if(0 < opt->N && 0 < opt->C) { fprintf(stderr, "Cannot use both -N or -C"); return 0; } else if(0 < opt->N) { __check_option(opt->N, 1, INT32_MAX, "-N"); __check_option(opt->C, INT32_MIN, -1, "-C"); } else { __check_option(opt->N, INT32_MIN, -1, "-N"); __check_option(opt->C, 0, INT32_MAX, "-C"); } __check_option(opt->length[0], 1, INT32_MAX, "-1"); __check_option(opt->length[1], 0, INT32_MAX, "-2"); // error rate for(i=0;i<2;i++) { if(opt->e[i].start < 0.0 || 1.0 < opt->e[i].start) { fprintf(stderr, "End %s: the start error is out of range (-e)\n", (0 == i) ? "one" : "two"); return 0; } if(opt->e[i].end < 0.0 || 1.0 < opt->e[i].end) { fprintf(stderr, "End %s: the end error is out of range (-e)\n", (0 == i) ? "one" : "two"); return 0; } if(IONTORRENT == opt->data_type) { if(opt->e[i].end != opt->e[i].start) { fprintf(stderr, "End %s: a uniform error rate must be given for Ion Torrent data\n", (0 == i) ? "one" : "two"); return 0; } } } __check_option(opt->mut_rate, 0, 1.0, "-r"); __check_option(opt->indel_frac, 0, 1.0, "-R"); __check_option(opt->indel_extend, 0, 1.0, "-X"); __check_option(opt->indel_min, 1, INT32_MAX, "-I"); __check_option(opt->data_type, 0, 2, "-c"); __check_option(opt->strandedness, 0, 2, "-S"); __check_option(opt->max_n, 0, INT32_MAX, "-n"); __check_option(opt->rand_read, 0, 1.0, "-y"); if(IONTORRENT == opt->data_type && NULL == opt->flow_order) { fprintf(stderr, "Error: command line option -f is required\n"); return 0; } __check_option(opt->use_base_error, 0, 1, "-B"); __check_option(opt->is_hap, 0, 1, "-H"); if(NULL != opt->fixed_quality && 1 != strlen(opt->fixed_quality)) { fprintf(stderr, "Error: command line option -q requires one character\n"); return 0; } __check_option(opt->quality_std, 0, INT32_MAX, "-Q"); if(NULL != opt->read_prefix) { fprintf(stderr, "Warning: remember to use the -P option with dwgsim_eval\n"); } switch(muts_input_type) { case 0x0: case 0x1: case 0x2: case 0x4: break; default: fprintf(stderr, "Error: -m/-b/-v cannot be used together\n"); return 0; break; } // random seed srand48((-1 == opt->seed) ? time(0) : opt->seed); if(IONTORRENT == opt->data_type) { if(NULL != opt->flow_order) { // uniform error rates only (so far) if(opt->e[0].start != opt->e[0].end || opt->e[1].start != opt->e[1].end) { fprintf(stderr, "Error: non-uniform error rate not support for Ion Torrent data\n"); return 0; } // update flow order opt->flow_order_len = strlen((char*)opt->flow_order); for(i=0;iflow_order_len;i++) { opt->flow_order[i] = nst_nt4_table[opt->flow_order[i]]; } } else { fprintf(stderr, "Error: -f must be given for Ion Torrent data\n"); return 0; } } // use base error rate if(IONTORRENT == opt->data_type && NULL != opt->flow_order && 1 == opt->use_base_error) { uint8_t *tmp_seq=NULL; uint8_t *tmp_seq_flow_mask=NULL; int32_t tmp_seq_mem, s, cur_n_err, n_err, counts; int32_t j, k; double sf = 0.0; for(i=0;i<2;i++) { if(opt->length[i] <= 0) continue; fprintf(stderr, "[dwgsim_core] Updating error rate for end %d\n", i+1); if(0 < i && opt->length[i] == opt->length[1-i]) { opt->e[i].start = opt->e[1-i].start; opt->e[i].end = opt->e[1-i].end; opt->e[i].by = opt->e[1-i].by; fprintf(stderr, "[dwgsim_core] Using scaling factor from previous end\n[dwgsim_core] Updated with scaling factor %.5lf\n", sf); continue; } tmp_seq_mem = opt->length[i]+2; tmp_seq = (uint8_t*)calloc(tmp_seq_mem, 1); tmp_seq_flow_mask = (uint8_t*)calloc(tmp_seq_mem, 1); n_err = counts = 0; for(j=0;jlength[i];k++) { tmp_seq[k] = (int)(drand48() * 4.0) & 3; } cur_n_err = 0; s = opt->length[i]; s = generate_errors_flows(opt, &tmp_seq, &tmp_seq_flow_mask, &tmp_seq_mem, s, 0, opt->e[i].start, &cur_n_err); n_err += cur_n_err; counts += s; } //fprintf(stderr, "before %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end); sf = opt->e[i].start / (n_err / (1.0 * counts)); opt->e[i].start = opt->e[i].end *= sf; opt->e[i].by = (opt->e[i].end - opt->e[i].start) / opt->length[i]; //fprintf(stderr, "after %lf,%lf,%lf\n", opt->e[i].start, opt->e[i].by, opt->e[i].end); free(tmp_seq); free(tmp_seq_flow_mask); fprintf(stderr, "\r[dwgsim_core] %d\n[dwgsim_core] Updated with scaling factor %.5lf!\n", j, sf); } } else { opt->e[0].by = (opt->e[0].end - opt->e[0].start) / opt->length[0]; opt->e[1].by = (opt->e[1].end - opt->e[1].start) / opt->length[1]; } __check_option(opt->muts_only, 0, 1, "-M"); return 1; } DWGSIM-dwgsim.0.1.11/src/dwgsim_opt.h000066400000000000000000000024631231766373200172100ustar00rootroot00000000000000#ifndef DWGSIM_OPT_H #define DWGSIM_OPT_H #define ERROR_RATE_NUM_RANDOM_READS 1000000 typedef struct { double start, by, end; } error_t; typedef struct { error_t e[2]; int32_t is_inner; int32_t dist; double std_dev; int64_t N; double C; int32_t length[2]; double mut_rate; double mut_freq; double indel_frac; double indel_extend; int32_t indel_min; double rand_read; int32_t max_n; int32_t data_type; int32_t strandedness; int8_t *flow_order; int32_t flow_order_len; int32_t use_base_error; int32_t is_hap; int32_t seed; int32_t muts_only; char *fixed_quality; double quality_std; char *fn_muts_input; int32_t fn_muts_input_type; char *fn_regions_bed; FILE *fp_mut; FILE *fp_vcf; FILE *fp_bfast; FILE *fp_bwa1; FILE *fp_bwa2; FILE *fp_fa; FILE *fp_fai; char *read_prefix; } dwgsim_opt_t; dwgsim_opt_t* dwgsim_opt_init(); void dwgsim_opt_destroy(dwgsim_opt_t *opt); #define __check_option(_val, _min, _max, _opt) \ if(_val < _min || _max < _val) { \ fprintf(stderr, "Error: command line option %s was out of range\n", _opt); \ return 0; \ } int dwgsim_opt_usage(dwgsim_opt_t *opt); int32_t dwgsim_opt_parse(dwgsim_opt_t *opt, int argc, char *argv[]); #endif DWGSIM-dwgsim.0.1.11/src/mut.c000066400000000000000000001043231231766373200156320ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "mut_txt.h" #include "mut_bed.h" #include "regions_bed.h" #include "dwgsim_opt.h" #include "mut.h" static int SEQ_BLOCK_SIZE = 512; void seq_set_block_size(int size) { SEQ_BLOCK_SIZE = size; } int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) { int c, l, max; char *p; c = 0; while (!feof(fp) && fgetc(fp) != '>'); if (feof(fp)) return -1; p = locus; while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') if (c != '\r') *p++ = c; *p = '\0'; if (comment) { p = comment; if (c != '\n') { while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); if (c != '\n') { *p++ = c; while (!feof(fp) && (c = fgetc(fp)) != '\n') if (c != '\r') *p++ = c; } } *p = '\0'; } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); l = 0; max = seq->m; while (!feof(fp) && (c = fgetc(fp)) != '>') { if (isalpha(c) || c == '-' || c == '.') { if (l + 1 >= max) { max += SEQ_BLOCK_SIZE; seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); } seq->s[l++] = (unsigned char)c; } } if (c == '>') ungetc(c,fp); seq->s[l] = 0; seq->m = max; seq->l = l; return l; } mut_t mutmsk; mut_t mut_and_type_mask; mut_t muttype_shift; mut_t ins_length_shift; mut_t ins_length_mask; mut_t ins_length_max; mut_t ins_long_length_max; mut_t ins_mask; void mutseq_init_bounds() { int32_t i; mutmsk = (mut_t)0x30; mut_and_type_mask = (mut_t)0x3F; muttype_shift = 6; // bits 5-6 store the mutation type ins_length_shift = 59; // bits 60-64 store the insertion length ins_length_mask = 0x1F; // bits 60-64 store the insertion length ins_length_max = ((ins_length_shift - muttype_shift) >> 1); ins_long_length_max = UINT32_MAX; if(ins_length_mask < ins_length_max) { // we exceed storing the length of the indel ins_length_max = ins_length_mask; } // create the insertion mask based on the maximum length insertion ins_mask = 0; for(i=0;iins_l;i++) { free(seq->ins[i]); } free(seq->ins); free(seq->s); free(seq); } mut_t mut_get_ins_length(mutseq_t *seq, int32_t i) { mut_t m, n, index; assert(0 <= i && i < seq->l); m = seq->s[i]; assert(INSERT == (m & mutmsk)); n = (m >> ins_length_shift) & ins_length_mask; if(0 == n) { uint32_t l = 0; index = (m >> muttype_shift) & ins_mask; assert(0 <= index && index < seq->ins_l); mut_get_ins_long_n(seq->ins[index], &l); return (mut_t)l; } else { return n; } } // returns 0 if it is a long insertion, 1 otherwise // if 0 is returned, the index into the long insertion list // is returned in "ins" and "n" is 0. int32_t mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins) { mut_t m; assert(0 <= i && i < seq->l); m = seq->s[i]; assert(INSERT == (m & mutmsk)); (*n) = (m >> ins_length_shift) & ins_length_mask; (*ins) = (m >> muttype_shift) & ins_mask; if(0 == (*n)) { assert(0 <= (*ins) && (*ins) < seq->ins_l); return 0; } else { assert(0 < (*n)); assert((*n) <= ins_length_max); return 1; } } inline int32_t mut_get_ins_bytes(int32_t n) { int32_t num_bytes = 1; // one-byte for the type // variable number of bytes to store the length if(n <= UINT8_MAX) num_bytes += sizeof(uint8_t); else if(n <= UINT16_MAX) num_bytes += sizeof(uint16_t); else if(n <= UINT32_MAX) num_bytes += sizeof(uint32_t); else { fprintf(stderr, "UINT32_MAX < n"); exit(1); } // store the insertion num_bytes += mut_packed_len(n); return num_bytes; } uint8_t* mut_get_ins_long_n(uint8_t *ins, uint32_t *n) { switch(ins[0]) { case sizeof(uint8_t): (*n) = (uint32_t)(ins[1]); return ins + 2; break; case sizeof(uint16_t): (*n) = (uint32_t)(((uint16_t*)(ins+1))[0]); return ins + 3; break; case sizeof(uint32_t): default: (*n) = (uint32_t)(((uint32_t*)(ins+1))[0]); return ins + 5; break; } return NULL; } inline uint8_t* mut_set_ins_long_n(uint8_t *ins, uint32_t n) { // set type if(n <= UINT8_MAX) ins[0] = (uint8_t)sizeof(uint8_t); else if(n <= UINT16_MAX) ins[0] = (uint8_t)sizeof(uint16_t); else ins[0] = (uint8_t)sizeof(uint32_t); // set length switch(ins[0]) { case sizeof(uint8_t): ins[1] = (uint8_t)n; return ins + 2; break; case sizeof(uint16_t): (*((uint16_t*)(ins+1))) = (uint16_t)n; return ins + 3; break; case sizeof(uint32_t): default: (*((uint32_t*)(ins+1))) = n; return ins + 5; break; } return NULL; } static inline void mut_print_ins(FILE *fp, mutseq_t *seq, int32_t i) { mut_t n, ins; if(1 == mut_get_ins(seq, i, &n, &ins)) { while (n > 0) { fputc("ACGTN"[ins & 0x3], fp); ins >>= 2; n--; } } else { // long insertion int32_t byte_index, bit_index; uint32_t num_ins = 0; uint8_t *insertion = NULL; assert(NULL != seq->ins[ins]); insertion = mut_get_ins_long_n(seq->ins[ins], &num_ins); // reverse order byte_index = mut_packed_len(num_ins)-1; bit_index = (num_ins+3) & 3; // % 4 while(0 < num_ins) { fputc("ACGTN"[(insertion[byte_index] >> (bit_index << 1)) & 0x3], fp); bit_index--; if(bit_index < 0) { bit_index = 3; byte_index--; } num_ins--; } } } // bases is NULL if we are to randomly simulate the bases void mut_add_ins(dwgsim_opt_t *opt, mutseq_t *hap1, mutseq_t *hap2, int32_t i, int32_t c, int8_t hap, char *bases, mut_t num_ins) { mut_t ins = 0; int64_t j; if (NULL == bases) { if(num_ins == 0) { // get the new insertion length do { num_ins++; } while (num_ins < ins_long_length_max && (num_ins < opt->indel_min || drand48() < opt->indel_extend)); } } else { num_ins = strlen(bases); // ignores num_ins } if (ins_long_length_max < num_ins) num_ins = ins_long_length_max; if (hap < 0) { // set ploidy if (opt->is_hap || drand48() < 0.333333) { // hom-ins hap = 3; } else if (drand48() < 0.5) { hap = 1; } else { hap = 2; } } if (num_ins <= ins_length_max) { // short // generate the insertion if (NULL == bases) { for (j=0;js[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c; if (hap & 0x2) hap2->s[i] = (num_ins << ins_length_shift) | (ins << muttype_shift) | INSERT | c; } else { // long int32_t byte_index, bit_index; uint8_t *hap1_ins = NULL, *hap2_ins = NULL; if (hap & 0x1) { while (hap1->ins_m <= hap1->ins_l) { // realloc hap1->ins_m = (hap1->ins_m < 16) ? 16 : (hap1->ins_m << 1); hap1->ins = realloc(hap1->ins, sizeof(uint8_t*) * hap1->ins_m); } hap1->ins[hap1->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t)); hap1_ins = mut_set_ins_long_n(hap1->ins[hap1->ins_l], num_ins); } if (hap & 0x2) { while (hap2->ins_m <= hap2->ins_l) { // realloc hap2->ins_m = (hap2->ins_m < 16) ? 16 : (hap2->ins_m << 1); hap2->ins = realloc(hap2->ins, sizeof(uint8_t*) * hap2->ins_m); } hap2->ins[hap2->ins_l] = calloc(mut_get_ins_bytes(num_ins), sizeof(uint8_t)); hap2_ins = mut_set_ins_long_n(hap2->ins[hap2->ins_l], num_ins); } byte_index = 0; bit_index = 0; while(0 < num_ins) { uint8_t b; if (NULL == bases) { b = ((uint8_t)(drand48() * 4.0)) << (bit_index << 1); } else { b = nst_nt4_table[(int)bases[num_ins-1]] << (bit_index << 1); } if (hap & 0x1) hap1_ins[byte_index] |= b; if (hap & 0x2) hap2_ins[byte_index] |= b; bit_index++; if(4 == bit_index) { bit_index=0; byte_index++; } num_ins--; } if (hap & 0x1) { assert(hap1->ins_l <= ins_mask); hap1->s[i] = (hap1->ins_l << muttype_shift) | INSERT | c; hap1->ins_l++; } if (hap & 0x2) { assert(hap2->ins_l <= ins_mask); hap2->s[i] = (hap2->ins_l << muttype_shift) | INSERT | c; hap2->ins_l++; } } } void mut_debug(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) { int32_t i; // DEBUG for (i = 0; i < seq->l; ++i) { mut_t c[3]; c[0] = nst_nt4_table[(mut_t)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution assert((c[1]&0x3) == (c[2]&0x3)); assert((c[0]&0x3) != (c[1]&0x3)); continue; } else if ((c[1]&mutmsk) == DELETE) { // del continue; } else if ((c[1] & mutmsk) == INSERT) { // ins mut_t n1 = mut_get_ins_length(hap1, i); mut_t n2 = mut_get_ins_length(hap1, i); assert(n1 > 0); assert(n1 <= ins_long_length_max); assert(n1 == n2); } else assert(0); } else { // het if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution assert((c[1]&0x3) != (c[2]&0x3)); assert((c[0]&0x3) == (c[1]&0x3) || (c[0]&0x3) == (c[2]&0x3)); assert((c[0]&0x3) != (c[1]&0x3) || (c[0]&0x3) != (c[2]&0x3)); continue; } else if ((c[1]&mutmsk) == DELETE) { continue; } else if ((c[2]&mutmsk) == DELETE) { continue; } else if ((c[1]&mutmsk) == INSERT) { // ins 1 mut_t n = mut_get_ins_length(hap1, i); assert(n > 0); assert(n <= ins_long_length_max); } else if ((c[2]&mutmsk) == INSERT) { // ins 2 mut_t n = mut_get_ins_length(hap2, i); assert(n > 0); assert(n <= ins_long_length_max); } else assert(0); } } } } static void mut_left_justify_ins(mutseq_t *hap1, int32_t i) { // NB: should we also be checking both haps for homozygous cases mut_t n, ins, j; if(1 == mut_get_ins(hap1, i, &n, &ins)) { // short assert(n > 0); j=i; while(0 < j && INSERT != (hap1->s[j-1]&mutmsk) // no insertion && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution && DELETE != (hap1->s[j-1]&mutmsk) // no deletion && ((ins >> ((n-1) << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base // update ins ins = (ins & ~((mut_t)3 << ((n-1) << 1))); // zero out the last base ins <<= 2; // shift over ins |= (hap1->s[j-1]&3); // insert the first base hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE j--; } hap1->s[j] = (n << ins_length_shift) | (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert } else { // long int32_t bit_index, byte_index; uint32_t num_ins; uint8_t *insertion = NULL; insertion = mut_get_ins_long_n(hap1->ins[ins], &num_ins); assert(num_ins > 0); j=i; while(0 < j && INSERT != (hap1->s[j-1]&mutmsk) // no insertion && SUBSTITUTE != (hap1->s[j-1]&mutmsk) // no substitution && DELETE != (hap1->s[j-1]&mutmsk) // no deletion // NB: last base of the insertion has a bit index of zero and byte index of zero && ((insertion[0] >> (0 << 1)) & 3) == (hap1->s[j-1]&3)) { // end of insertion matches previous base // start with the last base for (byte_index = 0; byte_index < mut_packed_len(num_ins); byte_index++) { insertion[byte_index] >>= 2; // shift down, get rid of first base (last two bits) if (byte_index + 1 < mut_packed_len(num_ins)) { uint8_t b = (insertion[byte_index+1] >> (0 << 1)) & 3; // last base of the next byte (first two bits) insertion[byte_index] |= b << 6; // move the new base to the last two bits } } // add a base to the front bit_index = (num_ins+3) & 3; // % 4 byte_index = mut_packed_len(num_ins)-1; insertion[byte_index] |= (hap1->s[j-1]&3) << (bit_index << 1); hap1->s[j] = (hap1->s[j]&3); // make it NOCHANGE j--; } hap1->s[j] = (ins << muttype_shift) | INSERT | (hap1->s[j]&3); // re-insert } } // left-justify all the insertions and deletions static void mut_left_justify(const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) { mut_t i, j; int del_length; int prev_del[2] = {0, 0}; assert(hap1->l == seq->l); assert(hap2->l == seq->l); for (i = 0; i < seq->l; ++i) { mut_t c[3]; c[0] = nst_nt4_table[(mut_t)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom // TODO: code re-use if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution prev_del[0] = prev_del[1] = 0; continue; } else if ((c[1]&mutmsk) == DELETE) { // del if(prev_del[0] == 1 || prev_del[1] == 1) continue; prev_del[0] = prev_del[1] = 1; for(j=i+1,del_length=1;jl && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length del_length++; } if(seq->l <= i+del_length) continue; // left-justify for(j=i-1;0 < i && 0<=j;j--) { if(INSERT != (hap1->s[j]&mutmsk) && INSERT != (hap2->s[j]&mutmsk) // no insertion && DELETE != (hap1->s[j]&mutmsk) && DELETE != (hap2->s[j]&mutmsk) // no deletion && (hap1->s[j]&3) == (hap1->s[j+del_length]&3) // hap1 bases match && (hap2->s[j]&3) == (hap2->s[j+del_length]&3)) { // hap2 bases match // shift and make it NOCHANGE mut_t t; t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk; t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk; } else { break; } if(0 == j) break; } } else if ((c[1] & mutmsk) == INSERT) { // ins prev_del[0] = prev_del[1] = 0; mut_left_justify_ins(hap1, i); mut_left_justify_ins(hap2, i); } else assert(0); } else { // het if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution prev_del[0] = prev_del[1] = 0; continue; } else if ((c[1]&mutmsk) == DELETE) { if(prev_del[0] == 1) continue; prev_del[0] = 1; for(j=i+1,del_length=1;jl && (hap1->s[j]&mutmsk) == DELETE;j++) { // get del length del_length++; } if(seq->l <= i+del_length) continue; // left-justify for(j=i-1;0 < i && 0<=j;j--) { if(INSERT != (hap1->s[j]&mutmsk) // no insertion && SUBSTITUTE != (hap1->s[j]&mutmsk) // no substitution && DELETE != (hap1->s[j]&mutmsk) // no deletion && (hap1->s[j]&3) == (hap1->s[j+del_length]&3)) { // hap1 bases match // shift and make it NOCHANGE mut_t t; t = hap1->s[j]; hap1->s[j] = hap1->s[j+del_length]; hap1->s[j+del_length] = (t | mutmsk) ^ mutmsk; } else { break; } if(0 == j) break; } } else if ((c[2]&mutmsk) == DELETE) { if(prev_del[1] == 1) continue; prev_del[1] = 1; for(j=i+1,del_length=1;jl && (hap2->s[j]&mutmsk) == DELETE;j++) { // get del length del_length++; } if(seq->l <= i+del_length) continue; // left-justify for(j=i-1;0 < i && 0<=j;j--) { if(INSERT != (hap2->s[j]&mutmsk) // no insertion && SUBSTITUTE != (hap2->s[j]&mutmsk) // no substitution && DELETE != (hap2->s[j]&mutmsk) // no deletion && (hap2->s[j]&3) == (hap2->s[j+del_length]&3)) { // hap2 bases match // shift and make it NOCHANGE mut_t t; t = hap2->s[j]; hap2->s[j] = hap2->s[j+del_length]; hap2->s[j+del_length] = (t | mutmsk) ^ mutmsk; } else { break; } if(0 == j) break; } } else if ((c[1]&mutmsk) == INSERT) { // ins 1 prev_del[0] = prev_del[1] = 0; mut_left_justify_ins(hap1, i); } else if ((c[2]&mutmsk) == INSERT) { // ins 2 prev_del[0] = prev_del[1] = 0; mut_left_justify_ins(hap2, i); } else assert(0); } } else { prev_del[0] = prev_del[1] = 0; } } } void mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, int32_t contig_i, muts_input_t *muts_input) { int32_t i, j, deleting = 0, deletion_length = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; ret[0]->l = seq->l; ret[1]->l = seq->l; ret[0]->m = seq->m; ret[1]->m = seq->m; ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); ret[0]->ins = NULL; ret[1]->ins = NULL; ret[0]->ins_l = 0; ret[1]->ins_l = 0; ret[0]->ins_m = 0; ret[1]->ins_m = 0; if(NULL == muts_input) { for (i = 0; i < seq->l; ++i) { mut_t c; c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; if (deleting) { if (deletion_length < opt->indel_min || drand48() < opt->indel_extend) { if (deleting & 1) ret[0]->s[i] |= DELETE|c; if (deleting & 2) ret[1]->s[i] |= DELETE|c; deletion_length++; continue; } else deleting = deletion_length = 0; } if (c < 4 && drand48() < opt->mut_rate) { // mutation if (drand48() >= opt->indel_frac) { // substitution double r = drand48(); c = (c + (mut_t)(r * 3.0 + 1)) & 3; if (opt->is_hap || drand48() < 0.333333) { // hom ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; } else { // het ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; } } else { // indel if (drand48() < 0.5) { // deletion if (opt->is_hap || drand48() < 0.3333333) { // hom-del ret[0]->s[i] = ret[1]->s[i] = DELETE|c; deleting = 3; } else { // het-del deleting = drand48()<0.5?1:2; ret[deleting-1]->s[i] = DELETE|c; } deletion_length = 1; } else { // insertion mut_add_ins(opt, ret[0], ret[1], i, c, -1, NULL, 0); } } } } } else { if(MUT_INPUT_BED == muts_input->type) { // BED muts_bed_t *muts_bed = muts_input->data.bed; // seed for (i = 0; i < seq->l; ++i) { ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; } // mutates exactly based on a BED file for (i = 0; i < muts_bed->n; ++i) { if (muts_bed->muts[i].contig == contig_i) { int32_t has_bases = 1, is_hom = 0, hap = 0; int32_t which_hap = 0; mut_t c; // does this mutation have random bases? if (0 == strcmp("*", muts_bed->muts[i].bases)) has_bases = 0; // random bases // het or hom? if (opt->is_hap || drand48() < 0.333333) { is_hom = 1; // hom hap = 3; } else { which_hap = drand48()<0.5?0:1; hap = 1 << which_hap; } // mutate if (SUBSTITUTE == muts_bed->muts[i].type) { for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base c = (mut_t)nst_nt4_table[(int)seq->s[j]]; if (0 == has_bases) { // random DNA base double r = drand48(); c = (c + (mut_t)(r * 3.0 + 1)) & 3; } else { c = (mut_t)nst_nt4_table[(int)muts_bed->muts[i].bases[j - muts_bed->muts[i].start]]; } if (1 == is_hom) { ret[0]->s[j] = ret[1]->s[j] = SUBSTITUTE|c; } else { // het ret[which_hap]->s[j] = SUBSTITUTE|c; } } } else if (DELETE == muts_bed->muts[i].type) { for (j = muts_bed->muts[i].start; j < muts_bed->muts[i].end; ++j) { // for each base c = (mut_t)nst_nt4_table[(int)seq->s[j]]; if (1 == is_hom) { ret[0]->s[j] = ret[1]->s[j] = DELETE|c; } else { // het-del ret[which_hap]->s[j] = DELETE|c; } } } else if (INSERT == muts_bed->muts[i].type) { c = (mut_t)nst_nt4_table[(int)seq->s[muts_bed->muts[i].start]]; if (0 == has_bases) { mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, NULL, muts_bed->muts[i].end - muts_bed->muts[i].start); } else { mut_add_ins(opt, ret[0], ret[1], muts_bed->muts[i].start, c, hap, muts_bed->muts[i].bases, 0); } } } else if (contig_i < muts_bed->muts[i].contig) { break; } } } else if(MUT_INPUT_TXT == muts_input->type || MUT_INPUT_VCF == muts_input->type) { // TXT or VCF muts_txt_t *muts_txt = NULL; if(MUT_INPUT_TXT == muts_input->type) { muts_txt = muts_input->data.txt; } else { muts_txt = muts_input->data.vcf; } // seed for (i = 0; i < seq->l; ++i) { ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; } for (i = 0; i < muts_txt->n; ++i) { if (muts_txt->muts[i].contig == contig_i) { int8_t type = muts_txt->muts[i].type; uint32_t pos = muts_txt->muts[i].pos; int8_t is_hap = muts_txt->muts[i].is_hap; mut_t c = (mut_t)nst_nt4_table[(int)seq->s[pos-1]]; if (DELETE == type) { if (is_hap & 1) ret[0]->s[pos-1] |= DELETE|c; if (is_hap & 2) ret[1]->s[pos-1] |= DELETE|c; } else if (SUBSTITUTE == type) { if (is_hap & 1) ret[0]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]]; if (is_hap & 2) ret[1]->s[pos-1] = SUBSTITUTE|nst_nt4_table[(int)muts_txt->muts[i].bases[0]]; } else if (INSERT == type) { mut_add_ins(opt, ret[0], ret[1], pos-1, c, is_hap, muts_txt->muts[i].bases, 0); } } } } else { fprintf(stderr, "Error: unknown mutation input type!\n"); exit(1); } } // DEBUG mut_debug(seq, hap1, hap2); // DEBUG mut_left_justify(seq, hap1, hap2); mut_debug(seq, hap1, hap2); } void mut_print_header_pre(FILE *fpout_vcf) { fprintf(fpout_vcf, "##fileformat=VCFv4.1\n"); } void mut_print_header_post(FILE *fpout_vcf) { fprintf(fpout_vcf, "##INFO=\n"); fprintf(fpout_vcf, "##INFO=\n"); fprintf(fpout_vcf, "##INFO=\n"); fprintf(fpout_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length) { // TODO: does mutating the reference change the sequence length? If so, this // is not correct. fprintf(fpout_vcf, "##contig=\n", name, length); } // TODO: add AD, DP, GQ, GT, PL void mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf) { int32_t i, j, hap; // body mut_t mut_prev[2] = {NOCHANGE,NOCHANGE}; // for deletions for (i = 0; i < seq->l; ++i) { mut_t c[3]; c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) { // do nothing } else if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { fprintf(fpout_txt, "%s\t%d\t", name, i+1); if ((c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask)) { // hom if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution fprintf(fpout_txt, "%c\t%c\t3\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=1.0;pl=3;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); } else if ((c[1]&mutmsk) == DELETE) { // del fprintf(fpout_txt, "%c\t-\t3\n", "ACGTN"[c[0]]); if(0 == mut_prev[0] || 0 == mut_prev[1]) { fprintf(fpout_vcf, "%s\t%d\t.\t", name, i); if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf); for (j = i; j < seq->l && (c[1] & mut_and_type_mask) == (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) { fputc("ACGTN"[c[0]], fpout_vcf); // NB: this modifies 'c' if (j+1 < seq->l) { c[0] = nst_nt4_table[(int)seq->s[j+1]]; c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1]; } } if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]); else fprintf(fpout_vcf, "\t."); fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=DELETE\n"); } // NB: convert back 'c' c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; } else if ((c[1] & mutmsk) == INSERT) { // ins fprintf(fpout_txt, "-\t"); mut_print_ins(fpout_txt, hap1, i); fprintf(fpout_txt, "\t3\n"); fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]); mut_print_ins(fpout_vcf, hap1, i); fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=3;mt=INSERT\n"); } else assert(0); } else { // het if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution hap = ((c[1]&mutmsk) == SUBSTITUTE) ? 1 : 2; fprintf(fpout_txt, "%c\t%c\t%d\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)], hap); if(1 == hap) fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=1;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); else fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c\t100\tPASS\tAF=0.5;pl=2;mt=SUBSTITUTE\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[2]&0xf]); } else if ((c[1]&mutmsk) == DELETE) { fprintf(fpout_txt, "%c\t-\t1\n", "ACGTN"[c[0]]); if(0 == mut_prev[0]) { fprintf(fpout_vcf, "%s\t%d\t.\t", name, i); if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf); for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[1]&mutmsk) == DELETE; ++j) { fputc("ACGTN"[c[0]], fpout_vcf); // NB: this modifies 'c' if (j+1 < seq->l) { c[0] = nst_nt4_table[(int)seq->s[j+1]]; c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1]; } } if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]); else fprintf(fpout_vcf, "\t."); fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=1;mt=DELETE\n"); } // NB: convert back 'c' c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; } else if ((c[2]&mutmsk) == DELETE) { fprintf(fpout_txt, "%c\t-\t2\n", "ACGTN"[c[0]]); if(0 == mut_prev[1]) { fprintf(fpout_vcf, "%s\t%d\t.\t", name, i); if (0 < i) fputc("ACGTN"[nst_nt4_table[(int)seq->s[i-1]]], fpout_vcf); for (j = i; j < seq->l && (c[1] & mut_and_type_mask) != (c[2] & mut_and_type_mask) && (c[2]&mutmsk) == DELETE; ++j) { fputc("ACGTN"[c[0]], fpout_vcf); // NB: this modifies 'c' if (j+1 < seq->l) { c[0] = nst_nt4_table[(int)seq->s[j+1]]; c[1] = hap1->s[j+1]; c[2] = hap2->s[j+1]; } } if (0 < i) fprintf(fpout_vcf, "\t%c", "ACGTN"[nst_nt4_table[(int)seq->s[i-1]]]); else fprintf(fpout_vcf, "\t."); fprintf(fpout_vcf, "\t100\tPASS\tAF=1.0;pl=2;mt=DELETE\n"); } // NB: convert back 'c' c[0] = nst_nt4_table[(int)seq->s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; } else if ((c[1]&mutmsk) == INSERT) { // ins 1 fprintf(fpout_txt, "-\t"); mut_print_ins(fpout_txt, hap1, i); fprintf(fpout_txt, "\t1\n"); fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]); mut_print_ins(fpout_vcf, hap1, i); fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=1;mt=INSERT\n"); } else if ((c[2]&mutmsk) == INSERT) { // ins 2 fprintf(fpout_txt, "-\t"); mut_print_ins(fpout_txt, hap2, i); fprintf(fpout_txt, "\t2\n"); fprintf(fpout_vcf, "%s\t%d\t.\t%c\t%c", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[0]]); mut_print_ins(fpout_vcf, hap2, i); fprintf(fpout_vcf, "\t100\tPASS\tAF=0.5;pl=2;mt=INSERT\n"); } else assert(0); } } mut_prev[0] = (c[1] & mutmsk) != NOCHANGE; mut_prev[1] = (c[2] & mutmsk) != NOCHANGE; } } DWGSIM-dwgsim.0.1.11/src/mut.h000066400000000000000000000041211231766373200156320ustar00rootroot00000000000000#ifndef MUT_H #define MUT_H #include "contigs.h" #include "mut_txt.h" #include "mut_bed.h" #include "mut_input.h" #include "dwgsim_opt.h" #include "dwgsim.h" /* FASTA parser, copied from seq.c */ typedef struct { int l, m; /* length and maximum buffer size */ unsigned char *s; /* sequence */ } seq_t; #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 void seq_set_block_size(int size); int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment); enum muttype_t { NOCHANGE = 0, INSERT = 0x10, SUBSTITUTE = 0x20, DELETE = 0x30 }; typedef uint64_t mut_t; extern mut_t mutmsk; extern mut_t mut_and_type_mask; extern mut_t muttype_shift; extern mut_t ins_length_shift; extern mut_t ins_length_mask; extern mut_t ins_length_max; extern mut_t ins_long_length_max; extern mut_t ins_mask; typedef struct { int l, m; /* length and maximum buffer size */ mut_t *s; /* sequence */ uint8_t **ins; /* long insertions */ int ins_l, ins_m; /* length and maximum buffer size for long insertions */ } mutseq_t; void mutseq_init_bounds(); mutseq_t * mutseq_init(); void mutseq_destroy(mutseq_t *seq); inline int32_t mut_get_ins_bytes(int32_t n); inline uint8_t* mut_get_ins_long_n(uint8_t *ins, uint32_t *n); mut_t mut_get_ins_length(mutseq_t *seq, int32_t i); int32_t mut_get_ins(mutseq_t *seq, int32_t i, mut_t *n, mut_t *ins); void mut_diref(dwgsim_opt_t *opt, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, int32_t contig_i, muts_input_t *muts_input); void mut_print_header_pre(FILE *fpout_vcf); void mut_print_header_post(FILE *fpout_vcf); void mut_print_header_contig(FILE *fpout_vcf, const char *name, int32_t length); // Columns: // 1 - chromosome name // 2 - position (one-based) // 3 - reference base (dash if there was an insertion) // 4 - variant allele (IUPAC code or insertion base(s)) // 5 - '-' for homozygous, '+' for heterozygous void mut_print(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2, FILE *fpout_txt, FILE *fpout_vcf); // 0 - 0 // 1 - 1 // 5 - 2 // 9 - 3 // ... #define mut_packed_len(_n) ((_n + 3) >> 2) #endif DWGSIM-dwgsim.0.1.11/src/mut_bed.c000066400000000000000000000101251231766373200164400ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include "contigs.h" #include "mut.h" #include "dwgsim.h" #include "mut_bed.h" muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c) { muts_bed_t *m = NULL; int32_t i; char name[1024]; uint32_t start, end; char type[1024]; char bases[1024]; uint32_t prev_contig=0, max_end=0; m = calloc(1, sizeof(muts_bed_t)); m->n = 0; m->mem = 1024; m->muts = malloc(m->mem * sizeof(mut_bed_t)); i = 0; // start is zero-based // one is one-based while(0 < fscanf(fp, "%s\t%u\t%u\t%s\t%s", name, &start, &end, bases, type)) { // find the contig while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) { i++; } if(c->n == i) { fprintf(stderr, "Error: contig not found [%s]\n", name); exit(1); } else if(c->contigs[i].len <= start) { fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start); exit(1); } else if(c->contigs[i].len < end) { fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end); exit(1); } else if(end <= start) { fprintf(stderr, "Error: end <= start [%s,%u,%u]\n", name, start, end); exit(1); } else if(0 != strcmp("*", bases) && (end - start) != strlen(bases)) { fprintf(stderr, "Error: bases did not match start and end [%s,%u,%u,%s]\n", name, start, end, bases); exit(1); } else if(prev_contig == i && start+1 <= max_end) { fprintf(stderr, "Warning: overlapping entries, ignoring entry [%s\t%u\t%u\t%s\t%s]\n", name, start, end, bases, type); continue; } if(prev_contig != i || max_end < end) { prev_contig = i; max_end = end; } if(m->n == m->mem) { m->mem <<= 1; m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t)); } m->muts[m->n].contig = i; m->muts[m->n].start = start; m->muts[m->n].end = end; // one-based m->muts[m->n].type = get_muttype(type); switch(m->muts[m->n].type) { case SUBSTITUTE: break; case INSERT: if(ins_length_max < end - start) { fprintf(stderr, "Error: insertion of length %d exceeded the maximum supported length of %d\n", end - start, (int32_t)ins_length_max); exit(1); } break; case DELETE: break; default: // error fprintf(stderr, "Error: mutation type unrecognized [%s]\n", type); exit(1); } m->muts[m->n].bases = strdup(bases); m->n++; } if(m->n < m->mem) { m->mem = m->n; m->muts = realloc(m->muts, m->mem * sizeof(mut_bed_t)); } return m; } void muts_bed_destroy(muts_bed_t *m) { int32_t i; for(i=0;in;i++) { free(m->muts[i].bases); } free(m->muts); free(m); } DWGSIM-dwgsim.0.1.11/src/mut_bed.h000066400000000000000000000030621231766373200164470ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MUT_BED_H #define MUT_BED_H typedef struct { uint32_t contig; // zero-based uint32_t start; // zero-based uint32_t end; // zero-based end position uint8_t type; // mutation type char *bases; // mutation bases } mut_bed_t; typedef struct { mut_bed_t *muts; int32_t n; int32_t mem; } muts_bed_t; muts_bed_t *muts_bed_init(FILE *fp, contigs_t *c); void muts_bed_destroy(muts_bed_t *m); #endif DWGSIM-dwgsim.0.1.11/src/mut_input.c000066400000000000000000000045141231766373200170520ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "mut.h" #include "dwgsim.h" #include "mut_txt.h" #include "mut_bed.h" #include "mut_vcf.h" #include "mut_input.h" muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type) { muts_input_t *m = calloc(1, sizeof(muts_input_t)); m->type = type; switch(type) { case MUT_INPUT_BED: m->data.bed = muts_bed_init(fp, c); break; case MUT_INPUT_TXT: m->data.txt = muts_txt_init(fp, c); break; case MUT_INPUT_VCF: m->data.vcf = muts_vcf_init(fp, c); break; default: fprintf(stderr, "Error: mutation input type unrecognized!\n"); exit(1); } return m; } void muts_input_destroy(muts_input_t *m) { switch(m->type) { case MUT_INPUT_BED: muts_bed_destroy(m->data.bed); break; case MUT_INPUT_TXT: muts_txt_destroy(m->data.txt); break; case MUT_INPUT_VCF: muts_vcf_destroy(m->data.vcf); break; default: fprintf(stderr, "Error: mutation input type unrecognized!\n"); exit(1); } free(m); } DWGSIM-dwgsim.0.1.11/src/mut_input.h000066400000000000000000000031061231766373200170530ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MUT_INPUT_H #define MUT_INPUT_H #include "mut_bed.h" #include "mut_txt.h" #include "mut_vcf.h" enum { MUT_INPUT_BED = 0, MUT_INPUT_TXT = 1, MUT_INPUT_VCF = 2 }; typedef struct { int32_t type; union { muts_bed_t *bed; muts_txt_t *txt; muts_vcf_t *vcf; } data; } muts_input_t; muts_input_t *muts_input_init(FILE *fp, contigs_t *c, int32_t type); void muts_input_destroy(muts_input_t *m); #endif DWGSIM-dwgsim.0.1.11/src/mut_txt.c000066400000000000000000000073151231766373200165340ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "mut.h" #include "dwgsim.h" #include "mut_txt.h" muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c) { muts_txt_t *m = NULL; int32_t i; char name[1024]; uint32_t pos, prev_pos = 0, is_hap; char ref, mut[1024]; m = calloc(1, sizeof(muts_txt_t)); m->n = 0; m->mem = 1024; m->muts = malloc(m->mem * sizeof(mut_txt_t)); i = 0; while(0 < fscanf(fp, "%s\t%u\t%c\t%s\t%d", name, &pos, &ref, mut, &is_hap)) { // find the contig while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) { i++; prev_pos = 0; } if(c->n == i) { fprintf(stderr, "Error: contig not found [%s]\n", name); exit(1); } else if(pos <= 0 || c->contigs[i].len < pos) { fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos); exit(1); } else if(pos < prev_pos) { fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos); exit(1); } if(m->n == m->mem) { m->mem <<= 1; m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t)); } m->muts[m->n].contig = i; m->muts[m->n].pos = pos; m->muts[m->n].bases = strdup(mut); if('-' == ref && '-' != mut[0]) { m->muts[m->n].type = INSERT; } else if('-' != ref && '-' == mut[0]) { m->muts[m->n].type = DELETE; } else if('-' != ref && '-' != mut[0]) { m->muts[m->n].type = SUBSTITUTE; if(is_hap < 3) { // heterozygous if(nst_nt4_table[(int)m->muts[m->n].bases[0]] < 4) { fprintf(stderr, "Error: heterozygous bases must be in IUPAC form\n"); exit(1); } m->muts[m->n].bases[0] = iupac_and_base_to_mut(m->muts[m->n].bases[0], ref); if('X' == m->muts[m->n].bases[0]) { fprintf(stderr, "Error: out of range\n"); exit(1); } m->muts[m->n].bases[1] = '\0'; } } else { fprintf(stderr, "Error: out of range\n"); exit(1); } m->muts[m->n].is_hap = is_hap; m->n++; } if(m->n < m->mem) { m->mem = m->n; m->muts = realloc(m->muts, m->mem * sizeof(mut_txt_t)); } return m; } void muts_txt_destroy(muts_txt_t *m) { int32_t i; for(i=0;in;i++) { free(m->muts[i].bases); } free(m->muts); free(m); } DWGSIM-dwgsim.0.1.11/src/mut_txt.h000066400000000000000000000027301231766373200165350ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MUT_TXT_H #define MUT_TXT_H typedef struct { uint32_t contig; uint32_t pos; uint8_t type; uint8_t is_hap; char *bases; } mut_txt_t; typedef struct { mut_txt_t *muts; int32_t n; int32_t mem; } muts_txt_t; muts_txt_t *muts_txt_init(FILE *fp, contigs_t *c); void muts_txt_destroy(muts_txt_t *m); #endif DWGSIM-dwgsim.0.1.11/src/mut_vcf.c000066400000000000000000000205731231766373200164740ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "mut.h" #include "dwgsim.h" #include "mut_vcf.h" #define BUFFER_L 1048576 muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c) { static int32_t warned = 0; muts_vcf_t *m = NULL; int32_t i, j; char name[1024]; // 1. #CHROM uint32_t pos; // 2. POS char id[1024]; // 3. ID char ref[1024]; // 4. REF char alt[1025]; // 5. ALT size_t n_read = -1; uint32_t prev_pos = 0, is_hap; int32_t ref_l, alt_l; char buffer[BUFFER_L]; int32_t s, n, len; m = calloc(1, sizeof(muts_vcf_t)); m->n = 0; m->mem = 1024; m->muts = malloc(m->mem * sizeof(mut_vcf_t)); // read in some buffer n_read = fread(buffer, sizeof(char), BUFFER_L, fp); i = s = n = 0; while(0 != s || 0 != n_read) { len = n_read + s; s = 0; while(s < len) { // while more characters in the buffer // find the next newline for(n=s;nn && 0 != strcmp(name, c->contigs[i].name)) { i++; prev_pos = 0; } if(c->n == i) { fprintf(stderr, "Error: contig not found [%s]\n", name); exit(1); } else if(pos <= 0 || c->contigs[i].len < pos) { fprintf(stderr, "Error: start out of range [%s,%u]\n", name, pos); exit(1); } else if(pos < prev_pos) { fprintf(stderr, "Error: out of order [%s,%u]\n", name, pos); exit(1); } ref_l = strlen(ref); alt_l = strlen(alt); if(1 == ref_l && ref[0] == '.') { ref[0] = '\0'; ref_l = 0; } if(1 == alt_l && alt[0] == '.') { alt[0] = '\0'; alt_l = 0; } if(0 == alt_l && 0 == ref_l) { fprintf(stderr, "Error: empty alleles\n"); exit(1); } // TODO: support multiple alleles for(j=0;jmem <= m->n + ref_l - 1) { m->mem <<= 1; m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); } for(j=0;jmuts[m->n].contig = i; m->muts[m->n].pos = pos + j; m->muts[m->n].type = SUBSTITUTE; m->muts[m->n].is_hap = is_hap; m->muts[m->n].bases = malloc(sizeof(char) * 2); m->muts[m->n].bases[0] = alt[j]; m->muts[m->n].bases[1] = '\0'; m->n++; } } else if(ref_l < alt_l) { // INS if(m->n == m->mem) { m->mem <<= 1; m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); } // skip over match bases for(j=0;jmuts[m->n].contig = i; m->muts[m->n].pos = pos; m->muts[m->n].type = INSERT; m->muts[m->n].is_hap = is_hap; m->muts[m->n].bases = strdup(alt+j); m->n++; } else { // DEL // skip over match bases for(j=0;jmem < m->n + (ref_l - j)) { m->mem <<= 1; m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); } for(;jmuts[m->n].contig = i; m->muts[m->n].pos = pos; m->muts[m->n].type = DELETE; m->muts[m->n].is_hap = is_hap; m->muts[m->n].bases = NULL; m->n++; } } prev_pos = pos; // move to 'n' s = n; } n++; // move past last character // shift unused characters down for(s = 0; n < len; n++, s++) { buffer[s] = buffer[n]; } // read in some buffer n_read = fread(buffer + s, sizeof(char), BUFFER_L - s, fp); } if(m->n < m->mem) { m->mem = m->n; m->muts = realloc(m->muts, m->mem * sizeof(mut_vcf_t)); } return m; } void muts_vcf_destroy(muts_vcf_t *m) { int32_t i; for(i=0;in;i++) { free(m->muts[i].bases); } free(m->muts); free(m); } DWGSIM-dwgsim.0.1.11/src/mut_vcf.h000066400000000000000000000025321231766373200164740ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MUT_VCF_H #define MUT_VCF_H #include "mut_txt.h" typedef muts_txt_t muts_vcf_t; typedef mut_txt_t mut_vcf_t; muts_vcf_t *muts_vcf_init(FILE *fp, contigs_t *c); void muts_vcf_destroy(muts_vcf_t *m); #endif DWGSIM-dwgsim.0.1.11/src/regions_bed.c000066400000000000000000000107031231766373200173030ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "contigs.h" #include "regions_bed.h" regions_bed_txt *regions_bed_init(FILE *fp, contigs_t *c) { regions_bed_txt *r = NULL; char name[1024]; uint32_t start, end, len; int32_t i, prev_contig, prev_start, prev_end, b; r = calloc(1, sizeof(regions_bed_txt)); r->n = 0; r->mem = 4; r->contig = malloc(r->mem * sizeof(uint32_t)); r->start = malloc(r->mem * sizeof(uint32_t)); r->end = malloc(r->mem * sizeof(uint32_t)); i = 0; prev_contig = prev_start = prev_end = -1; while(0 < fscanf(fp, "%s\t%u\t%u", name, &start, &end)) { len = end - start + 1; // find the contig while(i < c->n && 0 != strcmp(name, c->contigs[i].name)) { i++; } if(c->n == i) { fprintf(stderr, "Error: contig not found [%s]\n", name); exit(1); } else if(c->contigs[i].len < start) { fprintf(stderr, "Error: start out of range [%s,%u]\n", name, start); exit(1); } else if(c->contigs[i].len < end) { fprintf(stderr, "Error: end out of range [%s,%u]\n", name, end); exit(1); } else if(end < start) { fprintf(stderr, "Error: end < start [%s,%u,%u]\n", name, start, end); exit(1); } else if(prev_contig == i && start < prev_start) { fprintf(stderr, "Error: the input was not sorted [%s,%u,%u,%u]\n", name, start, end, len); exit(1); } if(end - start + 1 != len) { fprintf(stderr, "Warning: len != end - start + 1 [%s,%u,%u,%u]\n", name, start, end, len); } if(prev_contig == i && start <= prev_end && prev_start <= start) { if(prev_end < end) { r->end[r->n-1] = end; prev_end = end; } } else { prev_contig = i; prev_start = start; prev_end = end; while(r->mem <= r->n) { r->mem <<= 1; r->contig = realloc(r->contig, r->mem * sizeof(uint32_t)); r->start = realloc(r->start, r->mem * sizeof(uint32_t)); r->end = realloc(r->end, r->mem * sizeof(uint32_t)); } r->contig[r->n] = i; r->start[r->n] = start; r->end[r->n] = end; r->n++; } // move to the end of the line while(EOF != (b = fgetc(fp))) { if('\n' == b || '\r' == b) break; } } return r; } void regions_bed_destroy(regions_bed_txt *r) { free(r->contig); free(r->start); free(r->end); free(r); } int32_t regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end) { int32_t low, high, mid; if(NULL == r) return 1; low = 0; high = r->n-1; while(low <= high) { mid = (low + high) / 2; if(contig < r->contig[mid] || (contig == r->contig[mid] && start < r->start[mid])) { high = mid - 1; } else if(r->contig[mid] < contig || (r->contig[mid] == contig && r->end[mid] < end)) { low = mid + 1; } else if(r->contig[mid] == contig && r->start[mid] <= start && end <= r->end[mid]) { return 1; } else { break; } } return 0; } DWGSIM-dwgsim.0.1.11/src/regions_bed.h000066400000000000000000000006071231766373200173120ustar00rootroot00000000000000#ifndef REGIONS_BED_H #define REGIONS_BED_H typedef struct { uint32_t *contig; uint32_t *start; uint32_t *end; uint32_t n; uint32_t mem; } regions_bed_txt; regions_bed_txt * regions_bed_init(FILE *fp, contigs_t *c); void regions_bed_destroy(regions_bed_txt *r); int32_t regions_bed_query(regions_bed_txt *r, uint32_t contig, uint32_t start, uint32_t end); #endif